1 /*
   2  * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved.
   4  * Copyright (c) 2020, 2025, Huawei Technologies Co., Ltd. All rights reserved.
   5  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   6  *
   7  * This code is free software; you can redistribute it and/or modify it
   8  * under the terms of the GNU General Public License version 2 only, as
   9  * published by the Free Software Foundation.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  *
  25  */
  26 
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "compiler/oopMap.hpp"
  30 #include "gc/shared/barrierSet.hpp"
  31 #include "gc/shared/barrierSetAssembler.hpp"
  32 #include "interpreter/interpreter.hpp"
  33 #include "memory/universe.hpp"
  34 #include "nativeInst_riscv.hpp"
  35 #include "oops/instanceOop.hpp"
  36 #include "oops/method.hpp"
  37 #include "oops/objArrayKlass.hpp"
  38 #include "oops/oop.inline.hpp"
  39 #include "prims/methodHandles.hpp"
  40 #include "prims/upcallLinker.hpp"
  41 #include "runtime/continuation.hpp"
  42 #include "runtime/continuationEntry.inline.hpp"
  43 #include "runtime/frame.inline.hpp"
  44 #include "runtime/handles.inline.hpp"
  45 #include "runtime/javaThread.hpp"
  46 #include "runtime/sharedRuntime.hpp"
  47 #include "runtime/stubCodeGenerator.hpp"
  48 #include "runtime/stubRoutines.hpp"
  49 #include "utilities/align.hpp"
  50 #include "utilities/powerOfTwo.hpp"
  51 #ifdef COMPILER2
  52 #include "opto/runtime.hpp"
  53 #endif
  54 
  55 // Declaration and definition of StubGenerator (no .hpp file).
  56 // For a more detailed description of the stub routine structure
  57 // see the comment in stubRoutines.hpp
  58 
  59 #undef __
  60 #define __ _masm->
  61 
  62 #ifdef PRODUCT
  63 #define BLOCK_COMMENT(str) /* nothing */
  64 #else
  65 #define BLOCK_COMMENT(str) __ block_comment(str)
  66 #endif
  67 
  68 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  69 
  70 // Stub Code definitions
  71 
  72 class StubGenerator: public StubCodeGenerator {
  73  private:
  74 
  75 #ifdef PRODUCT
  76 #define inc_counter_np(counter) ((void)0)
  77 #else
  78   void inc_counter_np_(uint& counter) {
  79     __ incrementw(ExternalAddress((address)&counter));
  80   }
  81 #define inc_counter_np(counter) \
  82   BLOCK_COMMENT("inc_counter " #counter); \
  83   inc_counter_np_(counter);
  84 #endif
  85 
  86   // Call stubs are used to call Java from C
  87   //
  88   // Arguments:
  89   //    c_rarg0:   call wrapper address                   address
  90   //    c_rarg1:   result                                 address
  91   //    c_rarg2:   result type                            BasicType
  92   //    c_rarg3:   method                                 Method*
  93   //    c_rarg4:   (interpreter) entry point              address
  94   //    c_rarg5:   parameters                             intptr_t*
  95   //    c_rarg6:   parameter size (in words)              int
  96   //    c_rarg7:   thread                                 Thread*
  97   //
  98   // The stub itself does not return a Java result in a register; any
  99   // Java result is written to the memory location passed in result
 100   //
 101   // we save x1 (ra) as the return PC at the base of the frame and
 102   // link x8 (fp) below it as the frame pointer, then install sp (x2)
 103   // into fp.
 104   //
 105   // we save x10-x17, which accounts for all the c arguments.
 106   //
 107   // TODO: strictly do we need to save them all? they are treated as
 108   // volatile by C so could we omit saving the ones we are going to
 109   // place in global registers (thread? method?) or those we only use
 110   // during setup of the Java call?
 111   //
 112   // we don't need to save x5 which C uses as an indirect result location
 113   // return register.
 114   //
 115   // we don't need to save x6-x7 and x28-x31 which both C and Java treat as
 116   // volatile
 117   //
 118   // we save x9, x18-x27, f8-f9, and f18-f27 which Java uses as temporary
 119   // registers and C expects to be callee-save
 120   //
 121   // so the stub frame looks like this when we enter Java code
 122   //
 123   //     [ return_from_Java     ] <--- sp
 124   //     [ argument word n      ]
 125   //      ...
 126   // -35 [ argument word 1      ]
 127   // -34 [ saved frm (in fcsr)  ] <--- sp_after_call
 128   // -33 [ saved f27            ]
 129   // -32 [ saved f26            ]
 130   // -31 [ saved f25            ]
 131   // -30 [ saved f24            ]
 132   // -29 [ saved f23            ]
 133   // -28 [ saved f22            ]
 134   // -27 [ saved f21            ]
 135   // -26 [ saved f20            ]
 136   // -25 [ saved f19            ]
 137   // -24 [ saved f18            ]
 138   // -23 [ saved f9             ]
 139   // -22 [ saved f8             ]
 140   // -21 [ saved x27            ]
 141   // -20 [ saved x26            ]
 142   // -19 [ saved x25            ]
 143   // -18 [ saved x24            ]
 144   // -17 [ saved x23            ]
 145   // -16 [ saved x22            ]
 146   // -15 [ saved x21            ]
 147   // -14 [ saved x20            ]
 148   // -13 [ saved x19            ]
 149   // -12 [ saved x18            ]
 150   // -11 [ saved x9             ]
 151   // -10 [ call wrapper   (x10) ]
 152   //  -9 [ result         (x11) ]
 153   //  -8 [ result type    (x12) ]
 154   //  -7 [ method         (x13) ]
 155   //  -6 [ entry point    (x14) ]
 156   //  -5 [ parameters     (x15) ]
 157   //  -4 [ parameter size (x16) ]
 158   //  -3 [ thread         (x17) ]
 159   //  -2 [ saved fp       (x8)  ]
 160   //  -1 [ saved ra       (x1)  ]
 161   //   0 [                      ] <--- fp == saved sp (x2)
 162 
 163   // Call stub stack layout word offsets from fp
 164   enum call_stub_layout {
 165     sp_after_call_off  = -34,
 166 
 167     frm_off            = sp_after_call_off,
 168     f27_off            = -33,
 169     f26_off            = -32,
 170     f25_off            = -31,
 171     f24_off            = -30,
 172     f23_off            = -29,
 173     f22_off            = -28,
 174     f21_off            = -27,
 175     f20_off            = -26,
 176     f19_off            = -25,
 177     f18_off            = -24,
 178     f9_off             = -23,
 179     f8_off             = -22,
 180 
 181     x27_off            = -21,
 182     x26_off            = -20,
 183     x25_off            = -19,
 184     x24_off            = -18,
 185     x23_off            = -17,
 186     x22_off            = -16,
 187     x21_off            = -15,
 188     x20_off            = -14,
 189     x19_off            = -13,
 190     x18_off            = -12,
 191     x9_off             = -11,
 192 
 193     call_wrapper_off   = -10,
 194     result_off         = -9,
 195     result_type_off    = -8,
 196     method_off         = -7,
 197     entry_point_off    = -6,
 198     parameters_off     = -5,
 199     parameter_size_off = -4,
 200     thread_off         = -3,
 201     fp_f               = -2,
 202     retaddr_off        = -1,
 203   };
 204 
 205   address generate_call_stub(address& return_address) {
 206     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 207            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 208            "adjust this code");
 209 
 210     StubGenStubId stub_id = StubGenStubId::call_stub_id;
 211     StubCodeMark mark(this, stub_id);
 212     address start = __ pc();
 213 
 214     const Address sp_after_call (fp, sp_after_call_off  * wordSize);
 215 
 216     const Address frm_save      (fp, frm_off           * wordSize);
 217     const Address call_wrapper  (fp, call_wrapper_off   * wordSize);
 218     const Address result        (fp, result_off         * wordSize);
 219     const Address result_type   (fp, result_type_off    * wordSize);
 220     const Address method        (fp, method_off         * wordSize);
 221     const Address entry_point   (fp, entry_point_off    * wordSize);
 222     const Address parameters    (fp, parameters_off     * wordSize);
 223     const Address parameter_size(fp, parameter_size_off * wordSize);
 224 
 225     const Address thread        (fp, thread_off         * wordSize);
 226 
 227     const Address f27_save      (fp, f27_off            * wordSize);
 228     const Address f26_save      (fp, f26_off            * wordSize);
 229     const Address f25_save      (fp, f25_off            * wordSize);
 230     const Address f24_save      (fp, f24_off            * wordSize);
 231     const Address f23_save      (fp, f23_off            * wordSize);
 232     const Address f22_save      (fp, f22_off            * wordSize);
 233     const Address f21_save      (fp, f21_off            * wordSize);
 234     const Address f20_save      (fp, f20_off            * wordSize);
 235     const Address f19_save      (fp, f19_off            * wordSize);
 236     const Address f18_save      (fp, f18_off            * wordSize);
 237     const Address f9_save       (fp, f9_off             * wordSize);
 238     const Address f8_save       (fp, f8_off             * wordSize);
 239 
 240     const Address x27_save      (fp, x27_off            * wordSize);
 241     const Address x26_save      (fp, x26_off            * wordSize);
 242     const Address x25_save      (fp, x25_off            * wordSize);
 243     const Address x24_save      (fp, x24_off            * wordSize);
 244     const Address x23_save      (fp, x23_off            * wordSize);
 245     const Address x22_save      (fp, x22_off            * wordSize);
 246     const Address x21_save      (fp, x21_off            * wordSize);
 247     const Address x20_save      (fp, x20_off            * wordSize);
 248     const Address x19_save      (fp, x19_off            * wordSize);
 249     const Address x18_save      (fp, x18_off            * wordSize);
 250 
 251     const Address x9_save       (fp, x9_off             * wordSize);
 252 
 253     // stub code
 254 
 255     address riscv_entry = __ pc();
 256 
 257     // set up frame and move sp to end of save area
 258     __ enter();
 259     __ addi(sp, fp, sp_after_call_off * wordSize);
 260 
 261     // save register parameters and Java temporary/global registers
 262     // n.b. we save thread even though it gets installed in
 263     // xthread because we want to sanity check it later
 264     __ sd(c_rarg7, thread);
 265     __ sw(c_rarg6, parameter_size);
 266     __ sd(c_rarg5, parameters);
 267     __ sd(c_rarg4, entry_point);
 268     __ sd(c_rarg3, method);
 269     __ sd(c_rarg2, result_type);
 270     __ sd(c_rarg1, result);
 271     __ sd(c_rarg0, call_wrapper);
 272 
 273     __ sd(x9, x9_save);
 274 
 275     __ sd(x18, x18_save);
 276     __ sd(x19, x19_save);
 277     __ sd(x20, x20_save);
 278     __ sd(x21, x21_save);
 279     __ sd(x22, x22_save);
 280     __ sd(x23, x23_save);
 281     __ sd(x24, x24_save);
 282     __ sd(x25, x25_save);
 283     __ sd(x26, x26_save);
 284     __ sd(x27, x27_save);
 285 
 286     __ fsd(f8,  f8_save);
 287     __ fsd(f9,  f9_save);
 288     __ fsd(f18, f18_save);
 289     __ fsd(f19, f19_save);
 290     __ fsd(f20, f20_save);
 291     __ fsd(f21, f21_save);
 292     __ fsd(f22, f22_save);
 293     __ fsd(f23, f23_save);
 294     __ fsd(f24, f24_save);
 295     __ fsd(f25, f25_save);
 296     __ fsd(f26, f26_save);
 297     __ fsd(f27, f27_save);
 298 
 299     __ frrm(t0);
 300     __ sd(t0, frm_save);
 301     // Set frm to the state we need. We do want Round to Nearest. We
 302     // don't want non-IEEE rounding modes.
 303     Label skip_fsrmi;
 304     guarantee(__ RoundingMode::rne == 0, "must be");
 305     __ beqz(t0, skip_fsrmi);
 306     __ fsrmi(__ RoundingMode::rne);
 307     __ bind(skip_fsrmi);
 308 
 309     // install Java thread in global register now we have saved
 310     // whatever value it held
 311     __ mv(xthread, c_rarg7);
 312 
 313     // And method
 314     __ mv(xmethod, c_rarg3);
 315 
 316     // set up the heapbase register
 317     __ reinit_heapbase();
 318 
 319 #ifdef ASSERT
 320     // make sure we have no pending exceptions
 321     {
 322       Label L;
 323       __ ld(t0, Address(xthread, in_bytes(Thread::pending_exception_offset())));
 324       __ beqz(t0, L);
 325       __ stop("StubRoutines::call_stub: entered with pending exception");
 326       __ BIND(L);
 327     }
 328 #endif
 329     // pass parameters if any
 330     __ mv(esp, sp);
 331     __ slli(t0, c_rarg6, LogBytesPerWord);
 332     __ sub(t0, sp, t0); // Move SP out of the way
 333     __ andi(sp, t0, -2 * wordSize);
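         // sp now reserves room for the Java call's parameters below the
         // expression stack while staying 16-byte aligned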
 334 
 335     BLOCK_COMMENT("pass parameters if any");
 336     Label parameters_done;
 337     // parameter count is still in c_rarg6
 338     // and parameter pointer identifying param 1 is in c_rarg5
 339     __ beqz(c_rarg6, parameters_done);
 340 
 341     address loop = __ pc();
 342     __ ld(t0, Address(c_rarg5, 0));
 343     __ addi(c_rarg5, c_rarg5, wordSize);
 344     __ subi(c_rarg6, c_rarg6, 1);
 345     __ push_reg(t0);
 346     __ bgtz(c_rarg6, loop);
 347 
 348     __ BIND(parameters_done);
 349 
 350     // call Java entry -- passing Method* and current sp
 351     //      xmethod: Method*
 352     //      x19_sender_sp: sender sp
 353     BLOCK_COMMENT("call Java function");
 354     __ mv(x19_sender_sp, sp);
 355     __ jalr(c_rarg4);
 356 
 357     // save current address for use by exception handling code
 358 
 359     return_address = __ pc();
 360 
 361     // store result depending on type (everything that is not
 362     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 363     // n.b. this assumes Java returns an integral result in x10
 364     // and a floating result in j_farg0
 365     __ ld(j_rarg2, result);
 366     Label is_long, is_float, is_double, exit;
 367     __ ld(j_rarg1, result_type);
 368     __ mv(t0, (u1)T_OBJECT);
 369     __ beq(j_rarg1, t0, is_long);
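         // a T_OBJECT result comes back in x10 and is stored as a full word,
         // exactly like T_LONG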
 370     __ mv(t0, (u1)T_LONG);
 371     __ beq(j_rarg1, t0, is_long);
 372     __ mv(t0, (u1)T_FLOAT);
 373     __ beq(j_rarg1, t0, is_float);
 374     __ mv(t0, (u1)T_DOUBLE);
 375     __ beq(j_rarg1, t0, is_double);
 376 
 377     // handle T_INT case
 378     __ sw(x10, Address(j_rarg2));
 379 
 380     __ BIND(exit);
 381 
 382     // pop parameters
 383     __ addi(esp, fp, sp_after_call_off * wordSize);
 384 
 385 #ifdef ASSERT
 386     // verify that threads correspond
 387     {
 388       Label L, S;
 389       __ ld(t0, thread);
 390       __ bne(xthread, t0, S);
 391       __ get_thread(t0);
 392       __ beq(xthread, t0, L);
 393       __ BIND(S);
 394       __ stop("StubRoutines::call_stub: threads must correspond");
 395       __ BIND(L);
 396     }
 397 #endif
 398 
 399     __ pop_cont_fastpath(xthread);
 400 
 401     // restore callee-save registers
 402     __ fld(f27, f27_save);
 403     __ fld(f26, f26_save);
 404     __ fld(f25, f25_save);
 405     __ fld(f24, f24_save);
 406     __ fld(f23, f23_save);
 407     __ fld(f22, f22_save);
 408     __ fld(f21, f21_save);
 409     __ fld(f20, f20_save);
 410     __ fld(f19, f19_save);
 411     __ fld(f18, f18_save);
 412     __ fld(f9,  f9_save);
 413     __ fld(f8,  f8_save);
 414 
 415     __ ld(x27, x27_save);
 416     __ ld(x26, x26_save);
 417     __ ld(x25, x25_save);
 418     __ ld(x24, x24_save);
 419     __ ld(x23, x23_save);
 420     __ ld(x22, x22_save);
 421     __ ld(x21, x21_save);
 422     __ ld(x20, x20_save);
 423     __ ld(x19, x19_save);
 424     __ ld(x18, x18_save);
 425 
 426     __ ld(x9, x9_save);
 427 
 428     // restore frm
 429     Label skip_fsrm;
 430     __ ld(t0, frm_save);
 431     __ frrm(t1);
 432     __ beq(t0, t1, skip_fsrm);
 433     __ fsrm(t0);
 434     __ bind(skip_fsrm);
 435 
 436     __ ld(c_rarg0, call_wrapper);
 437     __ ld(c_rarg1, result);
 438     __ ld(c_rarg2, result_type);
 439     __ ld(c_rarg3, method);
 440     __ ld(c_rarg4, entry_point);
 441     __ ld(c_rarg5, parameters);
 442     __ ld(c_rarg6, parameter_size);
 443     __ ld(c_rarg7, thread);
 444 
 445     // leave frame and return to caller
 446     __ leave();
 447     __ ret();
 448 
 449     // handle return types different from T_INT
 450 
 451     __ BIND(is_long);
 452     __ sd(x10, Address(j_rarg2, 0));
 453     __ j(exit);
 454 
 455     __ BIND(is_float);
 456     __ fsw(j_farg0, Address(j_rarg2, 0), t0);
 457     __ j(exit);
 458 
 459     __ BIND(is_double);
 460     __ fsd(j_farg0, Address(j_rarg2, 0), t0);
 461     __ j(exit);
 462 
 463     return start;
 464   }
 465 
 466   // Return point for a Java call if there's an exception thrown in
 467   // Java code.  The exception is caught and transformed into a
 468   // pending exception stored in JavaThread that can be tested from
 469   // within the VM.
 470   //
 471   // Note: Usually the parameters are removed by the callee. In case
 472   // of an exception crossing an activation frame boundary, that is
 473   // not the case if the callee is compiled code => need to setup the
 474   // sp.
 475   //
 476   // x10: exception oop
 477 
 478   address generate_catch_exception() {
 479     StubGenStubId stub_id = StubGenStubId::catch_exception_id;
 480     StubCodeMark mark(this, stub_id);
 481     address start = __ pc();
 482 
 483     // same as in generate_call_stub():
 484     const Address thread(fp, thread_off * wordSize);
 485 
 486 #ifdef ASSERT
 487     // verify that threads correspond
 488     {
 489       Label L, S;
 490       __ ld(t0, thread);
 491       __ bne(xthread, t0, S);
 492       __ get_thread(t0);
 493       __ beq(xthread, t0, L);
 494       __ bind(S);
 495       __ stop("StubRoutines::catch_exception: threads must correspond");
 496       __ bind(L);
 497     }
 498 #endif
 499 
 500     // set pending exception
 501     __ verify_oop(x10);
 502 
 503     __ sd(x10, Address(xthread, Thread::pending_exception_offset()));
 504     __ mv(t0, (address)__FILE__);
 505     __ sd(t0, Address(xthread, Thread::exception_file_offset()));
 506     __ mv(t0, (int)__LINE__);
 507     __ sw(t0, Address(xthread, Thread::exception_line_offset()));
 508 
 509     // complete return to VM
 510     assert(StubRoutines::_call_stub_return_address != nullptr,
 511            "_call_stub_return_address must have been generated before");
 512     __ j(RuntimeAddress(StubRoutines::_call_stub_return_address));
 513 
 514     return start;
 515   }
 516 
 517   // Continuation point for runtime calls returning with a pending
 518   // exception.  The pending exception check happened in the runtime
 519   // or native call stub.  The pending exception in Thread is
 520   // converted into a Java-level exception.
 521   //
 522   // Contract with Java-level exception handlers:
 523   // x10: exception
 524   // x13: throwing pc
 525   //
 526   // NOTE: At entry of this stub, exception-pc must be in RA !!
 527 
 528   // NOTE: this is always used as a jump target within generated code
 529   // so it just needs to be generated code with no prolog
 530 
 531   address generate_forward_exception() {
 532     StubGenStubId stub_id = StubGenStubId::forward_exception_id;
 533     StubCodeMark mark(this, stub_id);
 534     address start = __ pc();
 535 
 536     // Upon entry, RA points to the return address returning into
 537     // Java (interpreted or compiled) code; i.e., the return address
 538     // becomes the throwing pc.
 539     //
 540     // Arguments pushed before the runtime call are still on the stack
 541     // but the exception handler will reset the stack pointer ->
 542     // ignore them.  A potential result in registers can be ignored as
 543     // well.
 544 
 545 #ifdef ASSERT
 546     // make sure this code is only executed if there is a pending exception
 547     {
 548       Label L;
 549       __ ld(t0, Address(xthread, Thread::pending_exception_offset()));
 550       __ bnez(t0, L);
 551       __ stop("StubRoutines::forward exception: no pending exception (1)");
 552       __ bind(L);
 553     }
 554 #endif
 555 
 556     // compute exception handler into x9
 557 
 558     // call the VM to find the handler address associated with the
 559     // caller address. pass thread in x10 and caller pc (ret address)
 560     // in x11. n.b. the caller pc is in ra, unlike x86 where it is on
 561     // the stack.
 562     __ mv(c_rarg1, ra);
 563     // ra will be trashed by the VM call so we move it to x9
 564     // (callee-saved) because we also need to pass it to the handler
 565     // returned by this call.
 566     __ mv(x9, ra);
 567     BLOCK_COMMENT("call exception_handler_for_return_address");
 568     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 569                          SharedRuntime::exception_handler_for_return_address),
 570                     xthread, c_rarg1);
 571     // we should not really care that ra is no longer the callee
 572     // address. we saved the value the handler needs in x9 so we can
 573     // just copy it to x13. however, the C2 handler will push its own
 574     // frame and then calls into the VM and the VM code asserts that
 575     // the PC for the frame above the handler belongs to a compiled
 576     // Java method. So, we restore ra here to satisfy that assert.
 577     __ mv(ra, x9);
 578     // setup x10 & x13 & clear pending exception
 579     __ mv(x13, x9);
 580     __ mv(x9, x10);
 581     __ ld(x10, Address(xthread, Thread::pending_exception_offset()));
 582     __ sd(zr, Address(xthread, Thread::pending_exception_offset()));
 583 
 584 #ifdef ASSERT
 585     // make sure exception is set
 586     {
 587       Label L;
 588       __ bnez(x10, L);
 589       __ stop("StubRoutines::forward exception: no pending exception (2)");
 590       __ bind(L);
 591     }
 592 #endif
 593 
 594     // continue at exception handler
 595     // x10: exception
 596     // x13: throwing pc
 597     // x9: exception handler
 598     __ verify_oop(x10);
 599     __ jr(x9);
 600 
 601     return start;
 602   }
 603 
 604   // Non-destructive plausibility checks for oops
 605   //
 606   // Arguments:
 607   //    x10: oop to verify
 608   //    t0: error message
 609   //
 610   // Stack after saving c_rarg3:
 611   //    [tos + 0]: saved c_rarg3
 612   //    [tos + 1]: saved c_rarg2
 613   //    [tos + 2]: saved ra
 614   //    [tos + 3]: saved t1
 615   //    [tos + 4]: saved x10
 616   //    [tos + 5]: saved t0
 617   address generate_verify_oop() {
 618 
 619     StubGenStubId stub_id = StubGenStubId::verify_oop_id;
 620     StubCodeMark mark(this, stub_id);
 621     address start = __ pc();
 622 
 623     Label exit, error;
 624 
 625     __ push_reg(RegSet::of(c_rarg2, c_rarg3), sp); // save c_rarg2 and c_rarg3
 626 
 627     __ la(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 628     __ ld(c_rarg3, Address(c_rarg2));
 629     __ addi(c_rarg3, c_rarg3, 1);
 630     __ sd(c_rarg3, Address(c_rarg2));
 631 
 632     // object is in x10
 633     // make sure object is 'reasonable'
 634     __ beqz(x10, exit); // if obj is null it is OK
 635 
 636     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
 637     bs_asm->check_oop(_masm, x10, c_rarg2, c_rarg3, error);
 638 
 639     // return if everything seems ok
 640     __ bind(exit);
 641 
 642     __ pop_reg(RegSet::of(c_rarg2, c_rarg3), sp);  // pop c_rarg2 and c_rarg3
 643     __ ret();
 644 
 645     // handle errors
 646     __ bind(error);
 647     __ pop_reg(RegSet::of(c_rarg2, c_rarg3), sp); // pop c_rarg2 and c_rarg3
 648 
 649     __ push_reg(RegSet::range(x0, x31), sp);
 650     // debug(char* msg, int64_t pc, int64_t regs[])
 651     __ mv(c_rarg0, t0);             // pass address of error message
 652     __ mv(c_rarg1, ra);             // pass return address
 653     __ mv(c_rarg2, sp);             // pass address of regs on stack
 654 #ifndef PRODUCT
 655     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 656 #endif
 657     BLOCK_COMMENT("call MacroAssembler::debug");
 658     __ rt_call(CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 659     __ ebreak();
 660 
 661     return start;
 662   }
 663 
 664   // The inner part of zero_words().
 665   //
 666   // Inputs:
 667   // x28: the HeapWord-aligned base address of an array to zero.
 668   // x29: the count in HeapWords, x29 > 0.
 669   //
 670   // Returns x28 and x29, adjusted for the caller to clear.
 671   // x28: the base address of the tail of words left to clear.
 672   // x29: the number of words in the tail.
 673   //      x29 < MacroAssembler::zero_words_block_size.
 674 
 675   address generate_zero_blocks() {
 676     Label done;
 677 
 678     const Register base = x28, cnt = x29, tmp1 = x30, tmp2 = x31;
 679 
 680     __ align(CodeEntryAlignment);
 681     StubGenStubId stub_id = StubGenStubId::zero_blocks_id;
 682     StubCodeMark mark(this, stub_id);
 683     address start = __ pc();
 684 
 685     if (UseBlockZeroing) {
 686       // Ensure the region is at least 2 * CacheLineSize bytes so that it
 687       // still deserves a cbo.zero after alignment.
 688       Label small;
 689       int low_limit = MAX2(2 * CacheLineSize, BlockZeroingLowLimit) / wordSize;
 690       __ mv(tmp1, low_limit);
 691       __ blt(cnt, tmp1, small);
 692       __ zero_dcache_blocks(base, cnt, tmp1, tmp2);
 693       __ bind(small);
 694     }
 695 
 696     {
 697       // Clear the remaining blocks.
 698       Label loop;
 699       __ mv(tmp1, MacroAssembler::zero_words_block_size);
 700       __ blt(cnt, tmp1, done);
 701       __ bind(loop);
 702       for (int i = 0; i < MacroAssembler::zero_words_block_size; i++) {
 703         __ sd(zr, Address(base, i * wordSize));
 704       }
 705       __ addi(base, base, MacroAssembler::zero_words_block_size * wordSize);
 706       __ subi(cnt, cnt, MacroAssembler::zero_words_block_size);
 707       __ bge(cnt, tmp1, loop);
 708       __ bind(done);
 709     }
 710 
 711     __ ret();
 712 
 713     return start;
 714   }
 715 
 716   typedef enum {
 717     copy_forwards = 1,
 718     copy_backwards = -1
 719   } copy_direction;
 720 
 721   // Bulk copy of blocks of 8 words.
 722   //
 723   // count is a count of words.
 724   //
 725   // Precondition: count >= 8
 726   //
 727   // Postconditions:
 728   //
 729   // The least significant bit of count contains the remaining count
 730   // of words to copy.  The rest of count is trash.
 731   //
 732   // s and d are adjusted to point to the remaining words to copy
 733   //
 734   void generate_copy_longs(StubGenStubId stub_id, Label &start,
 735                            Register s, Register d, Register count) {
 736     BasicType type;
 737     copy_direction direction;
 738     switch (stub_id) {
 739     case copy_byte_f_id:
 740       direction = copy_forwards;
 741       type = T_BYTE;
 742       break;
 743     case copy_byte_b_id:
 744       direction = copy_backwards;
 745       type = T_BYTE;
 746       break;
 747     default:
 748       ShouldNotReachHere();
 749     }
 750     int unit = wordSize * direction;
 751     int bias = wordSize;
 752 
 753     const Register tmp_reg0 = x13, tmp_reg1 = x14, tmp_reg2 = x15, tmp_reg3 = x16,
 754       tmp_reg4 = x17, tmp_reg5 = x7, tmp_reg6 = x28, tmp_reg7 = x29;
 755 
 756     const Register stride = x30;
 757 
 758     assert_different_registers(t0, tmp_reg0, tmp_reg1, tmp_reg2, tmp_reg3,
 759       tmp_reg4, tmp_reg5, tmp_reg6, tmp_reg7);
 760     assert_different_registers(s, d, count, t0);
 761 
 762     Label again, drain;
 763     StubCodeMark mark(this, stub_id);
 764     __ align(CodeEntryAlignment);
 765     __ bind(start);
 766 
 767     if (direction == copy_forwards) {
 768       __ sub(s, s, bias);
 769       __ sub(d, d, bias);
 770     }
 771 
 772 #ifdef ASSERT
 773     // Make sure we are never given < 8 words
 774     {
 775       Label L;
 776 
 777       __ mv(t0, 8);
 778       __ bge(count, t0, L);
 779       __ stop("generate_copy_longs called with < 8 words");
 780       __ bind(L);
 781     }
 782 #endif
 783 
 784     __ ld(tmp_reg0, Address(s, 1 * unit));
 785     __ ld(tmp_reg1, Address(s, 2 * unit));
 786     __ ld(tmp_reg2, Address(s, 3 * unit));
 787     __ ld(tmp_reg3, Address(s, 4 * unit));
 788     __ ld(tmp_reg4, Address(s, 5 * unit));
 789     __ ld(tmp_reg5, Address(s, 6 * unit));
 790     __ ld(tmp_reg6, Address(s, 7 * unit));
 791     __ ld(tmp_reg7, Address(s, 8 * unit));
 792     __ addi(s, s, 8 * unit);
 793 
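         // 8 words are already in flight; subtract 16 so a negative count means
         // fewer than 8 further words remain to load and we can go straight to drain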
 794     __ subi(count, count, 16);
 795     __ bltz(count, drain);
 796 
 797     __ bind(again);
 798 
 799     __ sd(tmp_reg0, Address(d, 1 * unit));
 800     __ sd(tmp_reg1, Address(d, 2 * unit));
 801     __ sd(tmp_reg2, Address(d, 3 * unit));
 802     __ sd(tmp_reg3, Address(d, 4 * unit));
 803     __ sd(tmp_reg4, Address(d, 5 * unit));
 804     __ sd(tmp_reg5, Address(d, 6 * unit));
 805     __ sd(tmp_reg6, Address(d, 7 * unit));
 806     __ sd(tmp_reg7, Address(d, 8 * unit));
 807 
 808     __ ld(tmp_reg0, Address(s, 1 * unit));
 809     __ ld(tmp_reg1, Address(s, 2 * unit));
 810     __ ld(tmp_reg2, Address(s, 3 * unit));
 811     __ ld(tmp_reg3, Address(s, 4 * unit));
 812     __ ld(tmp_reg4, Address(s, 5 * unit));
 813     __ ld(tmp_reg5, Address(s, 6 * unit));
 814     __ ld(tmp_reg6, Address(s, 7 * unit));
 815     __ ld(tmp_reg7, Address(s, 8 * unit));
 816 
 817     __ addi(s, s, 8 * unit);
 818     __ addi(d, d, 8 * unit);
 819 
 820     __ subi(count, count, 8);
 821     __ bgez(count, again);
 822 
 823     // Drain
 824     __ bind(drain);
 825 
 826     __ sd(tmp_reg0, Address(d, 1 * unit));
 827     __ sd(tmp_reg1, Address(d, 2 * unit));
 828     __ sd(tmp_reg2, Address(d, 3 * unit));
 829     __ sd(tmp_reg3, Address(d, 4 * unit));
 830     __ sd(tmp_reg4, Address(d, 5 * unit));
 831     __ sd(tmp_reg5, Address(d, 6 * unit));
 832     __ sd(tmp_reg6, Address(d, 7 * unit));
 833     __ sd(tmp_reg7, Address(d, 8 * unit));
 834     __ addi(d, d, 8 * unit);
 835 
 836     {
 837       Label L1, L2;
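           // copy the tail: 4 more words if bit 2 of count is set, then 2 more
           // words if bit 1 is set; a last odd word, if any, is left to the caller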
 838       __ test_bit(t0, count, 2);
 839       __ beqz(t0, L1);
 840 
 841       __ ld(tmp_reg0, Address(s, 1 * unit));
 842       __ ld(tmp_reg1, Address(s, 2 * unit));
 843       __ ld(tmp_reg2, Address(s, 3 * unit));
 844       __ ld(tmp_reg3, Address(s, 4 * unit));
 845       __ addi(s, s, 4 * unit);
 846 
 847       __ sd(tmp_reg0, Address(d, 1 * unit));
 848       __ sd(tmp_reg1, Address(d, 2 * unit));
 849       __ sd(tmp_reg2, Address(d, 3 * unit));
 850       __ sd(tmp_reg3, Address(d, 4 * unit));
 851       __ addi(d, d, 4 * unit);
 852 
 853       __ bind(L1);
 854 
 855       if (direction == copy_forwards) {
 856         __ addi(s, s, bias);
 857         __ addi(d, d, bias);
 858       }
 859 
 860       __ test_bit(t0, count, 1);
 861       __ beqz(t0, L2);
 862       if (direction == copy_backwards) {
 863         __ addi(s, s, 2 * unit);
 864         __ ld(tmp_reg0, Address(s));
 865         __ ld(tmp_reg1, Address(s, wordSize));
 866         __ addi(d, d, 2 * unit);
 867         __ sd(tmp_reg0, Address(d));
 868         __ sd(tmp_reg1, Address(d, wordSize));
 869       } else {
 870         __ ld(tmp_reg0, Address(s));
 871         __ ld(tmp_reg1, Address(s, wordSize));
 872         __ addi(s, s, 2 * unit);
 873         __ sd(tmp_reg0, Address(d));
 874         __ sd(tmp_reg1, Address(d, wordSize));
 875         __ addi(d, d, 2 * unit);
 876       }
 877       __ bind(L2);
 878     }
 879 
 880     __ ret();
 881   }
 882 
 883   Label copy_f, copy_b;
 884 
 885   typedef void (MacroAssembler::*copy_insn)(Register Rd, const Address &adr, Register temp);
 886 
 887   void copy_memory_v(Register s, Register d, Register count, int step) {
 888     bool is_backward = step < 0;
 889     int granularity = g_uabs(step);
 890 
 891     const Register src = x30, dst = x31, vl = x14, cnt = x15, tmp1 = x16, tmp2 = x17;
 892     assert_different_registers(s, d, cnt, vl, tmp1, tmp2);
 893     Assembler::SEW sew = Assembler::elembytes_to_sew(granularity);
 894     Label loop_forward, loop_backward, done;
 895 
 896     __ mv(dst, d);
 897     __ mv(src, s);
 898     __ mv(cnt, count);
 899 
 900     __ bind(loop_forward);
 901     __ vsetvli(vl, cnt, sew, Assembler::m8);
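         // vl = number of elements the vector unit will handle this pass
         // (bounded by VLMAX for this element width at LMUL = 8)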
 902     if (is_backward) {
 903       __ bne(vl, cnt, loop_backward);
 904     }
 905 
 906     __ vlex_v(v0, src, sew);
 907     __ sub(cnt, cnt, vl);
 908     if (sew != Assembler::e8) {
 909       // when sew == e8 (i.e., elem size is 1 byte), slli R, R, 0 is a nop and unnecessary
 910       __ slli(vl, vl, sew);
 911     }
 912     __ add(src, src, vl);
 913 
 914     __ vsex_v(v0, dst, sew);
 915     __ add(dst, dst, vl);
 916     __ bnez(cnt, loop_forward);
 917 
 918     if (is_backward) {
 919       __ j(done);
 920 
 921       __ bind(loop_backward);
 922       __ sub(t0, cnt, vl);
 923       if (sew != Assembler::e8) {
 924         // when sew == e8 (i.e., elem size is 1 byte), slli R, R, 0 is a nop and unnecessary
 925         __ slli(t0, t0, sew);
 926       }
 927       __ add(tmp1, s, t0);
 928       __ vlex_v(v0, tmp1, sew);
 929       __ add(tmp2, d, t0);
 930       __ vsex_v(v0, tmp2, sew);
 931       __ sub(cnt, cnt, vl);
 932       __ bnez(cnt, loop_forward);
 933       __ bind(done);
 934     }
 935   }
 936 
 937   // All-singing all-dancing memory copy.
 938   //
 939   // Copy count units of memory from s to d.  The size of a unit is
 940   // step, which can be positive or negative depending on the direction
 941   // of copy.
 942   //
 943   void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
 944                    Register s, Register d, Register count, int step) {
 945     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
 946     if (UseRVV && (!is_reference_type(type) || bs_asm->supports_rvv_arraycopy())) {
 947       return copy_memory_v(s, d, count, step);
 948     }
 949 
 950     bool is_backwards = step < 0;
 951     int granularity = g_uabs(step);
 952 
 953     const Register src = x30, dst = x31, cnt = x15, tmp3 = x16, tmp4 = x17, tmp5 = x14, tmp6 = x13;
 954     const Register gct1 = x28, gct2 = x29, gct3 = t2;
 955 
 956     Label same_aligned;
 957     Label copy_big, copy32_loop, copy8_loop, copy_small, done;
 958 
 959     // The size of copy32_loop body increases significantly with ZGC GC barriers.
 960     // Need conditional far branches to reach a point beyond the loop in this case.
 961     bool is_far = UseZGC;
 962 
 963     __ beqz(count, done, is_far);
 964     __ slli(cnt, count, exact_log2(granularity));
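         // cnt is now the copy size in bytes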
 965     if (is_backwards) {
 966       __ add(src, s, cnt);
 967       __ add(dst, d, cnt);
 968     } else {
 969       __ mv(src, s);
 970       __ mv(dst, d);
 971     }
 972 
 973     if (is_aligned) {
 974       __ subi(t0, cnt, 32);
 975       __ bgez(t0, copy32_loop);
 976       __ subi(t0, cnt, 8);
 977       __ bgez(t0, copy8_loop, is_far);
 978       __ j(copy_small);
 979     } else {
 980       __ mv(t0, 16);
 981       __ blt(cnt, t0, copy_small, is_far);
 982 
 983       __ xorr(t0, src, dst);
 984       __ andi(t0, t0, 0b111);
 985       __ bnez(t0, copy_small, is_far);
 986 
 987       __ bind(same_aligned);
 988       __ andi(t0, src, 0b111);
 989       __ beqz(t0, copy_big);
 990       if (is_backwards) {
 991         __ addi(src, src, step);
 992         __ addi(dst, dst, step);
 993       }
 994       bs_asm->copy_load_at(_masm, decorators, type, granularity, tmp3, Address(src), gct1);
 995       bs_asm->copy_store_at(_masm, decorators, type, granularity, Address(dst), tmp3, gct1, gct2, gct3);
 996       if (!is_backwards) {
 997         __ addi(src, src, step);
 998         __ addi(dst, dst, step);
 999       }
1000       __ subi(cnt, cnt, granularity);
1001       __ beqz(cnt, done, is_far);
1002       __ j(same_aligned);
1003 
1004       __ bind(copy_big);
1005       __ mv(t0, 32);
1006       __ blt(cnt, t0, copy8_loop, is_far);
1007     }
1008 
1009     __ bind(copy32_loop);
1010     if (is_backwards) {
1011       __ subi(src, src, wordSize * 4);
1012       __ subi(dst, dst, wordSize * 4);
1013     }
1014     // we first load 32 bytes, then write them, so the direction here doesn't matter
1015     bs_asm->copy_load_at(_masm, decorators, type, 8, tmp3, Address(src),     gct1);
1016     bs_asm->copy_load_at(_masm, decorators, type, 8, tmp4, Address(src, 8),  gct1);
1017     bs_asm->copy_load_at(_masm, decorators, type, 8, tmp5, Address(src, 16), gct1);
1018     bs_asm->copy_load_at(_masm, decorators, type, 8, tmp6, Address(src, 24), gct1);
1019 
1020     bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst),     tmp3, gct1, gct2, gct3);
1021     bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst, 8),  tmp4, gct1, gct2, gct3);
1022     bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst, 16), tmp5, gct1, gct2, gct3);
1023     bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst, 24), tmp6, gct1, gct2, gct3);
1024 
1025     if (!is_backwards) {
1026       __ addi(src, src, wordSize * 4);
1027       __ addi(dst, dst, wordSize * 4);
1028     }
1029     __ subi(t0, cnt, 32 + wordSize * 4);
1030     __ subi(cnt, cnt, wordSize * 4);
1031     __ bgez(t0, copy32_loop); // cnt >= 32, do next loop
1032 
1033     __ beqz(cnt, done); // if that's all - done
1034 
1035     __ subi(t0, cnt, 8); // if not - copy the remainder
1036     __ bltz(t0, copy_small); // cnt < 8, go to copy_small, else fall through to copy8_loop
1037 
1038     __ bind(copy8_loop);
1039     if (is_backwards) {
1040       __ subi(src, src, wordSize);
1041       __ subi(dst, dst, wordSize);
1042     }
1043     bs_asm->copy_load_at(_masm, decorators, type, 8, tmp3, Address(src), gct1);
1044     bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst), tmp3, gct1, gct2, gct3);
1045 
1046     if (!is_backwards) {
1047       __ addi(src, src, wordSize);
1048       __ addi(dst, dst, wordSize);
1049     }
1050     __ subi(t0, cnt, 8 + wordSize);
1051     __ subi(cnt, cnt, wordSize);
1052     __ bgez(t0, copy8_loop); // cnt >= 8, do next loop
1053 
1054     __ beqz(cnt, done); // if that's all - done
1055 
1056     __ bind(copy_small);
1057     if (is_backwards) {
1058       __ addi(src, src, step);
1059       __ addi(dst, dst, step);
1060     }
1061 
1062     bs_asm->copy_load_at(_masm, decorators, type, granularity, tmp3, Address(src), gct1);
1063     bs_asm->copy_store_at(_masm, decorators, type, granularity, Address(dst), tmp3, gct1, gct2, gct3);
1064 
1065     if (!is_backwards) {
1066       __ addi(src, src, step);
1067       __ addi(dst, dst, step);
1068     }
1069     __ subi(cnt, cnt, granularity);
1070     __ bgtz(cnt, copy_small);
1071 
1072     __ bind(done);
1073   }
1074 
1075   // Scan over array at a for count oops, verifying each one.
1076   // Preserves a and count, clobbers t0 and t1.
1077   void verify_oop_array(size_t size, Register a, Register count, Register temp) {
1078     Label loop, end;
1079     __ mv(t1, zr);
1080     __ slli(t0, count, exact_log2(size));
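         // t0 = size of the region in bytes, t1 = current byte offset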
1081     __ bind(loop);
1082     __ bgeu(t1, t0, end);
1083 
1084     __ add(temp, a, t1);
1085     if (size == (size_t)wordSize) {
1086       __ ld(temp, Address(temp, 0));
1087       __ verify_oop(temp);
1088     } else {
1089       __ lwu(temp, Address(temp, 0));
1090       __ decode_heap_oop(temp); // calls verify_oop
1091     }
1092     __ add(t1, t1, size);
1093     __ j(loop);
1094     __ bind(end);
1095   }
1096 
1097   // Arguments:
1098   //   stub_id - is used to name the stub and identify all details of
1099   //             how to perform the copy.
1100   //
1101   //   entry - is assigned to the stub's post push entry point unless
1102   //           it is null
1103   //
1104   // Inputs:
1105   //   c_rarg0   - source array address
1106   //   c_rarg1   - destination array address
1107   //   c_rarg2   - element count, treated as ssize_t, can be zero
1108   //
1109   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1110   // the hardware handle it.  The two dwords within qwords that span
1111   // cache line boundaries will still be loaded and stored atomically.
1112   //
1113   // Side Effects: entry is set to the (post push) entry point so it
1114   //               can be used by the corresponding conjoint copy
1115   //               method
1116   //
1117   address generate_disjoint_copy(StubGenStubId stub_id, address* entry) {
1118     size_t size;
1119     bool aligned;
1120     bool is_oop;
1121     bool dest_uninitialized;
1122     switch (stub_id) {
1123     case jbyte_disjoint_arraycopy_id:
1124       size = sizeof(jbyte);
1125       aligned = false;
1126       is_oop = false;
1127       dest_uninitialized = false;
1128       break;
1129     case arrayof_jbyte_disjoint_arraycopy_id:
1130       size = sizeof(jbyte);
1131       aligned = true;
1132       is_oop = false;
1133       dest_uninitialized = false;
1134       break;
1135     case jshort_disjoint_arraycopy_id:
1136       size = sizeof(jshort);
1137       aligned = false;
1138       is_oop = false;
1139       dest_uninitialized = false;
1140       break;
1141     case arrayof_jshort_disjoint_arraycopy_id:
1142       size = sizeof(jshort);
1143       aligned = true;
1144       is_oop = false;
1145       dest_uninitialized = false;
1146       break;
1147     case jint_disjoint_arraycopy_id:
1148       size = sizeof(jint);
1149       aligned = false;
1150       is_oop = false;
1151       dest_uninitialized = false;
1152       break;
1153     case arrayof_jint_disjoint_arraycopy_id:
1154       size = sizeof(jint);
1155       aligned = true;
1156       is_oop = false;
1157       dest_uninitialized = false;
1158       break;
1159     case jlong_disjoint_arraycopy_id:
1160       // since this is always aligned we can (should!) use the same
1161       // stub as for case arrayof_jlong_disjoint_arraycopy
1162       ShouldNotReachHere();
1163       break;
1164     case arrayof_jlong_disjoint_arraycopy_id:
1165       size = sizeof(jlong);
1166       aligned = true;
1167       is_oop = false;
1168       dest_uninitialized = false;
1169       break;
1170     case oop_disjoint_arraycopy_id:
1171       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1172       aligned = !UseCompressedOops;
1173       is_oop = true;
1174       dest_uninitialized = false;
1175       break;
1176     case arrayof_oop_disjoint_arraycopy_id:
1177       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1178       aligned = !UseCompressedOops;
1179       is_oop = true;
1180       dest_uninitialized = false;
1181       break;
1182     case oop_disjoint_arraycopy_uninit_id:
1183       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1184       aligned = !UseCompressedOops;
1185       is_oop = true;
1186       dest_uninitialized = true;
1187       break;
1188     case arrayof_oop_disjoint_arraycopy_uninit_id:
1189       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1190       aligned = !UseCompressedOops;
1191       is_oop = true;
1192       dest_uninitialized = true;
1193       break;
1194     default:
1195       ShouldNotReachHere();
1196       break;
1197     }
1198 
1199     const Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1200     RegSet saved_reg = RegSet::of(s, d, count);
1201     __ align(CodeEntryAlignment);
1202     StubCodeMark mark(this, stub_id);
1203     address start = __ pc();
1204     __ enter();
1205 
1206     if (entry != nullptr) {
1207       *entry = __ pc();
1208       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1209       BLOCK_COMMENT("Entry:");
1210     }
1211 
1212     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1213     if (dest_uninitialized) {
1214       decorators |= IS_DEST_UNINITIALIZED;
1215     }
1216     if (aligned) {
1217       decorators |= ARRAYCOPY_ALIGNED;
1218     }
1219 
1220     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1221     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
1222 
1223     if (is_oop) {
1224       // save regs before copy_memory
1225       __ push_reg(RegSet::of(d, count), sp);
1226     }
1227 
1228     {
1229       // UnsafeMemoryAccess page error: continue after unsafe access
1230       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1231       UnsafeMemoryAccessMark umam(this, add_entry, true);
1232       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
1233     }
1234 
1235     if (is_oop) {
1236       __ pop_reg(RegSet::of(d, count), sp);
1237       if (VerifyOops) {
1238         verify_oop_array(size, d, count, t2);
1239       }
1240     }
1241 
1242     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, t0, RegSet());
1243 
1244     __ leave();
1245     __ mv(x10, zr); // return 0
1246     __ ret();
1247     return start;
1248   }
1249 
1250   // Arguments:
1251   //   stub_id - is used to name the stub and identify all details of
1252   //             how to perform the copy.
1253   //
1254   //   nooverlap_target - identifies the (post push) entry for the
1255   //             corresponding disjoint copy routine which can be
1256   //             jumped to if the ranges do not actually overlap
1257   //
1258   //   entry - is assigned to the stub's post push entry point unless
1259   //           it is null
1260   //
1261   // Inputs:
1262   //   c_rarg0   - source array address
1263   //   c_rarg1   - destination array address
1264   //   c_rarg2   - element count, treated as ssize_t, can be zero
1265   //
1266   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1267   // the hardware handle it.  The two dwords within qwords that span
1268   // cache line boundaries will still be loaded and stored atomically.
1269   //
1270   // Side Effects:
1271   //   entry is set to the no-overlap entry point so it can be used by
1272   //   some other conjoint copy method
1273   //
1274   address generate_conjoint_copy(StubGenStubId stub_id, address nooverlap_target, address *entry) {
1275     const Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1276     RegSet saved_regs = RegSet::of(s, d, count);
1277     int size;
1278     bool aligned;
1279     bool is_oop;
1280     bool dest_uninitialized;
1281     switch (stub_id) {
1282     case jbyte_arraycopy_id:
1283       size = sizeof(jbyte);
1284       aligned = false;
1285       is_oop = false;
1286       dest_uninitialized = false;
1287       break;
1288     case arrayof_jbyte_arraycopy_id:
1289       size = sizeof(jbyte);
1290       aligned = true;
1291       is_oop = false;
1292       dest_uninitialized = false;
1293       break;
1294     case jshort_arraycopy_id:
1295       size = sizeof(jshort);
1296       aligned = false;
1297       is_oop = false;
1298       dest_uninitialized = false;
1299       break;
1300     case arrayof_jshort_arraycopy_id:
1301       size = sizeof(jshort);
1302       aligned = true;
1303       is_oop = false;
1304       dest_uninitialized = false;
1305       break;
1306     case jint_arraycopy_id:
1307       size = sizeof(jint);
1308       aligned = false;
1309       is_oop = false;
1310       dest_uninitialized = false;
1311       break;
1312     case arrayof_jint_arraycopy_id:
1313       size = sizeof(jint);
1314       aligned = true;
1315       is_oop = false;
1316       dest_uninitialized = false;
1317       break;
1318     case jlong_arraycopy_id:
1319       // since this is always aligned we can (should!) use the same
1320       // stub as for case arrayof_jlong_arraycopy
1321       ShouldNotReachHere();
1322       break;
1323     case arrayof_jlong_arraycopy_id:
1324       size = sizeof(jlong);
1325       aligned = true;
1326       is_oop = false;
1327       dest_uninitialized = false;
1328       break;
1329     case oop_arraycopy_id:
1330       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1331       aligned = !UseCompressedOops;
1332       is_oop = true;
1333       dest_uninitialized = false;
1334       break;
1335     case arrayof_oop_arraycopy_id:
1336       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1337       aligned = !UseCompressedOops;
1338       is_oop = true;
1339       dest_uninitialized = false;
1340       break;
1341     case oop_arraycopy_uninit_id:
1342       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1343       aligned = !UseCompressedOops;
1344       is_oop = true;
1345       dest_uninitialized = true;
1346       break;
1347     case arrayof_oop_arraycopy_uninit_id:
1348       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1349       aligned = !UseCompressedOops;
1350       is_oop = true;
1351       dest_uninitialized = true;
1352       break;
1353     default:
1354       ShouldNotReachHere();
1355     }
1356 
1357     StubCodeMark mark(this, stub_id);
1358     address start = __ pc();
1359     __ enter();
1360 
1361     if (entry != nullptr) {
1362       *entry = __ pc();
1363       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1364       BLOCK_COMMENT("Entry:");
1365     }
1366 
1367     // use fwd copy when (d-s) above_equal (count*size)
1368     __ sub(t0, d, s);
1369     __ slli(t1, count, exact_log2(size));
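         // t0 = (d - s) as an unsigned distance, t1 = copy size in bytes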
1370     Label L_continue;
1371     __ bltu(t0, t1, L_continue);
1372     __ j(nooverlap_target);
1373     __ bind(L_continue);
1374 
1375     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1376     if (dest_uninitialized) {
1377       decorators |= IS_DEST_UNINITIALIZED;
1378     }
1379     if (aligned) {
1380       decorators |= ARRAYCOPY_ALIGNED;
1381     }
1382 
1383     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1384     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
1385 
1386     if (is_oop) {
1387       // save regs before copy_memory
1388       __ push_reg(RegSet::of(d, count), sp);
1389     }
1390 
1391     {
1392       // UnsafeMemoryAccess page error: continue after unsafe access
1393       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1394       UnsafeMemoryAccessMark umam(this, add_entry, true);
1395       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
1396     }
1397 
1398     if (is_oop) {
1399       __ pop_reg(RegSet::of(d, count), sp);
1400       if (VerifyOops) {
1401         verify_oop_array(size, d, count, t2);
1402       }
1403     }
1404     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, t0, RegSet());
1405     __ leave();
1406     __ mv(x10, zr); // return 0
1407     __ ret();
1408     return start;
1409   }
1410 
1411   // Helper for generating a dynamic type check.
1412   // Smashes t0, t1.
1413   void generate_type_check(Register sub_klass,
1414                            Register super_check_offset,
1415                            Register super_klass,
1416                            Register result,
1417                            Register tmp1,
1418                            Register tmp2,
1419                            Label& L_success) {
1420     assert_different_registers(sub_klass, super_check_offset, super_klass);
1421 
1422     BLOCK_COMMENT("type_check:");
1423 
1424     Label L_miss;
1425 
1426     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, nullptr, super_check_offset);
1427     __ check_klass_subtype_slow_path(sub_klass, super_klass, tmp1, tmp2, &L_success, nullptr);
1428 
1429     // Fall through on failure!
1430     __ BIND(L_miss);
1431   }
1432 
1433   //
1434   //  Generate checkcasting array copy stub
1435   //
1436   //  Input:
1437   //    c_rarg0   - source array address
1438   //    c_rarg1   - destination array address
1439   //    c_rarg2   - element count, treated as ssize_t, can be zero
1440   //    c_rarg3   - size_t ckoff (super_check_offset)
1441   //    c_rarg4   - oop ckval (super_klass)
1442   //
1443   //  Output:
1444   //    x10 ==  0  -  success
1445   //    x10 == -1^K - failure, where K is partial transfer count
1446   //
1447   address generate_checkcast_copy(StubGenStubId stub_id, address* entry) {
1448     bool dest_uninitialized;
1449     switch (stub_id) {
1450     case checkcast_arraycopy_id:
1451       dest_uninitialized = false;
1452       break;
1453     case checkcast_arraycopy_uninit_id:
1454       dest_uninitialized = true;
1455       break;
1456     default:
1457       ShouldNotReachHere();
1458     }
1459 
1460     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1461 
1462     // Input registers (after setup_arg_regs)
1463     const Register from        = c_rarg0;   // source array address
1464     const Register to          = c_rarg1;   // destination array address
1465     const Register count       = c_rarg2;   // elements count
1466     const Register ckoff       = c_rarg3;   // super_check_offset
1467     const Register ckval       = c_rarg4;   // super_klass
1468 
1469     RegSet wb_pre_saved_regs   = RegSet::range(c_rarg0, c_rarg4);
1470     RegSet wb_post_saved_regs  = RegSet::of(count);
1471 
1472     // Registers used as temps (x7, x9, x18 are save-on-entry)
1473     const Register count_save  = x19;       // orig elements count
1474     const Register start_to    = x18;       // destination array start address
1475     const Register copied_oop  = x7;        // actual oop copied
1476     const Register r9_klass    = x9;        // oop._klass
1477 
1478     // Registers used as gc temps (x15, x16, x17 are save-on-call)
1479     const Register gct1 = x15, gct2 = x16, gct3 = x17;
1480 
1481     //---------------------------------------------------------------
1482     // Assembler stub will be used for this call to arraycopy
1483     // if the two arrays are subtypes of Object[] but the
1484     // destination array type is not equal to or a supertype
1485     // of the source type.  Each element must be separately
1486     // checked.
1487 
1488     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1489                                copied_oop, r9_klass, count_save);
1490 
1491     __ align(CodeEntryAlignment);
1492     StubCodeMark mark(this, stub_id);
1493     address start = __ pc();
1494 
1495     __ enter(); // required for proper stackwalking of RuntimeStub frame
1496 
1497     // Caller of this entry point must set up the argument registers.
1498     if (entry != nullptr) {
1499       *entry = __ pc();
1500       BLOCK_COMMENT("Entry:");
1501     }
1502 
1503     // Empty array:  Nothing to do
1504     __ beqz(count, L_done);
1505 
1506     __ push_reg(RegSet::of(x7, x9, x18, x19), sp);
1507 
1508 #ifdef ASSERT
1509     BLOCK_COMMENT("assert consistent ckoff/ckval");
1510     // The ckoff and ckval must be mutually consistent,
1511     // even though caller generates both.
1512     { Label L;
1513       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1514       __ lwu(start_to, Address(ckval, sco_offset));
1515       __ beq(ckoff, start_to, L);
1516       __ stop("super_check_offset inconsistent");
1517       __ bind(L);
1518     }
1519 #endif //ASSERT
1520 
1521     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1522     if (dest_uninitialized) {
1523       decorators |= IS_DEST_UNINITIALIZED;
1524     }
1525 
1526     bool is_oop = true;
1527     int element_size = UseCompressedOops ? 4 : 8;
1528 
1529     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1530     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
1531 
1532     // save the original count
1533     __ mv(count_save, count);
1534 
1535     // Copy from low to high addresses
1536     __ mv(start_to, to);              // Save destination array start address
1537     __ j(L_load_element);
1538 
1539     // ======== begin loop ========
1540     // (Loop is rotated; its entry is L_load_element.)
1541     // Loop control:
1542     //   do {
1543     //     copied_oop = load_heap_oop(from++)
1544     //     ... generate_type_check ...
1545     //     store_heap_oop(to++, copied_oop)
1546     //   } while (--count != 0)
1547 
1548     __ align(OptoLoopAlignment);
1549 
1550     __ BIND(L_store_element);
1551     bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
1552                       Address(to, 0), copied_oop,
1553                       gct1, gct2, gct3);
1554     __ addi(to, to, UseCompressedOops ? 4 : 8);
1555     __ subi(count, count, 1);
1556     __ beqz(count, L_do_card_marks);
1557 
1558     // ======== loop entry is here ========
1559     __ BIND(L_load_element);
1560     bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
1561                      copied_oop, Address(from, 0),
1562                      gct1);
1563     __ addi(from, from, UseCompressedOops ? 4 : 8);
1564     __ beqz(copied_oop, L_store_element);
1565 
1566     __ load_klass(r9_klass, copied_oop);// query the object klass
1567 
1568     BLOCK_COMMENT("type_check:");
1569     generate_type_check(r9_klass, /*sub_klass*/
1570                         ckoff,    /*super_check_offset*/
1571                         ckval,    /*super_klass*/
1572                         x10,      /*result*/
1573                         gct1,     /*tmp1*/
1574                         gct2,     /*tmp2*/
1575                         L_store_element);
1576 
1577     // Fall through on failure!
1578 
1579     // ======== end loop ========
1580 
1581     // It was a real error; we must depend on the caller to finish the job.
1582     // Register count = remaining oops, count_save = total oops.
1583     // Emit GC store barriers for the oops we have copied and report
1584     // their number to the caller.
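         // For example, if 3 of 10 oops were copied before the failing element,
         // x10 is set to -1^3 == -4; the caller recovers K by re-complementing.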
1585 
1586     __ sub(count, count_save, count);     // K = partially copied oop count
1587     __ xori(count, count, -1);            // report (-1^K) to caller
1588     __ beqz(count, L_done_pop);
1589 
1590     __ BIND(L_do_card_marks);
1591     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, t0, wb_post_saved_regs);
1592 
1593     __ bind(L_done_pop);
1594     __ pop_reg(RegSet::of(x7, x9, x18, x19), sp);
1595     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1596 
1597     __ bind(L_done);
1598     __ mv(x10, count);
1599     __ leave();
1600     __ ret();
1601 
1602     return start;
1603   }
1604 
1605   // Perform range checks on the proposed arraycopy.
1606   // Kills temp, but nothing else.
1607   // Also, clears the upper 32 bits of src_pos and dst_pos.
1608   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1609                               Register src_pos, // source position (c_rarg1)
1610                               Register dst,     // destination array oop (c_rarg2)
1611                               Register dst_pos, // destination position (c_rarg3)
1612                               Register length,
1613                               Register temp,
1614                               Label& L_failed) {
1615     BLOCK_COMMENT("arraycopy_range_checks:");
1616 
1617     assert_different_registers(t0, temp);
1618 
1619     // if [src_pos + length > arrayOop(src)->length()] then FAIL
1620     __ lwu(t0, Address(src, arrayOopDesc::length_offset_in_bytes()));
1621     __ addw(temp, length, src_pos);
1622     __ bgtu(temp, t0, L_failed);
1623 
1624     // if [dst_pos + length > arrayOop(dst)->length()] then FAIL
1625     __ lwu(t0, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1626     __ addw(temp, length, dst_pos);
1627     __ bgtu(temp, t0, L_failed);
1628 
1629     // Have to clear the upper 32 bits of 'src_pos' and 'dst_pos': they are
         // 32-bit ints and are used below in 64-bit address arithmetic.
1630     __ zext(src_pos, src_pos, 32);
1631     __ zext(dst_pos, dst_pos, 32);
1632 
1633     BLOCK_COMMENT("arraycopy_range_checks done");
1634   }
1635 
1636   address generate_unsafecopy_common_error_exit() {
1637     address start = __ pc();
1638     __ mv(x10, 0);
1639     __ leave();
1640     __ ret();
1641     return start;
1642   }
1643 
1644   //
1645   //  Generate 'unsafe' set memory stub
1646   //  Though just as safe as the other stubs, it takes an unscaled
1647   //  size_t (# bytes) argument instead of an element count.
1648   //
1649   //  Input:
1650   //    c_rarg0   - destination array address
1651   //    c_rarg1   - byte count (size_t)
1652   //    c_rarg2   - byte value
1653   //
1654   address generate_unsafe_setmemory() {
1655     __ align(CodeEntryAlignment);
1656     StubGenStubId stub_id = StubGenStubId::unsafe_setmemory_id;
1657     StubCodeMark mark(this, stub_id);
1658     address start = __ pc();
1659 
1660     // bump this on entry, not on exit:
1661     // inc_counter_np(SharedRuntime::_unsafe_set_memory_ctr);
1662 
1663     Label L_fill_elements;
1664 
1665     const Register dest = c_rarg0;
1666     const Register count = c_rarg1;
1667     const Register value = c_rarg2;
1668     const Register cnt_words = x28; // temp register
1669     const Register tmp_reg   = x29; // temp register
1670 
1671     // Mark the remaining code as code that performs Unsafe accesses.
1672     UnsafeMemoryAccessMark umam(this, true, false);
1673 
1674     __ enter(); // required for proper stackwalking of RuntimeStub frame
1675 
1676     // if count < 8, jump to L_fill_elements
1677     __ mv(tmp_reg, 8); // fills shorter than 8 bytes are done element by element
1678     __ bltu(count, tmp_reg, L_fill_elements);
1679 
1680     // Propagate byte to 64-bit width
1681     // 8 bit -> 16 bit
1682     __ zext(value, value, 8);
1683     __ slli(tmp_reg, value, 8);
1684     __ orr(value, value, tmp_reg);
1685     // 16 bit -> 32 bit
1686     __ slli(tmp_reg, value, 16);
1687     __ orr(value, value, tmp_reg);
1688     // 32 bit -> 64 bit
1689     __ slli(tmp_reg, value, 32);
1690     __ orr(value, value, tmp_reg);
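         // e.g. value == 0xAB now becomes 0xABABABABABABABAB, so a single
         // 64-bit store writes eight copies of the byte.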
1691 
1692     // Align the destination address to an 8-byte boundary.
1693     Label L_skip_align1, L_skip_align2, L_skip_align4;
1694     // One byte misalignment happens.
1695     __ test_bit(tmp_reg, dest, 0);
1696     __ beqz(tmp_reg, L_skip_align1);
1697     __ sb(value, Address(dest, 0));
1698     __ addi(dest, dest, 1);
1699     __ subi(count, count, 1);
1700 
1701     __ bind(L_skip_align1);
1702     // Two bytes misalignment happens.
1703     __ test_bit(tmp_reg, dest, 1);
1704     __ beqz(tmp_reg, L_skip_align2);
1705     __ sh(value, Address(dest, 0));
1706     __ addi(dest, dest, 2);
1707     __ subi(count, count, 2);
1708 
1709     __ bind(L_skip_align2);
1710     // Four bytes misalignment happens.
1711     __ test_bit(tmp_reg, dest, 2);
1712     __ beqz(tmp_reg, L_skip_align4);
1713     __ sw(value, Address(dest, 0));
1714     __ addi(dest, dest, 4);
1715     __ subi(count, count, 4);
1716     __ bind(L_skip_align4);
1717 
1718     //  Fill large chunks
1719     __ srli(cnt_words, count, 3); // number of words
1720     __ slli(tmp_reg, cnt_words, 3);
1721     __ sub(count, count, tmp_reg);
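         // cnt_words = count / 8 and count = count % 8: fill_words below stores the
         // bulk, and the 0..7 trailing bytes fall through to L_fill_elements.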
1722     {
1723       __ fill_words(dest, cnt_words, value);
1724     }
1725 
1726     // Handle fills of fewer than 8 bytes
1727     __ bind(L_fill_elements);
1728     Label L_fill_2, L_fill_1, L_exit;
1729     __ test_bit(tmp_reg, count, 2);
1730     __ beqz(tmp_reg, L_fill_2);
1731     __ sb(value, Address(dest, 0));
1732     __ sb(value, Address(dest, 1));
1733     __ sb(value, Address(dest, 2));
1734     __ sb(value, Address(dest, 3));
1735     __ addi(dest, dest, 4);
1736 
1737     __ bind(L_fill_2);
1738     __ test_bit(tmp_reg, count, 1);
1739     __ beqz(tmp_reg, L_fill_1);
1740     __ sb(value, Address(dest, 0));
1741     __ sb(value, Address(dest, 1));
1742     __ addi(dest, dest, 2);
1743 
1744     __ bind(L_fill_1);
1745     __ test_bit(tmp_reg, count, 0);
1746     __ beqz(tmp_reg, L_exit);
1747     __ sb(value, Address(dest, 0));
1748 
1749     __ bind(L_exit);
1750     __ leave();
1751     __ ret();
1752 
1753     return start;
1754   }
1755 
1756   //
1757   //  Generate 'unsafe' array copy stub
1758   //  Though just as safe as the other stubs, it takes an unscaled
1759   //  size_t argument instead of an element count.
1760   //
1761   //  Input:
1762   //    c_rarg0   - source array address
1763   //    c_rarg1   - destination array address
1764   //    c_rarg2   - byte count, treated as ssize_t, can be zero
1765   //
1766   // Examines the alignment of the operands and dispatches
1767   // to a long, int, short, or byte copy loop.
1768   //
1769   address generate_unsafe_copy(address byte_copy_entry,
1770                                address short_copy_entry,
1771                                address int_copy_entry,
1772                                address long_copy_entry) {
1773     assert_cond(byte_copy_entry != nullptr && short_copy_entry != nullptr &&
1774                 int_copy_entry != nullptr && long_copy_entry != nullptr);
1775     Label L_long_aligned, L_int_aligned, L_short_aligned;
1776     const Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1777 
1778     __ align(CodeEntryAlignment);
1779     StubGenStubId stub_id = StubGenStubId::unsafe_arraycopy_id;
1780     StubCodeMark mark(this, stub_id);
1781     address start = __ pc();
1782     __ enter(); // required for proper stackwalking of RuntimeStub frame
1783 
1784     // bump this on entry, not on exit:
1785     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1786 
1787     __ orr(t0, s, d);
1788     __ orr(t0, t0, count);
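         // t0 == (s | d | count): a low bit of t0 is clear only if it is clear in
         // all three, so the chosen copy granularity divides the source address,
         // the destination address and the byte count.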
1789 
1790     __ andi(t0, t0, BytesPerLong - 1);
1791     __ beqz(t0, L_long_aligned);
1792     __ andi(t0, t0, BytesPerInt - 1);
1793     __ beqz(t0, L_int_aligned);
1794     __ test_bit(t0, t0, 0);
1795     __ beqz(t0, L_short_aligned);
1796     __ j(RuntimeAddress(byte_copy_entry));
1797 
1798     __ BIND(L_short_aligned);
1799     __ srli(count, count, LogBytesPerShort);  // size => short_count
1800     __ j(RuntimeAddress(short_copy_entry));
1801     __ BIND(L_int_aligned);
1802     __ srli(count, count, LogBytesPerInt);    // size => int_count
1803     __ j(RuntimeAddress(int_copy_entry));
1804     __ BIND(L_long_aligned);
1805     __ srli(count, count, LogBytesPerLong);   // size => long_count
1806     __ j(RuntimeAddress(long_copy_entry));
1807 
1808     return start;
1809   }
1810 
1811   //
1812   //  Generate generic array copy stubs
1813   //
1814   //  Input:
1815   //    c_rarg0    -  src oop
1816   //    c_rarg1    -  src_pos (32-bits)
1817   //    c_rarg2    -  dst oop
1818   //    c_rarg3    -  dst_pos (32-bits)
1819   //    c_rarg4    -  element count (32-bits)
1820   //
1821   //  Output:
1822   //    x10 ==  0  -  success
1823   //    x10 == -1^K - failure, where K is partial transfer count
1824   //
1825   address generate_generic_copy(address byte_copy_entry, address short_copy_entry,
1826                                 address int_copy_entry, address oop_copy_entry,
1827                                 address long_copy_entry, address checkcast_copy_entry) {
1828     assert_cond(byte_copy_entry != nullptr && short_copy_entry != nullptr &&
1829                 int_copy_entry != nullptr && oop_copy_entry != nullptr &&
1830                 long_copy_entry != nullptr && checkcast_copy_entry != nullptr);
1831     Label L_failed, L_failed_0, L_objArray;
1832     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
1833 
1834     // Input registers
1835     const Register src        = c_rarg0;  // source array oop
1836     const Register src_pos    = c_rarg1;  // source position
1837     const Register dst        = c_rarg2;  // destination array oop
1838     const Register dst_pos    = c_rarg3;  // destination position
1839     const Register length     = c_rarg4;
1840 
1841     // Registers used as temps
1842     const Register dst_klass = c_rarg5;
1843 
1844     __ align(CodeEntryAlignment);
1845 
1846     StubGenStubId stub_id = StubGenStubId::generic_arraycopy_id;
1847     StubCodeMark mark(this, stub_id);
1848 
1849     address start = __ pc();
1850 
1851     __ enter(); // required for proper stackwalking of RuntimeStub frame
1852 
1853     // bump this on entry, not on exit:
1854     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
1855 
1856     //-----------------------------------------------------------------------
1857     // Assembler stub will be used for this call to arraycopy
1858     // if the following conditions are met:
1859     //
1860     // (1) src and dst must not be null.
1861     // (2) src_pos must not be negative.
1862     // (3) dst_pos must not be negative.
1863     // (4) length  must not be negative.
1864     // (5) src klass and dst klass should be the same and not null.
1865     // (6) src and dst should be arrays.
1866     // (7) src_pos + length must not exceed length of src.
1867     // (8) dst_pos + length must not exceed length of dst.
1868     //
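     // Any check that fails branches to L_failed, which returns -1
     // (i.e. failure with zero elements transferred).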
1869 
1870     // if src is null then return -1
1871     __ beqz(src, L_failed);
1872 
1873     // if [src_pos < 0] then return -1
1874     __ sext(t0, src_pos, 32);
1875     __ bltz(t0, L_failed);
1876 
1877     // if dst is null then return -1
1878     __ beqz(dst, L_failed);
1879 
1880     // if [dst_pos < 0] then return -1
1881     __ sext(t0, dst_pos, 32);
1882     __ bltz(t0, L_failed);
1883 
1884     // registers used as temp
1885     const Register scratch_length    = x28; // elements count to copy
1886     const Register scratch_src_klass = x29; // array klass
1887     const Register lh                = x30; // layout helper
1888 
1889     // if [length < 0] then return -1
1890     __ sext(scratch_length, length, 32); // length (elements count, 32-bits value)
1891     __ bltz(scratch_length, L_failed);
1892 
1893     __ load_klass(scratch_src_klass, src);
1894 #ifdef ASSERT
1895     {
1896       BLOCK_COMMENT("assert klasses not null {");
1897       Label L1, L2;
1898       __ bnez(scratch_src_klass, L2);   // it is broken if klass is null
1899       __ bind(L1);
1900       __ stop("broken null klass");
1901       __ bind(L2);
1902       __ load_klass(t0, dst, t1);
1903       __ beqz(t0, L1);     // this would be broken also
1904       BLOCK_COMMENT("} assert klasses not null done");
1905     }
1906 #endif
1907 
1908     // Load layout helper (32-bits)
1909     //
1910     //  |array_tag|     | header_size | element_type |     |log2_element_size|
1911     // 32        30    24            16              8     2                 0
1912     //
1913     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
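         //   For example, a typeArray of jints has array_tag 0x3 (hence lh < 0),
         //   log2_element_size == 2 and header_size == arrayOopDesc::base_offset_in_bytes(T_INT).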
1914     //
1915 
1916     const int lh_offset = in_bytes(Klass::layout_helper_offset());
1917 
1918     // Handle objArrays completely differently...
1919     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
1920     __ lw(lh, Address(scratch_src_klass, lh_offset));
1921     __ mv(t0, objArray_lh);
1922     __ beq(lh, t0, L_objArray);
1923 
1924     // if [src->klass() != dst->klass()] then return -1
1925     __ load_klass(t1, dst);
1926     __ bne(t1, scratch_src_klass, L_failed);
1927 
1928     // if src is not an array then return -1
1929     // i.e. (lh >= 0)
1930     __ bgez(lh, L_failed);
1931 
1932     // At this point, it is known to be a typeArray (array_tag 0x3).
1933 #ifdef ASSERT
1934     {
1935       BLOCK_COMMENT("assert primitive array {");
1936       Label L;
1937       __ mv(t1, (int32_t)(Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift));
1938       __ bge(lh, t1, L);
1939       __ stop("must be a primitive array");
1940       __ bind(L);
1941       BLOCK_COMMENT("} assert primitive array done");
1942     }
1943 #endif
1944 
1945     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
1946                            t1, L_failed);
1947 
1948     // TypeArrayKlass
1949     //
1950     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize)
1951     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize)
1952     //
1953 
1954     const Register t0_offset = t0;    // array offset
1955     const Register x30_elsize = lh;   // element size
1956 
1957     // Get array_header_in_bytes()
1958     int lh_header_size_width = exact_log2(Klass::_lh_header_size_mask + 1);
1959     int lh_header_size_msb = Klass::_lh_header_size_shift + lh_header_size_width;
1960     __ slli(t0_offset, lh, XLEN - lh_header_size_msb);          // shift left to drop the bits above the header_size field
1961     __ srli(t0_offset, t0_offset, XLEN - lh_header_size_width); // array_offset
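         // The two shifts isolate the header_size bitfield of lh: shifting left by
         // (XLEN - msb) discards the bits above it, and shifting right by
         // (XLEN - width) drops the bits below it, zero-filling from the top.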
1962 
1963     __ add(src, src, t0_offset);           // src array offset
1964     __ add(dst, dst, t0_offset);           // dst array offset
1965     BLOCK_COMMENT("choose copy loop based on element size");
1966 
1967     // next registers should be set before the jump to corresponding stub
1968     const Register from     = c_rarg0;  // source array address
1969     const Register to       = c_rarg1;  // destination array address
1970     const Register count    = c_rarg2;  // elements count
1971 
1972     // 'from', 'to' and 'count' must be set up in this order,
1973     // since they alias 'src', 'src_pos' and 'dst'.
1974 
1975     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
1976 
1977     // The possible values of elsize are 0-3, i.e. exact_log2(element
1978     // size in bytes).  We do a simple bitwise binary search.
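         // (elsize bit1, bit0): (0,0) -> bytes, (0,1) -> shorts, (1,0) -> ints, (1,1) -> longs.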
1979   __ BIND(L_copy_bytes);
1980     __ test_bit(t0, x30_elsize, 1);
1981     __ bnez(t0, L_copy_ints);
1982     __ test_bit(t0, x30_elsize, 0);
1983     __ bnez(t0, L_copy_shorts);
1984     __ add(from, src, src_pos); // src_addr
1985     __ add(to, dst, dst_pos); // dst_addr
1986     __ sext(count, scratch_length, 32); // length
1987     __ j(RuntimeAddress(byte_copy_entry));
1988 
1989   __ BIND(L_copy_shorts);
1990     __ shadd(from, src_pos, src, t0, 1); // src_addr
1991     __ shadd(to, dst_pos, dst, t0, 1); // dst_addr
1992     __ sext(count, scratch_length, 32); // length
1993     __ j(RuntimeAddress(short_copy_entry));
1994 
1995   __ BIND(L_copy_ints);
1996     __ test_bit(t0, x30_elsize, 0);
1997     __ bnez(t0, L_copy_longs);
1998     __ shadd(from, src_pos, src, t0, 2); // src_addr
1999     __ shadd(to, dst_pos, dst, t0, 2); // dst_addr
2000     __ sext(count, scratch_length, 32); // length
2001     __ j(RuntimeAddress(int_copy_entry));
2002 
2003   __ BIND(L_copy_longs);
2004 #ifdef ASSERT
2005     {
2006       BLOCK_COMMENT("assert long copy {");
2007       Label L;
2008       __ andi(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> x30_elsize
2009       __ sext(lh, lh, 32);
2010       __ mv(t0, LogBytesPerLong);
2011       __ beq(x30_elsize, t0, L);
2012       __ stop("must be long copy, but elsize is wrong");
2013       __ bind(L);
2014       BLOCK_COMMENT("} assert long copy done");
2015     }
2016 #endif
2017     __ shadd(from, src_pos, src, t0, 3); // src_addr
2018     __ shadd(to, dst_pos, dst, t0, 3); // dst_addr
2019     __ sext(count, scratch_length, 32); // length
2020     __ j(RuntimeAddress(long_copy_entry));
2021 
2022     // ObjArrayKlass
2023   __ BIND(L_objArray);
2024     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2025 
2026     Label L_plain_copy, L_checkcast_copy;
2027     // test array classes for subtyping
2028     __ load_klass(t2, dst);
2029     __ bne(scratch_src_klass, t2, L_checkcast_copy); // usual case is exact equality
2030 
2031     // Identically typed arrays can be copied without element-wise checks.
2032     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2033                            t1, L_failed);
2034 
2035     __ shadd(from, src_pos, src, t0, LogBytesPerHeapOop);
2036     __ addi(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2037     __ shadd(to, dst_pos, dst, t0, LogBytesPerHeapOop);
2038     __ addi(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2039     __ sext(count, scratch_length, 32); // length
2040   __ BIND(L_plain_copy);
2041     __ j(RuntimeAddress(oop_copy_entry));
2042 
2043   __ BIND(L_checkcast_copy);
2044     // live at this point:  scratch_src_klass, scratch_length, t2 (dst_klass)
2045     {
2046       // Before looking at dst.length, make sure dst is also an objArray.
2047       __ lwu(t0, Address(t2, lh_offset));
2048       __ mv(t1, objArray_lh);
2049       __ bne(t0, t1, L_failed);
2050 
2051       // It is safe to examine both src.length and dst.length.
2052       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2053                              t2, L_failed);
2054 
2055       __ load_klass(dst_klass, dst); // reload
2056 
2057       // Marshal the base address arguments now, freeing registers.
2058       __ shadd(from, src_pos, src, t0, LogBytesPerHeapOop);
2059       __ addi(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2060       __ shadd(to, dst_pos, dst, t0, LogBytesPerHeapOop);
2061       __ addi(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2062       __ sext(count, length, 32); // length (reloaded)
2063       const Register sco_temp = c_rarg3; // this register is free now
2064       assert_different_registers(from, to, count, sco_temp,
2065                                  dst_klass, scratch_src_klass);
2066 
2067       // Generate the type check.
2068       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2069       __ lwu(sco_temp, Address(dst_klass, sco_offset));
2070 
2071       // Smashes t0, t1
2072       generate_type_check(scratch_src_klass, sco_temp, dst_klass, noreg, noreg, noreg, L_plain_copy);
2073 
2074       // Fetch destination element klass from the ObjArrayKlass header.
2075       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2076       __ ld(dst_klass, Address(dst_klass, ek_offset));
2077       __ lwu(sco_temp, Address(dst_klass, sco_offset));
2078 
2079       // the checkcast_copy loop needs two extra arguments:
2080       assert(c_rarg3 == sco_temp, "#3 already in place");
2081       // Set up arguments for checkcast_copy_entry.
2082       __ mv(c_rarg4, dst_klass);  // dst.klass.element_klass
2083       __ j(RuntimeAddress(checkcast_copy_entry));
2084     }
2085 
2086   __ BIND(L_failed);
2087     __ mv(x10, -1);
2088     __ leave();   // required for proper stackwalking of RuntimeStub frame
2089     __ ret();
2090 
2091     return start;
2092   }
2093 
2094   //
2095   // Generate stub for array fill. If "aligned" is true, the
2096   // "to" address is assumed to be heapword aligned.
2097   //
2098   // Arguments for generated stub:
2099   //   to:    c_rarg0
2100   //   value: c_rarg1
2101   //   count: c_rarg2 treated as signed
2102   //
2103   address generate_fill(StubGenStubId stub_id) {
2104     BasicType t;
2105     bool aligned;
2106 
2107     switch (stub_id) {
2108     case jbyte_fill_id:
2109       t = T_BYTE;
2110       aligned = false;
2111       break;
2112     case jshort_fill_id:
2113       t = T_SHORT;
2114       aligned = false;
2115       break;
2116     case jint_fill_id:
2117       t = T_INT;
2118       aligned = false;
2119       break;
2120     case arrayof_jbyte_fill_id:
2121       t = T_BYTE;
2122       aligned = true;
2123       break;
2124     case arrayof_jshort_fill_id:
2125       t = T_SHORT;
2126       aligned = true;
2127       break;
2128     case arrayof_jint_fill_id:
2129       t = T_INT;
2130       aligned = true;
2131       break;
2132     default:
2133       ShouldNotReachHere();
2134     };
2135 
2136     __ align(CodeEntryAlignment);
2137     StubCodeMark mark(this, stub_id);
2138     address start = __ pc();
2139 
2140     BLOCK_COMMENT("Entry:");
2141 
2142     const Register to        = c_rarg0;  // destination array address
2143     const Register value     = c_rarg1;  // value
2144     const Register count     = c_rarg2;  // elements count
2145 
2146     const Register bz_base   = x28;      // base for block_zero routine
2147     const Register cnt_words = x29;      // temp register
2148     const Register tmp_reg   = t1;
2149 
2150     __ enter();
2151 
2152     Label L_fill_elements;
2153 
2154     int shift = -1;
2155     switch (t) {
2156       case T_BYTE:
2157         shift = 0;
2158         // Short arrays (< 8 bytes) are filled element by element
2159         __ mv(tmp_reg, 8 >> shift);
2160         __ bltu(count, tmp_reg, L_fill_elements);
2161 
2162         // Zero extend value
2163         // 8 bit -> 16 bit
2164         __ zext(value, value, 8);
2165         __ slli(tmp_reg, value, 8);
2166         __ orr(value, value, tmp_reg);
2167 
2168         // 16 bit -> 32 bit
2169         __ slli(tmp_reg, value, 16);
2170         __ orr(value, value, tmp_reg);
2171         break;
2172       case T_SHORT:
2173         shift = 1;
2174         // Short arrays (< 8 bytes) are filled element by element
2175         __ mv(tmp_reg, 8 >> shift);
2176         __ bltu(count, tmp_reg, L_fill_elements);
2177 
2178         // Zero extend value
2179         // 16 bit -> 32 bit
2180         __ zext(value, value, 16);
2181         __ slli(tmp_reg, value, 16);
2182         __ orr(value, value, tmp_reg);
2183         break;
2184       case T_INT:
2185         shift = 2;
2186         // Short arrays (< 8 bytes) are filled element by element
2187         __ mv(tmp_reg, 8 >> shift);
2188         __ bltu(count, tmp_reg, L_fill_elements);
2189         break;
2190       default: ShouldNotReachHere();
2191     }
2192 
2193     // Align the destination address to an 8-byte boundary.
2194     Label L_skip_align1, L_skip_align2, L_skip_align4;
2195     if (!aligned) {
2196       switch (t) {
2197         case T_BYTE:
2198           // One byte misalignment happens only for byte arrays.
2199           __ test_bit(tmp_reg, to, 0);
2200           __ beqz(tmp_reg, L_skip_align1);
2201           __ sb(value, Address(to, 0));
2202           __ addi(to, to, 1);
2203           __ subiw(count, count, 1);
2204           __ bind(L_skip_align1);
2205           // Fallthrough
2206         case T_SHORT:
2207           // Two bytes misalignment happens only for byte and short (char) arrays.
2208           __ test_bit(tmp_reg, to, 1);
2209           __ beqz(tmp_reg, L_skip_align2);
2210           __ sh(value, Address(to, 0));
2211           __ addi(to, to, 2);
2212           __ subiw(count, count, 2 >> shift);
2213           __ bind(L_skip_align2);
2214           // Fallthrough
2215         case T_INT:
2216           // Align to 8 bytes, we know we are 4 byte aligned to start.
2217           __ test_bit(tmp_reg, to, 2);
2218           __ beqz(tmp_reg, L_skip_align4);
2219           __ sw(value, Address(to, 0));
2220           __ addi(to, to, 4);
2221           __ subiw(count, count, 4 >> shift);
2222           __ bind(L_skip_align4);
2223           break;
2224         default: ShouldNotReachHere();
2225       }
2226     }
2227 
2228     //
2229     //  Fill large chunks
2230     //
2231     __ srliw(cnt_words, count, 3 - shift); // number of words
2232 
2233     // 32 bit -> 64 bit
2234     __ zext(value, value, 32);
2235     __ slli(tmp_reg, value, 32);
2236     __ orr(value, value, tmp_reg);
2237 
2238     __ slli(tmp_reg, cnt_words, 3 - shift);
2239     __ subw(count, count, tmp_reg);
2240     {
2241       __ fill_words(to, cnt_words, value);
2242     }
2243 
2244     // Handle fills of fewer than 8 bytes.
2245     // Address may not be heapword aligned.
2246     Label L_fill_1, L_fill_2, L_exit;
2247     __ bind(L_fill_elements);
2248     switch (t) {
2249       case T_BYTE:
2250         __ test_bit(tmp_reg, count, 2);
2251         __ beqz(tmp_reg, L_fill_2);
2252         __ sb(value, Address(to, 0));
2253         __ sb(value, Address(to, 1));
2254         __ sb(value, Address(to, 2));
2255         __ sb(value, Address(to, 3));
2256         __ addi(to, to, 4);
2257 
2258         __ bind(L_fill_2);
2259         __ test_bit(tmp_reg, count, 1);
2260         __ beqz(tmp_reg, L_fill_1);
2261         __ sb(value, Address(to, 0));
2262         __ sb(value, Address(to, 1));
2263         __ addi(to, to, 2);
2264 
2265         __ bind(L_fill_1);
2266         __ test_bit(tmp_reg, count, 0);
2267         __ beqz(tmp_reg, L_exit);
2268         __ sb(value, Address(to, 0));
2269         break;
2270       case T_SHORT:
2271         __ test_bit(tmp_reg, count, 1);
2272         __ beqz(tmp_reg, L_fill_2);
2273         __ sh(value, Address(to, 0));
2274         __ sh(value, Address(to, 2));
2275         __ addi(to, to, 4);
2276 
2277         __ bind(L_fill_2);
2278         __ test_bit(tmp_reg, count, 0);
2279         __ beqz(tmp_reg, L_exit);
2280         __ sh(value, Address(to, 0));
2281         break;
2282       case T_INT:
2283         __ beqz(count, L_exit);
2284         __ sw(value, Address(to, 0));
2285         break;
2286       default: ShouldNotReachHere();
2287     }
2288     __ bind(L_exit);
2289     __ leave();
2290     __ ret();
2291 
2292     return start;
2293   }
2294 
2295   void generate_arraycopy_stubs() {
2296     address entry                     = nullptr;
2297     address entry_jbyte_arraycopy     = nullptr;
2298     address entry_jshort_arraycopy    = nullptr;
2299     address entry_jint_arraycopy      = nullptr;
2300     address entry_oop_arraycopy       = nullptr;
2301     address entry_jlong_arraycopy     = nullptr;
2302     address entry_checkcast_arraycopy = nullptr;
2303 
2304     generate_copy_longs(StubGenStubId::copy_byte_f_id, copy_f, c_rarg0, c_rarg1, t1);
2305     generate_copy_longs(StubGenStubId::copy_byte_b_id, copy_b, c_rarg0, c_rarg1, t1);
2306 
2307     address ucm_common_error_exit     = generate_unsafecopy_common_error_exit();
2308     UnsafeMemoryAccess::set_common_exit_stub_pc(ucm_common_error_exit);
2309 
2310     StubRoutines::riscv::_zero_blocks = generate_zero_blocks();
2311 
2312     //*** jbyte
2313     // Always need aligned and unaligned versions
2314     StubRoutines::_jbyte_disjoint_arraycopy          = generate_disjoint_copy(StubGenStubId::jbyte_disjoint_arraycopy_id, &entry);
2315     StubRoutines::_jbyte_arraycopy                   = generate_conjoint_copy(StubGenStubId::jbyte_arraycopy_id, entry, &entry_jbyte_arraycopy);
2316     StubRoutines::_arrayof_jbyte_disjoint_arraycopy  = generate_disjoint_copy(StubGenStubId::arrayof_jbyte_disjoint_arraycopy_id, &entry);
2317     StubRoutines::_arrayof_jbyte_arraycopy           = generate_conjoint_copy(StubGenStubId::arrayof_jbyte_arraycopy_id, entry, nullptr);
2318 
2319     //*** jshort
2320     // Always need aligned and unaligned versions
2321     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_copy(StubGenStubId::jshort_disjoint_arraycopy_id, &entry);
2322     StubRoutines::_jshort_arraycopy                  = generate_conjoint_copy(StubGenStubId::jshort_arraycopy_id, entry, &entry_jshort_arraycopy);
2323     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jshort_disjoint_arraycopy_id, &entry);
2324     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_copy(StubGenStubId::arrayof_jshort_arraycopy_id, entry, nullptr);
2325 
2326     //*** jint
2327     // Aligned versions
2328     StubRoutines::_arrayof_jint_disjoint_arraycopy   = generate_disjoint_copy(StubGenStubId::arrayof_jint_disjoint_arraycopy_id, &entry);
2329     StubRoutines::_arrayof_jint_arraycopy            = generate_conjoint_copy(StubGenStubId::arrayof_jint_arraycopy_id, entry, &entry_jint_arraycopy);
2330     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2331     // entry_jint_arraycopy always points to the unaligned version
2332     StubRoutines::_jint_disjoint_arraycopy           = generate_disjoint_copy(StubGenStubId::jint_disjoint_arraycopy_id, &entry);
2333     StubRoutines::_jint_arraycopy                  = generate_conjoint_copy(StubGenStubId::jint_arraycopy_id, entry, &entry_jint_arraycopy);
2334 
2335     //*** jlong
2336     // It is always aligned
2337     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jlong_disjoint_arraycopy_id, &entry);
2338     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_copy(StubGenStubId::arrayof_jlong_arraycopy_id, entry, &entry_jlong_arraycopy);
2339     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2340     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2341 
2342     //*** oops
2343     StubRoutines::_arrayof_oop_disjoint_arraycopy
2344       = generate_disjoint_copy(StubGenStubId::arrayof_oop_disjoint_arraycopy_id, &entry);
2345     StubRoutines::_arrayof_oop_arraycopy
2346       = generate_conjoint_copy(StubGenStubId::arrayof_oop_arraycopy_id, entry, &entry_oop_arraycopy);
2347     // Aligned versions without pre-barriers
2348     StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2349       = generate_disjoint_copy(StubGenStubId::arrayof_oop_disjoint_arraycopy_uninit_id, &entry);
2350     StubRoutines::_arrayof_oop_arraycopy_uninit
2351       = generate_conjoint_copy(StubGenStubId::arrayof_oop_arraycopy_uninit_id, entry, nullptr);
2352 
2353     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2354     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2355     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2356     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2357 
2358     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy(StubGenStubId::checkcast_arraycopy_id, &entry_checkcast_arraycopy);
2359     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubGenStubId::checkcast_arraycopy_uninit_id, nullptr);
2360 
2361 
2362     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy(entry_jbyte_arraycopy,
2363                                                               entry_jshort_arraycopy,
2364                                                               entry_jint_arraycopy,
2365                                                               entry_jlong_arraycopy);
2366 
2367     StubRoutines::_generic_arraycopy   = generate_generic_copy(entry_jbyte_arraycopy,
2368                                                                entry_jshort_arraycopy,
2369                                                                entry_jint_arraycopy,
2370                                                                entry_oop_arraycopy,
2371                                                                entry_jlong_arraycopy,
2372                                                                entry_checkcast_arraycopy);
2373 
2374     StubRoutines::_jbyte_fill = generate_fill(StubGenStubId::jbyte_fill_id);
2375     StubRoutines::_jshort_fill = generate_fill(StubGenStubId::jshort_fill_id);
2376     StubRoutines::_jint_fill = generate_fill(StubGenStubId::jint_fill_id);
2377     StubRoutines::_arrayof_jbyte_fill = generate_fill(StubGenStubId::arrayof_jbyte_fill_id);
2378     StubRoutines::_arrayof_jshort_fill = generate_fill(StubGenStubId::arrayof_jshort_fill_id);
2379     StubRoutines::_arrayof_jint_fill = generate_fill(StubGenStubId::arrayof_jint_fill_id);
2380 
2381     StubRoutines::_unsafe_setmemory    = generate_unsafe_setmemory();
2382   }
2383 
2384   void generate_aes_loadkeys(const Register &key, VectorRegister *working_vregs, int rounds) {
2385     const int step = 16;
2386     for (int i = 0; i < rounds; i++) {
2387       __ vle32_v(working_vregs[i], key);
2388       // The keys are stored as a little-endian array, while we need to
2389       // operate on them in big-endian, so perform an endian swap here
2390       // with the vrev8.v instruction.
2391       __ vrev8_v(working_vregs[i], working_vregs[i]);
2392       __ addi(key, key, step);
2393     }
2394   }
2395 
2396   void generate_aes_encrypt(const VectorRegister &res, VectorRegister *working_vregs, int rounds) {
2397     assert(rounds <= 15, "rounds should be less than or equal to working_vregs size");
2398 
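         // 'rounds' is the number of round keys (11/13/15 for AES-128/192/256):
         // the vxor below is the initial AddRoundKey, vaesem.vv performs the
         // middle rounds and vaesef.vv the final round (no MixColumns).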
2399     __ vxor_vv(res, res, working_vregs[0]);
2400     for (int i = 1; i < rounds - 1; i++) {
2401       __ vaesem_vv(res, working_vregs[i]);
2402     }
2403     __ vaesef_vv(res, working_vregs[rounds - 1]);
2404   }
2405 
2406   // Arguments:
2407   //
2408   // Inputs:
2409   //   c_rarg0   - source byte array address
2410   //   c_rarg1   - destination byte array address
2411   //   c_rarg2   - K (key) in little endian int array
2412   //
2413   address generate_aescrypt_encryptBlock() {
2414     assert(UseAESIntrinsics, "need AES instructions (Zvkned extension) support");
2415 
2416     __ align(CodeEntryAlignment);
2417     StubGenStubId stub_id = StubGenStubId::aescrypt_encryptBlock_id;
2418     StubCodeMark mark(this, stub_id);
2419 
2420     Label L_aes128, L_aes192;
2421 
2422     const Register from        = c_rarg0;  // source array address
2423     const Register to          = c_rarg1;  // destination array address
2424     const Register key         = c_rarg2;  // key array address
2425     const Register keylen      = c_rarg3;
2426 
2427     VectorRegister working_vregs[] = {
2428       v4, v5, v6, v7, v8, v9, v10, v11,
2429       v12, v13, v14, v15, v16, v17, v18
2430     };
2431     const VectorRegister res   = v19;
2432 
2433     address start = __ pc();
2434     __ enter();
2435 
2436     __ lwu(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2437 
2438     __ vsetivli(x0, 4, Assembler::e32, Assembler::m1);
2439     __ vle32_v(res, from);
2440 
2441     __ mv(t2, 52);
2442     __ blt(keylen, t2, L_aes128);
2443     __ beq(keylen, t2, L_aes192);
2444     // Otherwise we fall through to the largest case (256-bit key size)
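         // keylen is the expanded key length in ints: 44 for AES-128, 52 for
         // AES-192 and 60 for AES-256, hence the comparisons against 52 above.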
2445 
2446     // Note: the following function performs key += 15*16
2447     generate_aes_loadkeys(key, working_vregs, 15);
2448     generate_aes_encrypt(res, working_vregs, 15);
2449     __ vse32_v(res, to);
2450     __ mv(c_rarg0, 0);
2451     __ leave();
2452     __ ret();
2453 
2454   __ bind(L_aes192);
2455     // Note: the following function performs key += 13*16
2456     generate_aes_loadkeys(key, working_vregs, 13);
2457     generate_aes_encrypt(res, working_vregs, 13);
2458     __ vse32_v(res, to);
2459     __ mv(c_rarg0, 0);
2460     __ leave();
2461     __ ret();
2462 
2463   __ bind(L_aes128);
2464     // Note: the following function performs key += 11*16
2465     generate_aes_loadkeys(key, working_vregs, 11);
2466     generate_aes_encrypt(res, working_vregs, 11);
2467     __ vse32_v(res, to);
2468     __ mv(c_rarg0, 0);
2469     __ leave();
2470     __ ret();
2471 
2472     return start;
2473   }
2474 
2475   void generate_aes_decrypt(const VectorRegister &res, VectorRegister *working_vregs, int rounds) {
2476     assert(rounds <= 15, "rounds should be less than or equal to working_vregs size");
2477 
2478     __ vxor_vv(res, res, working_vregs[rounds - 1]);
2479     for (int i = rounds - 2; i > 0; i--) {
2480       __ vaesdm_vv(res, working_vregs[i]);
2481     }
2482     __ vaesdf_vv(res, working_vregs[0]);
2483   }
2484 
2485   // Arguments:
2486   //
2487   // Inputs:
2488   //   c_rarg0   - source byte array address
2489   //   c_rarg1   - destination byte array address
2490   //   c_rarg2   - K (key) in little endian int array
2491   //
2492   address generate_aescrypt_decryptBlock() {
2493     assert(UseAESIntrinsics, "need AES instructions (Zvkned extension) support");
2494 
2495     __ align(CodeEntryAlignment);
2496     StubGenStubId stub_id = StubGenStubId::aescrypt_decryptBlock_id;
2497     StubCodeMark mark(this, stub_id);
2498 
2499     Label L_aes128, L_aes192;
2500 
2501     const Register from        = c_rarg0;  // source array address
2502     const Register to          = c_rarg1;  // destination array address
2503     const Register key         = c_rarg2;  // key array address
2504     const Register keylen      = c_rarg3;
2505 
2506     VectorRegister working_vregs[] = {
2507       v4, v5, v6, v7, v8, v9, v10, v11,
2508       v12, v13, v14, v15, v16, v17, v18
2509     };
2510     const VectorRegister res   = v19;
2511 
2512     address start = __ pc();
2513     __ enter(); // required for proper stackwalking of RuntimeStub frame
2514 
2515     __ lwu(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2516 
2517     __ vsetivli(x0, 4, Assembler::e32, Assembler::m1);
2518     __ vle32_v(res, from);
2519 
2520     __ mv(t2, 52);
2521     __ blt(keylen, t2, L_aes128);
2522     __ beq(keylen, t2, L_aes192);
2523     // Otherwise we fall through to the largest case (256-bit key size)
2524 
2525     // Note: the following function performs key += 15*16
2526     generate_aes_loadkeys(key, working_vregs, 15);
2527     generate_aes_decrypt(res, working_vregs, 15);
2528     __ vse32_v(res, to);
2529     __ mv(c_rarg0, 0);
2530     __ leave();
2531     __ ret();
2532 
2533   __ bind(L_aes192);
2534     // Note: the following function performs key += 13*16
2535     generate_aes_loadkeys(key, working_vregs, 13);
2536     generate_aes_decrypt(res, working_vregs, 13);
2537     __ vse32_v(res, to);
2538     __ mv(c_rarg0, 0);
2539     __ leave();
2540     __ ret();
2541 
2542   __ bind(L_aes128);
2543     // Note: the following function performs key += 11*16
2544     generate_aes_loadkeys(key, working_vregs, 11);
2545     generate_aes_decrypt(res, working_vregs, 11);
2546     __ vse32_v(res, to);
2547     __ mv(c_rarg0, 0);
2548     __ leave();
2549     __ ret();
2550 
2551     return start;
2552   }
2553 
2554   // Code for comparing 8 characters of strings with Latin1 and UTF-16 encodings
2555   void compare_string_8_x_LU(Register tmpL, Register tmpU,
2556                              Register strL, Register strU, Label& DIFF) {
2557     const Register tmp = x30, tmpLval = x12;
2558 
2559     int base_offset = arrayOopDesc::base_offset_in_bytes(T_BYTE);
2560     assert((base_offset % (UseCompactObjectHeaders ? 4 :
2561                            (UseCompressedClassPointers ? 8 : 4))) == 0, "Must be");
2562 
2563 #ifdef ASSERT
2564     if (AvoidUnalignedAccesses) {
2565       Label align_ok;
2566       __ andi(t0, strL, 0x7);
2567       __ beqz(t0, align_ok);
2568       __ stop("bad alignment");
2569       __ bind(align_ok);
2570     }
2571 #endif
2572     __ ld(tmpLval, Address(strL));
2573     __ addi(strL, strL, wordSize);
2574 
2575     // compare first 4 characters
2576     __ load_long_misaligned(tmpU, Address(strU), tmp, (base_offset % 8) != 0 ? 4 : 8);
2577     __ addi(strU, strU, wordSize);
2578     __ inflate_lo32(tmpL, tmpLval);
2579     __ xorr(tmp, tmpU, tmpL);
2580     __ bnez(tmp, DIFF);
2581 
2582     // compare second 4 characters
2583     __ load_long_misaligned(tmpU, Address(strU), tmp, (base_offset % 8) != 0 ? 4 : 8);
2584     __ addi(strU, strU, wordSize);
2585     __ inflate_hi32(tmpL, tmpLval);
2586     __ xorr(tmp, tmpU, tmpL);
2587     __ bnez(tmp, DIFF);
2588   }
2589 
2590   // x10  = result
2591   // x11  = str1
2592   // x12  = cnt1
2593   // x13  = str2
2594   // x14  = cnt2
2595   // x28  = tmp1
2596   // x29  = tmp2
2597   // x30  = tmp3
2598   address generate_compare_long_string_different_encoding(StubGenStubId stub_id) {
2599     bool isLU;
2600     switch (stub_id) {
2601     case compare_long_string_LU_id:
2602       isLU = true;
2603       break;
2604     case compare_long_string_UL_id:
2605       isLU = false;
2606       break;
2607     default:
2608       ShouldNotReachHere();
2609     };
2610     __ align(CodeEntryAlignment);
2611     StubCodeMark mark(this, stub_id);
2612     address entry = __ pc();
2613     Label SMALL_LOOP, TAIL, LOAD_LAST, DONE, CALCULATE_DIFFERENCE;
2614     const Register result = x10, str1 = x11, str2 = x13, cnt2 = x14,
2615                    tmp1 = x28, tmp2 = x29, tmp3 = x30, tmp4 = x12;
2616 
2617     int base_offset = arrayOopDesc::base_offset_in_bytes(T_BYTE);
2618     assert((base_offset % (UseCompactObjectHeaders ? 4 :
2619                            (UseCompressedClassPointers ? 8 : 4))) == 0, "Must be");
2620 
2621     Register strU = isLU ? str2 : str1,
2622              strL = isLU ? str1 : str2,
2623              tmpU = isLU ? tmp2 : tmp1, // where to keep U for comparison
2624              tmpL = isLU ? tmp1 : tmp2; // where to keep L for comparison
2625 
2626     if (AvoidUnalignedAccesses && (base_offset % 8) != 0) {
2627       // Load 4 bytes from strL to make sure main loop is 8-byte aligned
2628       // cnt2 is >= 68 here, no need to check it for >= 0
2629       __ lwu(tmpL, Address(strL));
2630       __ addi(strL, strL, wordSize / 2);
2631       __ load_long_misaligned(tmpU, Address(strU), tmp4, (base_offset % 8) != 0 ? 4 : 8);
2632       __ addi(strU, strU, wordSize);
2633       __ inflate_lo32(tmp3, tmpL);
2634       __ mv(tmpL, tmp3);
2635       __ xorr(tmp3, tmpU, tmpL);
2636       __ bnez(tmp3, CALCULATE_DIFFERENCE);
2637       __ subi(cnt2, cnt2, wordSize / 2);
2638     }
2639 
2640     // strL is now 8-byte aligned when AvoidUnalignedAccesses is true
2641     __ subi(cnt2, cnt2, wordSize * 2);
2642     __ bltz(cnt2, TAIL);
2643     __ bind(SMALL_LOOP); // smaller loop
2644       __ subi(cnt2, cnt2, wordSize * 2);
2645       compare_string_8_x_LU(tmpL, tmpU, strL, strU, CALCULATE_DIFFERENCE);
2646       compare_string_8_x_LU(tmpL, tmpU, strL, strU, CALCULATE_DIFFERENCE);
2647       __ bgez(cnt2, SMALL_LOOP);
2648       __ addi(t0, cnt2, wordSize * 2);
2649       __ beqz(t0, DONE);
2650     __ bind(TAIL);  // 1..15 characters left
2651       // Aligned access. Load bytes in portions - 4, 2, 1.
2652 
2653       __ addi(t0, cnt2, wordSize);
2654       __ addi(cnt2, cnt2, wordSize * 2); // amount of characters left to process
2655       __ bltz(t0, LOAD_LAST);
2656       // at least 8 characters remain, so we can do one compare_string_8_x_LU
2657       compare_string_8_x_LU(tmpL, tmpU, strL, strU, CALCULATE_DIFFERENCE);
2658       __ subi(cnt2, cnt2, wordSize);
2659       __ beqz(cnt2, DONE);  // no character left
2660       __ bind(LOAD_LAST);   // cnt2 = 1..7 characters left
2661 
2662       __ subi(cnt2, cnt2, wordSize); // cnt2 is now an offset in strL which points to last 8 bytes
2663       __ slli(t0, cnt2, 1);     // t0 is now an offset in strU which points to last 16 bytes
2664       __ add(strL, strL, cnt2); // Address of last 8 bytes in Latin1 string
2665       __ add(strU, strU, t0);   // Address of last 16 bytes in UTF-16 string
2666       __ load_int_misaligned(tmpL, Address(strL), t0, false);
2667       __ load_long_misaligned(tmpU, Address(strU), t0, 2);
2668       __ inflate_lo32(tmp3, tmpL);
2669       __ mv(tmpL, tmp3);
2670       __ xorr(tmp3, tmpU, tmpL);
2671       __ bnez(tmp3, CALCULATE_DIFFERENCE);
2672 
2673       __ addi(strL, strL, wordSize / 2); // Address of last 4 bytes in Latin1 string
2674       __ addi(strU, strU, wordSize);   // Address of last 8 bytes in UTF-16 string
2675       __ load_int_misaligned(tmpL, Address(strL), t0, false);
2676       __ load_long_misaligned(tmpU, Address(strU), t0, 2);
2677       __ inflate_lo32(tmp3, tmpL);
2678       __ mv(tmpL, tmp3);
2679       __ xorr(tmp3, tmpU, tmpL);
2680       __ bnez(tmp3, CALCULATE_DIFFERENCE);
2681       __ j(DONE); // no character left
2682 
2683       // Find the first different characters in the longwords and
2684       // compute their difference.
2685     __ bind(CALCULATE_DIFFERENCE);
2686       // count bits of trailing zero chars
2687       __ ctzc_bits(tmp4, tmp3);
2688       __ srl(tmp1, tmp1, tmp4);
2689       __ srl(tmp2, tmp2, tmp4);
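           // Zero-extend both characters to 16 bits and subtract: the result is the
           // signed difference of the first mismatching UTF-16 code units.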
2690       __ zext(tmp1, tmp1, 16);
2691       __ zext(tmp2, tmp2, 16);
2692       __ sub(result, tmp1, tmp2);
2693     __ bind(DONE);
2694       __ ret();
2695     return entry;
2696   }
2697 
2698   address generate_method_entry_barrier() {
2699     __ align(CodeEntryAlignment);
2700     StubGenStubId stub_id = StubGenStubId::method_entry_barrier_id;
2701     StubCodeMark mark(this, stub_id);
2702 
2703     Label deoptimize_label;
2704 
2705     address start = __ pc();
2706 
2707     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
2708 
2709     if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
2710       BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
2711       Address thread_epoch_addr(xthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
2712       __ la(t1, ExternalAddress(bs_asm->patching_epoch_addr()));
2713       __ lwu(t1, t1);
2714       __ sw(t1, thread_epoch_addr);
2715       // There are two ways this can work:
2716       // - The writer did system icache shootdown after the instruction stream update.
2717       //   Hence do nothing.
2718       //   - The writer trusts us to make sure our icache is in sync before entering.
2719       //   Hence use cmodx fence (fence.i, may change).
2720       if (UseCtxFencei) {
2721         __ cmodx_fence();
2722       }
2723       __ membar(__ LoadLoad);
2724     }
2725 
2726     __ set_last_Java_frame(sp, fp, ra);
2727 
2728     __ enter();
2729     __ addi(t1, sp, wordSize);
2730 
2731     __ subi(sp, sp, 4 * wordSize);
2732 
2733     __ push_call_clobbered_registers();
2734 
2735     __ mv(c_rarg0, t1);
2736     __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
2737 
2738     __ reset_last_Java_frame(true);
2739 
2740     __ mv(t0, x10);
2741 
2742     __ pop_call_clobbered_registers();
2743 
2744     __ bnez(t0, deoptimize_label);
2745 
2746     __ leave();
2747     __ ret();
2748 
2749     __ BIND(deoptimize_label);
2750 
2751     __ ld(t0, Address(sp, 0));
2752     __ ld(fp, Address(sp, wordSize));
2753     __ ld(ra, Address(sp, wordSize * 2));
2754     __ ld(t1, Address(sp, wordSize * 3));
2755 
2756     __ mv(sp, t0);
2757     __ jr(t1);
2758 
2759     return start;
2760   }
2761 
2762   // x10  = result
2763   // x11  = str1
2764   // x12  = cnt1
2765   // x13  = str2
2766   // x14  = cnt2
2767   // x28  = tmp1
2768   // x29  = tmp2
2769   // x30  = tmp3
2770   // x31  = tmp4
2771   address generate_compare_long_string_same_encoding(StubGenStubId stub_id) {
2772     bool isLL;
2773     switch (stub_id) {
2774     case compare_long_string_LL_id:
2775       isLL = true;
2776       break;
2777     case compare_long_string_UU_id:
2778       isLL = false;
2779       break;
2780     default:
2781       ShouldNotReachHere();
2782     };
2783     __ align(CodeEntryAlignment);
2784     StubCodeMark mark(this, stub_id);
2785     address entry = __ pc();
2786     Label SMALL_LOOP, CHECK_LAST, DIFF2, TAIL,
2787           LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF;
2788     const Register result = x10, str1 = x11, cnt1 = x12, str2 = x13, cnt2 = x14,
2789                    tmp1 = x28, tmp2 = x29, tmp3 = x30, tmp4 = x7, tmp5 = x31;
2790     RegSet spilled_regs = RegSet::of(tmp4, tmp5);
2791 
2792     // cnt1/cnt2 contain the number of characters to compare; cnt1 can be re-used.
2793     // Update the cnt2 counter to account for the 8 bytes already loaded.
2794     __ subi(cnt2, cnt2, wordSize / (isLL ? 1 : 2));
2795     // update pointers, because of previous read
2796     __ addi(str1, str1, wordSize);
2797     __ addi(str2, str2, wordSize);
2798     // less than 16 bytes left?
2799     __ subi(cnt2, cnt2, isLL ? 16 : 8);
2800     __ push_reg(spilled_regs, sp);
2801     __ bltz(cnt2, TAIL);
2802     __ bind(SMALL_LOOP);
2803       // compare 16 bytes of strings with same encoding
2804       __ ld(tmp5, Address(str1));
2805       __ addi(str1, str1, 8);
2806       __ xorr(tmp4, tmp1, tmp2);
2807       __ ld(cnt1, Address(str2));
2808       __ addi(str2, str2, 8);
2809       __ bnez(tmp4, DIFF);
2810       __ ld(tmp1, Address(str1));
2811       __ addi(str1, str1, 8);
2812       __ xorr(tmp4, tmp5, cnt1);
2813       __ ld(tmp2, Address(str2));
2814       __ addi(str2, str2, 8);
2815       __ bnez(tmp4, DIFF2);
2816 
2817       __ subi(cnt2, cnt2, isLL ? 16 : 8);
2818       __ bgez(cnt2, SMALL_LOOP);
2819     __ bind(TAIL);
2820       __ addi(cnt2, cnt2, isLL ? 16 : 8);
2821       __ beqz(cnt2, LAST_CHECK_AND_LENGTH_DIFF);
2822       __ subi(cnt2, cnt2, isLL ? 8 : 4);
2823       __ blez(cnt2, CHECK_LAST);
2824       __ xorr(tmp4, tmp1, tmp2);
2825       __ bnez(tmp4, DIFF);
2826       __ ld(tmp1, Address(str1));
2827       __ addi(str1, str1, 8);
2828       __ ld(tmp2, Address(str2));
2829       __ addi(str2, str2, 8);
2830       __ subi(cnt2, cnt2, isLL ? 8 : 4);
2831     __ bind(CHECK_LAST);
2832       if (!isLL) {
2833         __ add(cnt2, cnt2, cnt2); // now in bytes
2834       }
2835       __ xorr(tmp4, tmp1, tmp2);
2836       __ bnez(tmp4, DIFF);
2837       __ add(str1, str1, cnt2);
2838       __ load_long_misaligned(tmp5, Address(str1), tmp3, isLL ? 1 : 2);
2839       __ add(str2, str2, cnt2);
2840       __ load_long_misaligned(cnt1, Address(str2), tmp3, isLL ? 1 : 2);
2841       __ xorr(tmp4, tmp5, cnt1);
2842       __ beqz(tmp4, LENGTH_DIFF);
2843       // Find the first different characters in the longwords and
2844       // compute their difference.
2845     __ bind(DIFF2);
2846       // count bits of trailing zero chars
2847       __ ctzc_bits(tmp3, tmp4, isLL);
2848       __ srl(tmp5, tmp5, tmp3);
2849       __ srl(cnt1, cnt1, tmp3);
2850       if (isLL) {
2851         __ zext(tmp5, tmp5, 8);
2852         __ zext(cnt1, cnt1, 8);
2853       } else {
2854         __ zext(tmp5, tmp5, 16);
2855         __ zext(cnt1, cnt1, 16);
2856       }
2857       __ sub(result, tmp5, cnt1);
2858       __ j(LENGTH_DIFF);
2859     __ bind(DIFF);
2860       // count bits of trailing zero chars
2861       __ ctzc_bits(tmp3, tmp4, isLL);
2862       __ srl(tmp1, tmp1, tmp3);
2863       __ srl(tmp2, tmp2, tmp3);
2864       if (isLL) {
2865         __ zext(tmp1, tmp1, 8);
2866         __ zext(tmp2, tmp2, 8);
2867       } else {
2868         __ zext(tmp1, tmp1, 16);
2869         __ zext(tmp2, tmp2, 16);
2870       }
2871       __ sub(result, tmp1, tmp2);
2872       __ j(LENGTH_DIFF);
2873     __ bind(LAST_CHECK_AND_LENGTH_DIFF);
2874       __ xorr(tmp4, tmp1, tmp2);
2875       __ bnez(tmp4, DIFF);
2876     __ bind(LENGTH_DIFF);
2877       __ pop_reg(spilled_regs, sp);
2878       __ ret();
2879     return entry;
2880   }
2881 
2882   void generate_compare_long_strings() {
2883     StubRoutines::riscv::_compare_long_string_LL = generate_compare_long_string_same_encoding(StubGenStubId::compare_long_string_LL_id);
2884     StubRoutines::riscv::_compare_long_string_UU = generate_compare_long_string_same_encoding(StubGenStubId::compare_long_string_UU_id);
2885     StubRoutines::riscv::_compare_long_string_LU = generate_compare_long_string_different_encoding(StubGenStubId::compare_long_string_LU_id);
2886     StubRoutines::riscv::_compare_long_string_UL = generate_compare_long_string_different_encoding(StubGenStubId::compare_long_string_UL_id);
2887   }
2888 
2889   // x10 result
2890   // x11 src
2891   // x12 src count
2892   // x13 pattern
2893   // x14 pattern count
2894   address generate_string_indexof_linear(StubGenStubId stub_id)
2895   {
2896     bool needle_isL;
2897     bool haystack_isL;
2898     switch (stub_id) {
2899     case string_indexof_linear_ll_id:
2900       needle_isL = true;
2901       haystack_isL = true;
2902       break;
2903     case string_indexof_linear_ul_id:
2904       needle_isL = true;
2905       haystack_isL = false;
2906       break;
2907     case string_indexof_linear_uu_id:
2908       needle_isL = false;
2909       haystack_isL = false;
2910       break;
2911     default:
2912       ShouldNotReachHere();
2913     };
2914 
2915     __ align(CodeEntryAlignment);
2916     StubCodeMark mark(this, stub_id);
2917     address entry = __ pc();
2918 
2919     int needle_chr_size = needle_isL ? 1 : 2;
2920     int haystack_chr_size = haystack_isL ? 1 : 2;
2921     int needle_chr_shift = needle_isL ? 0 : 1;
2922     int haystack_chr_shift = haystack_isL ? 0 : 1;
2923     bool isL = needle_isL && haystack_isL;
2924     // parameters
2925     Register result = x10, haystack = x11, haystack_len = x12, needle = x13, needle_len = x14;
2926     // temporary registers
2927     Register mask1 = x20, match_mask = x21, first = x22, trailing_zeros = x23, mask2 = x24, tmp = x25;
2928     // redefinitions
2929     Register ch1 = x28, ch2 = x29;
2930     RegSet spilled_regs = RegSet::range(x20, x25) + RegSet::range(x28, x29);
2931 
2932     __ push_reg(spilled_regs, sp);
2933 
2934     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
2935           L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
2936           L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
2937           L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
2938           L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
2939           L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
2940 
2941     __ ld(ch1, Address(needle));
2942     __ ld(ch2, Address(haystack));
2943     // src.length - pattern.length
2944     __ sub(haystack_len, haystack_len, needle_len);
2945 
2946     // first is needle[0]
2947     __ zext(first, ch1, needle_isL ? 8 : 16);
2948 
2949     uint64_t mask0101 = UCONST64(0x0101010101010101);
2950     uint64_t mask0001 = UCONST64(0x0001000100010001);
2951     __ mv(mask1, haystack_isL ? mask0101 : mask0001);
2952     __ mul(first, first, mask1);
2953     uint64_t mask7f7f = UCONST64(0x7f7f7f7f7f7f7f7f);
2954     uint64_t mask7fff = UCONST64(0x7fff7fff7fff7fff);
2955     __ mv(mask2, haystack_isL ? mask7f7f : mask7fff);
2956     if (needle_isL != haystack_isL) {
2957       __ mv(tmp, ch1);
2958     }
2959     __ subi(haystack_len, haystack_len, wordSize / haystack_chr_size - 1);
2960     __ blez(haystack_len, L_SMALL);
2961 
2962     if (needle_isL != haystack_isL) {
2963       __ inflate_lo32(ch1, tmp, match_mask, trailing_zeros);
2964     }
2965     // xorr, sub, orr, notr, andr
    // compare and set match_mask[i] to 0x80/0x8000 (Latin1/UTF16) if ch2[i] == first[i]
    // e.g.:
2968     // first:        aa aa aa aa aa aa aa aa
2969     // ch2:          aa aa li nx jd ka aa aa
2970     // match_mask:   80 80 00 00 00 00 80 80
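    //
    // A rough scalar C sketch of this SWAR zero-match trick (illustrative only;
    // the Latin1 masks are shown, the UTF16 case uses the 0x0001/0x7fff patterns):
    //   uint64_t x    = ch2 ^ first;                    // 0x00 in every matching byte lane
    //   uint64_t mask = (x - 0x0101010101010101ULL)     // matching lanes wrap and set bit 7
    //                 & ~(x | 0x7f7f7f7f7f7f7f7fULL);   // == ~x & 0x8080...: keep true zero lanes only
    //   // mask holds 0x80 in each byte lane where ch2[i] == first[i], 0x00 elsewhere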
2971     __ compute_match_mask(ch2, first, match_mask, mask1, mask2);
2972 
2973     // search first char of needle, if success, goto L_HAS_ZERO;
2974     __ bnez(match_mask, L_HAS_ZERO);
2975     __ subi(haystack_len, haystack_len, wordSize / haystack_chr_size);
2976     __ addi(result, result, wordSize / haystack_chr_size);
2977     __ addi(haystack, haystack, wordSize);
2978     __ bltz(haystack_len, L_POST_LOOP);
2979 
2980     __ bind(L_LOOP);
2981     __ ld(ch2, Address(haystack));
2982     __ compute_match_mask(ch2, first, match_mask, mask1, mask2);
2983     __ bnez(match_mask, L_HAS_ZERO);
2984 
2985     __ bind(L_LOOP_PROCEED);
2986     __ subi(haystack_len, haystack_len, wordSize / haystack_chr_size);
2987     __ addi(haystack, haystack, wordSize);
2988     __ addi(result, result, wordSize / haystack_chr_size);
2989     __ bgez(haystack_len, L_LOOP);
2990 
2991     __ bind(L_POST_LOOP);
2992     __ mv(ch2, -wordSize / haystack_chr_size);
2993     __ ble(haystack_len, ch2, NOMATCH); // no extra characters to check
2994     __ ld(ch2, Address(haystack));
2995     __ slli(haystack_len, haystack_len, LogBitsPerByte + haystack_chr_shift);
2996     __ neg(haystack_len, haystack_len);
2997     __ xorr(ch2, first, ch2);
2998     __ sub(match_mask, ch2, mask1);
2999     __ orr(ch2, ch2, mask2);
3000     __ mv(trailing_zeros, -1); // all bits set
3001     __ j(L_SMALL_PROCEED);
3002 
3003     __ align(OptoLoopAlignment);
3004     __ bind(L_SMALL);
3005     __ slli(haystack_len, haystack_len, LogBitsPerByte + haystack_chr_shift);
3006     __ neg(haystack_len, haystack_len);
3007     if (needle_isL != haystack_isL) {
3008       __ inflate_lo32(ch1, tmp, match_mask, trailing_zeros);
3009     }
3010     __ xorr(ch2, first, ch2);
3011     __ sub(match_mask, ch2, mask1);
3012     __ orr(ch2, ch2, mask2);
3013     __ mv(trailing_zeros, -1); // all bits set
3014 
3015     __ bind(L_SMALL_PROCEED);
3016     __ srl(trailing_zeros, trailing_zeros, haystack_len); // mask. zeroes on useless bits.
3017     __ notr(ch2, ch2);
3018     __ andr(match_mask, match_mask, ch2);
3019     __ andr(match_mask, match_mask, trailing_zeros); // clear useless bits and check
3020     __ beqz(match_mask, NOMATCH);
3021 
3022     __ bind(L_SMALL_HAS_ZERO_LOOP);
3023     // count bits of trailing zero chars
3024     __ ctzc_bits(trailing_zeros, match_mask, haystack_isL, ch2, tmp);
3025     __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15);
3026     __ mv(ch2, wordSize / haystack_chr_size);
3027     __ ble(needle_len, ch2, L_SMALL_CMP_LOOP_LAST_CMP2);
3028     __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL);
3029     __ mv(trailing_zeros, wordSize / haystack_chr_size);
3030     __ bne(ch1, ch2, L_SMALL_CMP_LOOP_NOMATCH);
3031 
3032     __ bind(L_SMALL_CMP_LOOP);
3033     __ shadd(first, trailing_zeros, needle, first, needle_chr_shift);
3034     __ shadd(ch2, trailing_zeros, haystack, ch2, haystack_chr_shift);
3035     needle_isL ? __ lbu(first, Address(first)) : __ lhu(first, Address(first));
3036     haystack_isL ? __ lbu(ch2, Address(ch2)) : __ lhu(ch2, Address(ch2));
3037     __ addi(trailing_zeros, trailing_zeros, 1);
3038     __ bge(trailing_zeros, needle_len, L_SMALL_CMP_LOOP_LAST_CMP);
3039     __ beq(first, ch2, L_SMALL_CMP_LOOP);
3040 
3041     __ bind(L_SMALL_CMP_LOOP_NOMATCH);
3042     __ beqz(match_mask, NOMATCH);
3043     // count bits of trailing zero chars
3044     __ ctzc_bits(trailing_zeros, match_mask, haystack_isL, tmp, ch2);
3045     __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15);
3046     __ addi(result, result, 1);
3047     __ addi(haystack, haystack, haystack_chr_size);
3048     __ j(L_SMALL_HAS_ZERO_LOOP);
3049 
3050     __ align(OptoLoopAlignment);
3051     __ bind(L_SMALL_CMP_LOOP_LAST_CMP);
3052     __ bne(first, ch2, L_SMALL_CMP_LOOP_NOMATCH);
3053     __ j(DONE);
3054 
3055     __ align(OptoLoopAlignment);
3056     __ bind(L_SMALL_CMP_LOOP_LAST_CMP2);
3057     __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL);
3058     __ bne(ch1, ch2, L_SMALL_CMP_LOOP_NOMATCH);
3059     __ j(DONE);
3060 
3061     __ align(OptoLoopAlignment);
3062     __ bind(L_HAS_ZERO);
3063     // count bits of trailing zero chars
3064     __ ctzc_bits(trailing_zeros, match_mask, haystack_isL, tmp, ch2);
3065     __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15);
3066     __ slli(needle_len, needle_len, BitsPerByte * wordSize / 2);
3067     __ orr(haystack_len, haystack_len, needle_len); // restore needle_len(32bits)
3068     __ subi(result, result, 1); // array index from 0, so result -= 1
3069 
3070     __ bind(L_HAS_ZERO_LOOP);
3071     __ mv(needle_len, wordSize / haystack_chr_size);
3072     __ srli(ch2, haystack_len, BitsPerByte * wordSize / 2);
3073     __ bge(needle_len, ch2, L_CMP_LOOP_LAST_CMP2);
3074     // load next 8 bytes from haystack, and increase result index
3075     __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL);
3076     __ addi(result, result, 1);
3077     __ mv(trailing_zeros, wordSize / haystack_chr_size);
3078     __ bne(ch1, ch2, L_CMP_LOOP_NOMATCH);
3079 
3080     // compare one char
3081     __ bind(L_CMP_LOOP);
3082     __ shadd(needle_len, trailing_zeros, needle, needle_len, needle_chr_shift);
3083     needle_isL ? __ lbu(needle_len, Address(needle_len)) : __ lhu(needle_len, Address(needle_len));
3084     __ shadd(ch2, trailing_zeros, haystack, ch2, haystack_chr_shift);
3085     haystack_isL ? __ lbu(ch2, Address(ch2)) : __ lhu(ch2, Address(ch2));
3086     __ addi(trailing_zeros, trailing_zeros, 1); // next char index
3087     __ srli(tmp, haystack_len, BitsPerByte * wordSize / 2);
3088     __ bge(trailing_zeros, tmp, L_CMP_LOOP_LAST_CMP);
3089     __ beq(needle_len, ch2, L_CMP_LOOP);
3090 
3091     __ bind(L_CMP_LOOP_NOMATCH);
3092     __ beqz(match_mask, L_HAS_ZERO_LOOP_NOMATCH);
3093     // count bits of trailing zero chars
3094     __ ctzc_bits(trailing_zeros, match_mask, haystack_isL, needle_len, ch2);
3095     __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15);
3096     __ addi(haystack, haystack, haystack_chr_size);
3097     __ j(L_HAS_ZERO_LOOP);
3098 
3099     __ align(OptoLoopAlignment);
3100     __ bind(L_CMP_LOOP_LAST_CMP);
3101     __ bne(needle_len, ch2, L_CMP_LOOP_NOMATCH);
3102     __ j(DONE);
3103 
3104     __ align(OptoLoopAlignment);
3105     __ bind(L_CMP_LOOP_LAST_CMP2);
3106     __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL);
3107     __ addi(result, result, 1);
3108     __ bne(ch1, ch2, L_CMP_LOOP_NOMATCH);
3109     __ j(DONE);
3110 
3111     __ align(OptoLoopAlignment);
3112     __ bind(L_HAS_ZERO_LOOP_NOMATCH);
    // 1) Restore the "result" index. The index was wordSize/str2_chr_size * N until
    // the L_HAS_ZERO block. The byte octet was analyzed in L_HAS_ZERO_LOOP, so
    // result was increased by at most wordSize/str2_chr_size - 1 and the respective
    // high bits weren't changed. L_LOOP_PROCEED will increase result by the number
    // of analyzed characters, so we can just reset the lower bits of result here:
    // clear the 2 lower bits for UU/UL and the 3 lower bits for LL.
    // 2) Restore the needle_len and haystack_len values from the "compressed" haystack_len.
    // 3) Advance haystack so that it represents the next haystack octet. result & 7/3
    // is the index of the last analyzed substring inside the current octet, so
    // haystack is rewound to the respective octet start address here and then
    // advanced to the next octet by L_LOOP_PROCEED.
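    //
    // Roughly, in scalar terms (illustrative only):
    //   match_mask   = result & (wordSize/chr_size - 1);  // chars analyzed inside this octet
    //   result      &= ~(wordSize/chr_size - 1);          // back to the octet-aligned index
    //   haystack    -= match_mask << chr_shift;           // back to the octet start address
    //   needle_len   = haystack_len >> 32;                // unpack needle_len
    //   haystack_len = (int32_t)haystack_len;             // unpack haystack_len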
3123     __ andi(match_mask, result, wordSize / haystack_chr_size - 1);
3124     __ srli(needle_len, haystack_len, BitsPerByte * wordSize / 2);
3125     __ andi(result, result, haystack_isL ? -8 : -4);
3126     __ slli(tmp, match_mask, haystack_chr_shift);
3127     __ sub(haystack, haystack, tmp);
3128     __ sext(haystack_len, haystack_len, 32);
3129     __ j(L_LOOP_PROCEED);
3130 
3131     __ align(OptoLoopAlignment);
3132     __ bind(NOMATCH);
3133     __ mv(result, -1);
3134 
3135     __ bind(DONE);
3136     __ pop_reg(spilled_regs, sp);
3137     __ ret();
3138     return entry;
3139   }
3140 
3141   void generate_string_indexof_stubs()
3142   {
3143     StubRoutines::riscv::_string_indexof_linear_ll = generate_string_indexof_linear(StubGenStubId::string_indexof_linear_ll_id);
3144     StubRoutines::riscv::_string_indexof_linear_uu = generate_string_indexof_linear(StubGenStubId::string_indexof_linear_uu_id);
3145     StubRoutines::riscv::_string_indexof_linear_ul = generate_string_indexof_linear(StubGenStubId::string_indexof_linear_ul_id);
3146   }
3147 
3148 #ifdef COMPILER2
3149   void generate_lookup_secondary_supers_table_stub() {
3150     StubGenStubId stub_id = StubGenStubId::lookup_secondary_supers_table_id;
3151     StubCodeMark mark(this, stub_id);
3152 
3153     const Register
3154       r_super_klass  = x10,
3155       r_array_base   = x11,
3156       r_array_length = x12,
3157       r_array_index  = x13,
3158       r_sub_klass    = x14,
3159       result         = x15,
3160       r_bitmap       = x16;
3161 
3162     for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
3163       StubRoutines::_lookup_secondary_supers_table_stubs[slot] = __ pc();
3164       Label L_success;
3165       __ enter();
3166       __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass, result,
3167                                              r_array_base, r_array_length, r_array_index,
3168                                              r_bitmap, slot, /*stub_is_near*/true);
3169       __ leave();
3170       __ ret();
3171     }
3172   }
3173 
3174   // Slow path implementation for UseSecondarySupersTable.
3175   address generate_lookup_secondary_supers_table_slow_path_stub() {
3176     StubGenStubId stub_id = StubGenStubId::lookup_secondary_supers_table_slow_path_id;
3177     StubCodeMark mark(this, stub_id);
3178 
3179     address start = __ pc();
3180     const Register
3181       r_super_klass  = x10,        // argument
3182       r_array_base   = x11,        // argument
3183       temp1          = x12,        // tmp
3184       r_array_index  = x13,        // argument
3185       result         = x15,        // argument
3186       r_bitmap       = x16;        // argument
3187 
3188 
3189     __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, result, temp1);
3190     __ ret();
3191 
3192     return start;
3193   }
3194 
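  // Scalar sketch of the operation this intrinsic accelerates (assumed to follow
  // java.math.BigInteger::mulAdd; illustrative only): multiply the 32-bit limbs of
  // 'in' by the single word 'k' and accumulate into 'out', returning the carry.
  //   uint64_t carry = 0;
  //   for (int j = len - 1; j >= 0; j--) {
  //     uint64_t p = (uint64_t)(uint32_t)in[j] * (uint32_t)k
  //                + (uint32_t)out[offset] + carry;
  //     out[offset--] = (uint32_t)p;
  //     carry = p >> 32;
  //   }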
3195   address generate_mulAdd()
3196   {
3197     __ align(CodeEntryAlignment);
3198     StubGenStubId stub_id = StubGenStubId::mulAdd_id;
3199     StubCodeMark mark(this, stub_id);
3200 
3201     address entry = __ pc();
3202 
3203     const Register out     = x10;
3204     const Register in      = x11;
3205     const Register offset  = x12;
3206     const Register len     = x13;
3207     const Register k       = x14;
3208     const Register tmp     = x28;
3209 
3210     BLOCK_COMMENT("Entry:");
3211     __ enter();
3212     __ mul_add(out, in, offset, len, k, tmp);
3213     __ leave();
3214     __ ret();
3215 
3216     return entry;
3217   }
3218 
3219   /**
3220    *  Arguments:
3221    *
3222    *  Input:
3223    *    c_rarg0   - x address
3224    *    c_rarg1   - x length
3225    *    c_rarg2   - y address
3226    *    c_rarg3   - y length
3227    *    c_rarg4   - z address
3228    */
3229   address generate_multiplyToLen()
3230   {
3231     __ align(CodeEntryAlignment);
3232     StubGenStubId stub_id = StubGenStubId::multiplyToLen_id;
3233     StubCodeMark mark(this, stub_id);
3234     address entry = __ pc();
3235 
3236     const Register x     = x10;
3237     const Register xlen  = x11;
3238     const Register y     = x12;
3239     const Register ylen  = x13;
3240     const Register z     = x14;
3241 
3242     const Register tmp0  = x15;
3243     const Register tmp1  = x16;
3244     const Register tmp2  = x17;
3245     const Register tmp3  = x7;
3246     const Register tmp4  = x28;
3247     const Register tmp5  = x29;
3248     const Register tmp6  = x30;
3249     const Register tmp7  = x31;
3250 
3251     BLOCK_COMMENT("Entry:");
3252     __ enter(); // required for proper stackwalking of RuntimeStub frame
3253     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3254     __ leave(); // required for proper stackwalking of RuntimeStub frame
3255     __ ret();
3256 
3257     return entry;
3258   }
3259 
3260   address generate_squareToLen()
3261   {
3262     __ align(CodeEntryAlignment);
3263     StubGenStubId stub_id = StubGenStubId::squareToLen_id;
3264     StubCodeMark mark(this, stub_id);
3265     address entry = __ pc();
3266 
3267     const Register x     = x10;
3268     const Register xlen  = x11;
3269     const Register z     = x12;
3270     const Register y     = x14; // == x
3271     const Register ylen  = x15; // == xlen
3272 
3273     const Register tmp0  = x13; // zlen, unused
3274     const Register tmp1  = x16;
3275     const Register tmp2  = x17;
3276     const Register tmp3  = x7;
3277     const Register tmp4  = x28;
3278     const Register tmp5  = x29;
3279     const Register tmp6  = x30;
3280     const Register tmp7  = x31;
3281 
3282     BLOCK_COMMENT("Entry:");
3283     __ enter();
3284     __ mv(y, x);
3285     __ mv(ylen, xlen);
3286     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3287     __ leave();
3288     __ ret();
3289 
3290     return entry;
3291   }
3292 
3293   // Arguments:
3294   //
3295   // Input:
3296   //   c_rarg0   - newArr address
3297   //   c_rarg1   - oldArr address
3298   //   c_rarg2   - newIdx
3299   //   c_rarg3   - shiftCount
3300   //   c_rarg4   - numIter
3301   //
3302   address generate_bigIntegerLeftShift() {
3303     __ align(CodeEntryAlignment);
3304     StubGenStubId stub_id = StubGenStubId::bigIntegerLeftShiftWorker_id;
3305     StubCodeMark mark(this, stub_id);
3306     address entry = __ pc();
3307 
3308     Label loop, exit;
3309 
3310     Register newArr        = c_rarg0;
3311     Register oldArr        = c_rarg1;
3312     Register newIdx        = c_rarg2;
3313     Register shiftCount    = c_rarg3;
3314     Register numIter       = c_rarg4;
3315 
3316     Register shiftRevCount = c_rarg5;
3317     Register oldArrNext    = t1;
3318 
3319     __ beqz(numIter, exit);
3320     __ shadd(newArr, newIdx, newArr, t0, 2);
3321 
3322     __ mv(shiftRevCount, 32);
3323     __ sub(shiftRevCount, shiftRevCount, shiftCount);
3324 
3325     __ bind(loop);
3326     __ addi(oldArrNext, oldArr, 4);
3327     __ vsetvli(t0, numIter, Assembler::e32, Assembler::m4);
3328     __ vle32_v(v0, oldArr);
3329     __ vle32_v(v4, oldArrNext);
3330     __ vsll_vx(v0, v0, shiftCount);
3331     __ vsrl_vx(v4, v4, shiftRevCount);
3332     __ vor_vv(v0, v0, v4);
3333     __ vse32_v(v0, newArr);
3334     __ sub(numIter, numIter, t0);
3335     __ shadd(oldArr, t0, oldArr, t1, 2);
3336     __ shadd(newArr, t0, newArr, t1, 2);
3337     __ bnez(numIter, loop);
3338 
3339     __ bind(exit);
3340     __ ret();
3341 
3342     return entry;
3343   }
3344 
3345   // Arguments:
3346   //
3347   // Input:
3348   //   c_rarg0   - newArr address
3349   //   c_rarg1   - oldArr address
3350   //   c_rarg2   - newIdx
3351   //   c_rarg3   - shiftCount
3352   //   c_rarg4   - numIter
3353   //
3354   address generate_bigIntegerRightShift() {
3355     __ align(CodeEntryAlignment);
3356     StubGenStubId stub_id = StubGenStubId::bigIntegerRightShiftWorker_id;
3357     StubCodeMark mark(this, stub_id);
3358     address entry = __ pc();
3359 
3360     Label loop, exit;
3361 
3362     Register newArr        = c_rarg0;
3363     Register oldArr        = c_rarg1;
3364     Register newIdx        = c_rarg2;
3365     Register shiftCount    = c_rarg3;
3366     Register numIter       = c_rarg4;
3367     Register idx           = numIter;
3368 
3369     Register shiftRevCount = c_rarg5;
3370     Register oldArrNext    = c_rarg6;
3371     Register newArrCur     = t0;
3372     Register oldArrCur     = t1;
3373 
3374     __ beqz(idx, exit);
3375     __ shadd(newArr, newIdx, newArr, t0, 2);
3376 
3377     __ mv(shiftRevCount, 32);
3378     __ sub(shiftRevCount, shiftRevCount, shiftCount);
3379 
3380     __ bind(loop);
3381     __ vsetvli(t0, idx, Assembler::e32, Assembler::m4);
3382     __ sub(idx, idx, t0);
3383     __ shadd(oldArrNext, idx, oldArr, t1, 2);
3384     __ shadd(newArrCur, idx, newArr, t1, 2);
3385     __ addi(oldArrCur, oldArrNext, 4);
3386     __ vle32_v(v0, oldArrCur);
3387     __ vle32_v(v4, oldArrNext);
3388     __ vsrl_vx(v0, v0, shiftCount);
3389     __ vsll_vx(v4, v4, shiftRevCount);
3390     __ vor_vv(v0, v0, v4);
3391     __ vse32_v(v0, newArrCur);
3392     __ bnez(idx, loop);
3393 
3394     __ bind(exit);
3395     __ ret();
3396 
3397     return entry;
3398   }
3399 #endif
3400 
3401 #ifdef COMPILER2
3402   class MontgomeryMultiplyGenerator : public MacroAssembler {
3403 
3404     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
3405       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2, Ri, Rj;
3406 
3407     RegSet _toSave;
3408     bool _squaring;
3409 
3410   public:
3411     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
3412       : MacroAssembler(as->code()), _squaring(squaring) {
3413 
3414       // Register allocation
3415 
3416       RegSetIterator<Register> regs = RegSet::range(x10, x26).begin();
3417       Pa_base = *regs;       // Argument registers
3418       if (squaring) {
3419         Pb_base = Pa_base;
3420       } else {
3421         Pb_base = *++regs;
3422       }
3423       Pn_base = *++regs;
3424       Rlen= *++regs;
3425       inv = *++regs;
3426       Pm_base = *++regs;
3427 
3428                         // Working registers:
3429       Ra =  *++regs;    // The current digit of a, b, n, and m.
3430       Rb =  *++regs;
3431       Rm =  *++regs;
3432       Rn =  *++regs;
3433 
3434       Pa =  *++regs;      // Pointers to the current/next digit of a, b, n, and m.
3435       Pb =  *++regs;
3436       Pm =  *++regs;
3437       Pn =  *++regs;
3438 
3439       tmp0 =  *++regs;    // Three registers which form a
      tmp1 =  *++regs;    // triple-precision accumulator.
3441       tmp2 =  *++regs;
3442 
3443       Ri =  x6;         // Inner and outer loop indexes.
3444       Rj =  x7;
3445 
3446       Rhi_ab = x28;     // Product registers: low and high parts
3447       Rlo_ab = x29;     // of a*b and m*n.
3448       Rhi_mn = x30;
3449       Rlo_mn = x31;
3450 
3451       // x18 and up are callee-saved.
3452       _toSave = RegSet::range(x18, *regs) + Pm_base;
3453     }
3454 
3455   private:
3456     void save_regs() {
3457       push_reg(_toSave, sp);
3458     }
3459 
3460     void restore_regs() {
3461       pop_reg(_toSave, sp);
3462     }
3463 
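    // Emits 'block' twice and loops so that it executes exactly 'count' times
    // (count may be zero or odd): a two-way unroll with a peeled entry for odd counts.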
3464     template <typename T>
3465     void unroll_2(Register count, T block) {
3466       Label loop, end, odd;
3467       beqz(count, end);
3468       test_bit(t0, count, 0);
3469       bnez(t0, odd);
3470       align(16);
3471       bind(loop);
3472       (this->*block)();
3473       bind(odd);
3474       (this->*block)();
3475       subi(count, count, 2);
3476       bgtz(count, loop);
3477       bind(end);
3478     }
3479 
3480     template <typename T>
3481     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
3482       Label loop, end, odd;
3483       beqz(count, end);
3484       test_bit(tmp, count, 0);
3485       bnez(tmp, odd);
3486       align(16);
3487       bind(loop);
3488       (this->*block)(d, s, tmp);
3489       bind(odd);
3490       (this->*block)(d, s, tmp);
3491       subi(count, count, 2);
3492       bgtz(count, loop);
3493       bind(end);
3494     }
3495 
3496     void pre1(RegisterOrConstant i) {
3497       block_comment("pre1");
3498       // Pa = Pa_base;
3499       // Pb = Pb_base + i;
3500       // Pm = Pm_base;
3501       // Pn = Pn_base + i;
3502       // Ra = *Pa;
3503       // Rb = *Pb;
3504       // Rm = *Pm;
3505       // Rn = *Pn;
3506       if (i.is_register()) {
3507         slli(t0, i.as_register(), LogBytesPerWord);
3508       } else {
3509         mv(t0, i.as_constant());
3510         slli(t0, t0, LogBytesPerWord);
3511       }
3512 
3513       mv(Pa, Pa_base);
3514       add(Pb, Pb_base, t0);
3515       mv(Pm, Pm_base);
3516       add(Pn, Pn_base, t0);
3517 
3518       ld(Ra, Address(Pa));
3519       ld(Rb, Address(Pb));
3520       ld(Rm, Address(Pm));
3521       ld(Rn, Address(Pn));
3522 
3523       // Zero the m*n result.
3524       mv(Rhi_mn, zr);
3525       mv(Rlo_mn, zr);
3526     }
3527 
3528     // The core multiply-accumulate step of a Montgomery
3529     // multiplication.  The idea is to schedule operations as a
3530     // pipeline so that instructions with long latencies (loads and
3531     // multiplies) have time to complete before their results are
3532     // used.  This most benefits in-order implementations of the
3533     // architecture but out-of-order ones also benefit.
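    //
    // The MACC(A, B, t0, t1, t2) pseudo-op used in the comments below stands,
    // roughly, for accumulating the full 128-bit product into the
    // triple-precision accumulator t2:t1:t0 (see acc()):
    //   (Rhi:Rlo) = A * B;        // mulhu/mul
    //   t0 += Rlo;                // cad  -> carry
    //   t1 += Rhi + carry;        // cadc -> carry
    //   t2 += carry;              // adc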
3534     void step() {
3535       block_comment("step");
3536       // MACC(Ra, Rb, tmp0, tmp1, tmp2);
3537       // Ra = *++Pa;
3538       // Rb = *--Pb;
3539       mulhu(Rhi_ab, Ra, Rb);
3540       mul(Rlo_ab, Ra, Rb);
3541       addi(Pa, Pa, wordSize);
3542       ld(Ra, Address(Pa));
3543       subi(Pb, Pb, wordSize);
3544       ld(Rb, Address(Pb));
3545       acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2); // The pending m*n from the
3546                                             // previous iteration.
3547       // MACC(Rm, Rn, tmp0, tmp1, tmp2);
3548       // Rm = *++Pm;
3549       // Rn = *--Pn;
3550       mulhu(Rhi_mn, Rm, Rn);
3551       mul(Rlo_mn, Rm, Rn);
3552       addi(Pm, Pm, wordSize);
3553       ld(Rm, Address(Pm));
3554       subi(Pn, Pn, wordSize);
3555       ld(Rn, Address(Pn));
3556       acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2);
3557     }
3558 
3559     void post1() {
3560       block_comment("post1");
3561 
3562       // MACC(Ra, Rb, tmp0, tmp1, tmp2);
3563       // Ra = *++Pa;
3564       // Rb = *--Pb;
3565       mulhu(Rhi_ab, Ra, Rb);
3566       mul(Rlo_ab, Ra, Rb);
3567       acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2);  // The pending m*n
3568       acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2);
3569 
3570       // *Pm = Rm = tmp0 * inv;
3571       mul(Rm, tmp0, inv);
3572       sd(Rm, Address(Pm));
3573 
3574       // MACC(Rm, Rn, tmp0, tmp1, tmp2);
3575       // tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0;
3576       mulhu(Rhi_mn, Rm, Rn);
3577 
3578 #ifndef PRODUCT
3579       // assert(m[i] * n[0] + tmp0 == 0, "broken Montgomery multiply");
3580       {
3581         mul(Rlo_mn, Rm, Rn);
3582         add(Rlo_mn, tmp0, Rlo_mn);
3583         Label ok;
3584         beqz(Rlo_mn, ok);
3585         stop("broken Montgomery multiply");
3586         bind(ok);
3587       }
3588 #endif
3589       // We have very carefully set things up so that
3590       // m[i]*n[0] + tmp0 == 0 (mod b), so we don't have to calculate
3591       // the lower half of Rm * Rn because we know the result already:
3592       // it must be -tmp0.  tmp0 + (-tmp0) must generate a carry iff
      // tmp0 != 0.  So, rather than do a mul and a cad we just set
3594       // the carry flag iff tmp0 is nonzero.
3595       //
3596       // mul(Rlo_mn, Rm, Rn);
3597       // cad(zr, tmp0, Rlo_mn);
3598       subi(t0, tmp0, 1);
3599       sltu(t0, t0, tmp0); // Set carry iff tmp0 is nonzero
3600       cadc(tmp0, tmp1, Rhi_mn, t0);
3601       adc(tmp1, tmp2, zr, t0);
3602       mv(tmp2, zr);
3603     }
3604 
3605     void pre2(Register i, Register len) {
3606       block_comment("pre2");
3607       // Pa = Pa_base + i-len;
3608       // Pb = Pb_base + len;
3609       // Pm = Pm_base + i-len;
3610       // Pn = Pn_base + len;
3611 
3612       sub(Rj, i, len);
3613       // Rj == i-len
3614 
3615       // Ra as temp register
3616       slli(Ra, Rj, LogBytesPerWord);
3617       add(Pa, Pa_base, Ra);
3618       add(Pm, Pm_base, Ra);
3619       slli(Ra, len, LogBytesPerWord);
3620       add(Pb, Pb_base, Ra);
3621       add(Pn, Pn_base, Ra);
3622 
3623       // Ra = *++Pa;
3624       // Rb = *--Pb;
3625       // Rm = *++Pm;
3626       // Rn = *--Pn;
3627       addi(Pa, Pa, wordSize);
3628       ld(Ra, Address(Pa));
3629       subi(Pb, Pb, wordSize);
3630       ld(Rb, Address(Pb));
3631       addi(Pm, Pm, wordSize);
3632       ld(Rm, Address(Pm));
3633       subi(Pn, Pn, wordSize);
3634       ld(Rn, Address(Pn));
3635 
3636       mv(Rhi_mn, zr);
3637       mv(Rlo_mn, zr);
3638     }
3639 
3640     void post2(Register i, Register len) {
3641       block_comment("post2");
3642       sub(Rj, i, len);
3643 
3644       cad(tmp0, tmp0, Rlo_mn, t0); // The pending m*n, low part
3645 
3646       // As soon as we know the least significant digit of our result,
3647       // store it.
3648       // Pm_base[i-len] = tmp0;
3649       // Rj as temp register
3650       slli(Rj, Rj, LogBytesPerWord);
3651       add(Rj, Pm_base, Rj);
3652       sd(tmp0, Address(Rj));
3653 
3654       // tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0;
3655       cadc(tmp0, tmp1, Rhi_mn, t0); // The pending m*n, high part
3656       adc(tmp1, tmp2, zr, t0);
3657       mv(tmp2, zr);
3658     }
3659 
3660     // A carry in tmp0 after Montgomery multiplication means that we
3661     // should subtract multiples of n from our result in m.  We'll
3662     // keep doing that until there is no carry.
3663     void normalize(Register len) {
3664       block_comment("normalize");
3665       // while (tmp0)
3666       //   tmp0 = sub(Pm_base, Pn_base, tmp0, len);
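      // where sub(m, n, carry, len) subtracts n from m word by word with borrow
      // propagation and returns carry minus the borrow out of the top word,
      // which becomes the new tmp0 (a rough reading of the loop below).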
3667       Label loop, post, again;
3668       Register cnt = tmp1, i = tmp2; // Re-use registers; we're done with them now
3669       beqz(tmp0, post); {
3670         bind(again); {
3671           mv(i, zr);
3672           mv(cnt, len);
3673           slli(Rn, i, LogBytesPerWord);
3674           add(Rm, Pm_base, Rn);
3675           ld(Rm, Address(Rm));
3676           add(Rn, Pn_base, Rn);
3677           ld(Rn, Address(Rn));
3678           mv(t0, 1); // set carry flag, i.e. no borrow
3679           align(16);
3680           bind(loop); {
3681             notr(Rn, Rn);
3682             add(Rm, Rm, t0);
3683             add(Rm, Rm, Rn);
3684             sltu(t0, Rm, Rn);
3685             slli(Rn, i, LogBytesPerWord); // Rn as temp register
3686             add(Rn, Pm_base, Rn);
3687             sd(Rm, Address(Rn));
3688             addi(i, i, 1);
3689             slli(Rn, i, LogBytesPerWord);
3690             add(Rm, Pm_base, Rn);
3691             ld(Rm, Address(Rm));
3692             add(Rn, Pn_base, Rn);
3693             ld(Rn, Address(Rn));
3694             subi(cnt, cnt, 1);
3695           } bnez(cnt, loop);
3696           subi(tmp0, tmp0, 1);
3697           add(tmp0, tmp0, t0);
3698         } bnez(tmp0, again);
3699       } bind(post);
3700     }
3701 
3702     // Move memory at s to d, reversing words.
3703     //    Increments d to end of copied memory
3704     //    Destroys tmp1, tmp2
3705     //    Preserves len
3706     //    Leaves s pointing to the address which was in d at start
3707     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
3708       assert(tmp1->encoding() < x28->encoding(), "register corruption");
3709       assert(tmp2->encoding() < x28->encoding(), "register corruption");
3710 
3711       shadd(s, len, s, tmp1, LogBytesPerWord);
3712       mv(tmp1, len);
3713       unroll_2(tmp1,  &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
3714       slli(tmp1, len, LogBytesPerWord);
3715       sub(s, d, tmp1);
3716     }
3717     // [63...0] -> [31...0][63...32]
3718     void reverse1(Register d, Register s, Register tmp) {
3719       subi(s, s, wordSize);
3720       ld(tmp, Address(s));
3721       ror(tmp, tmp, 32, t0);
3722       sd(tmp, Address(d));
3723       addi(d, d, wordSize);
3724     }
3725 
3726     void step_squaring() {
3727       // An extra ACC
3728       step();
3729       acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2);
3730     }
3731 
3732     void last_squaring(Register i) {
3733       Label dont;
3734       // if ((i & 1) == 0) {
3735       test_bit(t0, i, 0);
3736       bnez(t0, dont); {
3737         // MACC(Ra, Rb, tmp0, tmp1, tmp2);
3738         // Ra = *++Pa;
3739         // Rb = *--Pb;
3740         mulhu(Rhi_ab, Ra, Rb);
3741         mul(Rlo_ab, Ra, Rb);
3742         acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2);
3743       } bind(dont);
3744     }
3745 
3746     void extra_step_squaring() {
3747       acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2);  // The pending m*n
3748 
3749       // MACC(Rm, Rn, tmp0, tmp1, tmp2);
3750       // Rm = *++Pm;
3751       // Rn = *--Pn;
3752       mulhu(Rhi_mn, Rm, Rn);
3753       mul(Rlo_mn, Rm, Rn);
3754       addi(Pm, Pm, wordSize);
3755       ld(Rm, Address(Pm));
3756       subi(Pn, Pn, wordSize);
3757       ld(Rn, Address(Pn));
3758     }
3759 
3760     void post1_squaring() {
3761       acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2);  // The pending m*n
3762 
3763       // *Pm = Rm = tmp0 * inv;
3764       mul(Rm, tmp0, inv);
3765       sd(Rm, Address(Pm));
3766 
3767       // MACC(Rm, Rn, tmp0, tmp1, tmp2);
3768       // tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0;
3769       mulhu(Rhi_mn, Rm, Rn);
3770 
3771 #ifndef PRODUCT
3772       // assert(m[i] * n[0] + tmp0 == 0, "broken Montgomery multiply");
3773       {
3774         mul(Rlo_mn, Rm, Rn);
3775         add(Rlo_mn, tmp0, Rlo_mn);
3776         Label ok;
3777         beqz(Rlo_mn, ok); {
3778           stop("broken Montgomery multiply");
3779         } bind(ok);
3780       }
3781 #endif
3782       // We have very carefully set things up so that
3783       // m[i]*n[0] + tmp0 == 0 (mod b), so we don't have to calculate
3784       // the lower half of Rm * Rn because we know the result already:
3785       // it must be -tmp0.  tmp0 + (-tmp0) must generate a carry iff
3786       // tmp0 != 0.  So, rather than do a mul and a cad we just set
3787       // the carry flag iff tmp0 is nonzero.
3788       //
3789       // mul(Rlo_mn, Rm, Rn);
      // cad(zr, tmp0, Rlo_mn);
3791       subi(t0, tmp0, 1);
3792       sltu(t0, t0, tmp0); // Set carry iff tmp0 is nonzero
3793       cadc(tmp0, tmp1, Rhi_mn, t0);
3794       adc(tmp1, tmp2, zr, t0);
3795       mv(tmp2, zr);
3796     }
3797 
3798     // use t0 as carry
3799     void acc(Register Rhi, Register Rlo,
3800              Register tmp0, Register tmp1, Register tmp2) {
3801       cad(tmp0, tmp0, Rlo, t0);
3802       cadc(tmp1, tmp1, Rhi, t0);
3803       adc(tmp2, tmp2, zr, t0);
3804     }
3805 
3806   public:
3807     /**
3808      * Fast Montgomery multiplication.  The derivation of the
3809      * algorithm is in A Cryptographic Library for the Motorola
3810      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
3811      *
3812      * Arguments:
3813      *
3814      * Inputs for multiplication:
3815      *   c_rarg0   - int array elements a
3816      *   c_rarg1   - int array elements b
3817      *   c_rarg2   - int array elements n (the modulus)
3818      *   c_rarg3   - int length
3819      *   c_rarg4   - int inv
3820      *   c_rarg5   - int array elements m (the result)
3821      *
3822      * Inputs for squaring:
3823      *   c_rarg0   - int array elements a
3824      *   c_rarg1   - int array elements n (the modulus)
3825      *   c_rarg2   - int length
3826      *   c_rarg3   - int inv
3827      *   c_rarg4   - int array elements m (the result)
3828      *
3829      */
3830     address generate_multiply() {
3831       Label argh, nothing;
3832       bind(argh);
3833       stop("MontgomeryMultiply total_allocation must be <= 8192");
3834 
3835       align(CodeEntryAlignment);
3836       address entry = pc();
3837 
3838       beqz(Rlen, nothing);
3839 
3840       enter();
3841 
3842       // Make room.
3843       mv(Ra, 512);
3844       bgt(Rlen, Ra, argh);
3845       slli(Ra, Rlen, exact_log2(4 * sizeof(jint)));
3846       sub(Ra, sp, Ra);
3847       andi(sp, Ra, -2 * wordSize);
3848 
3849       srliw(Rlen, Rlen, 1);  // length in longwords = len/2
3850 
3851       {
3852         // Copy input args, reversing as we go.  We use Ra as a
3853         // temporary variable.
3854         reverse(Ra, Pa_base, Rlen, Ri, Rj);
3855         if (!_squaring)
3856           reverse(Ra, Pb_base, Rlen, Ri, Rj);
3857         reverse(Ra, Pn_base, Rlen, Ri, Rj);
3858       }
3859 
3860       // Push all call-saved registers and also Pm_base which we'll need
3861       // at the end.
3862       save_regs();
3863 
3864 #ifndef PRODUCT
3865       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
3866       {
3867         ld(Rn, Address(Pn_base));
3868         mul(Rlo_mn, Rn, inv);
3869         mv(t0, -1);
3870         Label ok;
3871         beq(Rlo_mn, t0, ok);
3872         stop("broken inverse in Montgomery multiply");
3873         bind(ok);
3874       }
3875 #endif
3876 
3877       mv(Pm_base, Ra);
3878 
3879       mv(tmp0, zr);
3880       mv(tmp1, zr);
3881       mv(tmp2, zr);
3882 
3883       block_comment("for (int i = 0; i < len; i++) {");
3884       mv(Ri, zr); {
3885         Label loop, end;
3886         bge(Ri, Rlen, end);
3887 
3888         bind(loop);
3889         pre1(Ri);
3890 
3891         block_comment("  for (j = i; j; j--) {"); {
3892           mv(Rj, Ri);
3893           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
3894         } block_comment("  } // j");
3895 
3896         post1();
3897         addiw(Ri, Ri, 1);
3898         blt(Ri, Rlen, loop);
3899         bind(end);
3900         block_comment("} // i");
3901       }
3902 
3903       block_comment("for (int i = len; i < 2*len; i++) {");
3904       mv(Ri, Rlen); {
3905         Label loop, end;
3906         slli(t0, Rlen, 1);
3907         bge(Ri, t0, end);
3908 
3909         bind(loop);
3910         pre2(Ri, Rlen);
3911 
3912         block_comment("  for (j = len*2-i-1; j; j--) {"); {
3913           slliw(Rj, Rlen, 1);
3914           subw(Rj, Rj, Ri);
3915           subiw(Rj, Rj, 1);
3916           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
3917         } block_comment("  } // j");
3918 
3919         post2(Ri, Rlen);
3920         addiw(Ri, Ri, 1);
3921         slli(t0, Rlen, 1);
3922         blt(Ri, t0, loop);
3923         bind(end);
3924       }
3925       block_comment("} // i");
3926 
3927       normalize(Rlen);
3928 
3929       mv(Ra, Pm_base);  // Save Pm_base in Ra
3930       restore_regs();  // Restore caller's Pm_base
3931 
3932       // Copy our result into caller's Pm_base
3933       reverse(Pm_base, Ra, Rlen, Ri, Rj);
3934 
3935       leave();
3936       bind(nothing);
3937       ret();
3938 
3939       return entry;
3940     }
3941 
3942     /**
3943      *
3944      * Arguments:
3945      *
3946      * Inputs:
3947      *   c_rarg0   - int array elements a
3948      *   c_rarg1   - int array elements n (the modulus)
3949      *   c_rarg2   - int length
3950      *   c_rarg3   - int inv
3951      *   c_rarg4   - int array elements m (the result)
3952      *
3953      */
3954     address generate_square() {
3955       Label argh;
3956       bind(argh);
3957       stop("MontgomeryMultiply total_allocation must be <= 8192");
3958 
3959       align(CodeEntryAlignment);
3960       address entry = pc();
3961 
3962       enter();
3963 
3964       // Make room.
3965       mv(Ra, 512);
3966       bgt(Rlen, Ra, argh);
3967       slli(Ra, Rlen, exact_log2(4 * sizeof(jint)));
3968       sub(Ra, sp, Ra);
3969       andi(sp, Ra, -2 * wordSize);
3970 
3971       srliw(Rlen, Rlen, 1);  // length in longwords = len/2
3972 
3973       {
3974         // Copy input args, reversing as we go.  We use Ra as a
3975         // temporary variable.
3976         reverse(Ra, Pa_base, Rlen, Ri, Rj);
3977         reverse(Ra, Pn_base, Rlen, Ri, Rj);
3978       }
3979 
3980       // Push all call-saved registers and also Pm_base which we'll need
3981       // at the end.
3982       save_regs();
3983 
3984       mv(Pm_base, Ra);
3985 
3986       mv(tmp0, zr);
3987       mv(tmp1, zr);
3988       mv(tmp2, zr);
3989 
3990       block_comment("for (int i = 0; i < len; i++) {");
3991       mv(Ri, zr); {
3992         Label loop, end;
3993         bind(loop);
3994         bge(Ri, Rlen, end);
3995 
3996         pre1(Ri);
3997 
3998         block_comment("for (j = (i+1)/2; j; j--) {"); {
3999           addi(Rj, Ri, 1);
4000           srliw(Rj, Rj, 1);
4001           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
4002         } block_comment("  } // j");
4003 
4004         last_squaring(Ri);
4005 
4006         block_comment("  for (j = i/2; j; j--) {"); {
4007           srliw(Rj, Ri, 1);
4008           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
4009         } block_comment("  } // j");
4010 
4011         post1_squaring();
4012         addi(Ri, Ri, 1);
4013         blt(Ri, Rlen, loop);
4014 
4015         bind(end);
4016         block_comment("} // i");
4017       }
4018 
4019       block_comment("for (int i = len; i < 2*len; i++) {");
4020       mv(Ri, Rlen); {
4021         Label loop, end;
4022         bind(loop);
4023         slli(t0, Rlen, 1);
4024         bge(Ri, t0, end);
4025 
4026         pre2(Ri, Rlen);
4027 
4028         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
4029           slli(Rj, Rlen, 1);
4030           sub(Rj, Rj, Ri);
4031           subi(Rj, Rj, 1);
4032           srliw(Rj, Rj, 1);
4033           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
4034         } block_comment("  } // j");
4035 
4036         last_squaring(Ri);
4037 
4038         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
4039           slli(Rj, Rlen, 1);
4040           sub(Rj, Rj, Ri);
4041           srliw(Rj, Rj, 1);
4042           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
4043         } block_comment("  } // j");
4044 
4045         post2(Ri, Rlen);
4046         addi(Ri, Ri, 1);
4047         slli(t0, Rlen, 1);
4048         blt(Ri, t0, loop);
4049 
4050         bind(end);
4051         block_comment("} // i");
4052       }
4053 
4054       normalize(Rlen);
4055 
4056       mv(Ra, Pm_base);  // Save Pm_base in Ra
4057       restore_regs();  // Restore caller's Pm_base
4058 
4059       // Copy our result into caller's Pm_base
4060       reverse(Pm_base, Ra, Rlen, Ri, Rj);
4061 
4062       leave();
4063       ret();
4064 
4065       return entry;
4066     }
4067   };
4068 
4069 #endif // COMPILER2
4070 
4071   address generate_cont_thaw(Continuation::thaw_kind kind) {
4072     bool return_barrier = Continuation::is_thaw_return_barrier(kind);
4073     bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);
4074 
4075     address start = __ pc();
4076 
4077     if (return_barrier) {
4078       __ ld(sp, Address(xthread, JavaThread::cont_entry_offset()));
4079     }
4080 
4081 #ifndef PRODUCT
4082     {
4083       Label OK;
4084       __ ld(t0, Address(xthread, JavaThread::cont_entry_offset()));
4085       __ beq(sp, t0, OK);
4086       __ stop("incorrect sp");
4087       __ bind(OK);
4088     }
4089 #endif
4090 
4091     if (return_barrier) {
4092       // preserve possible return value from a method returning to the return barrier
4093       __ subi(sp, sp, 2 * wordSize);
4094       __ fsd(f10, Address(sp, 0 * wordSize));
4095       __ sd(x10, Address(sp, 1 * wordSize));
4096     }
4097 
4098     __ mv(c_rarg1, (return_barrier ? 1 : 0));
4099     __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), xthread, c_rarg1);
4100     __ mv(t1, x10); // x10 contains the size of the frames to thaw, 0 if overflow or no more frames
4101 
4102     if (return_barrier) {
4103       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
4104       __ ld(x10, Address(sp, 1 * wordSize));
4105       __ fld(f10, Address(sp, 0 * wordSize));
4106       __ addi(sp, sp, 2 * wordSize);
4107     }
4108 
4109 #ifndef PRODUCT
4110     {
4111       Label OK;
4112       __ ld(t0, Address(xthread, JavaThread::cont_entry_offset()));
4113       __ beq(sp, t0, OK);
4114       __ stop("incorrect sp");
4115       __ bind(OK);
4116     }
4117 #endif
4118 
4119     Label thaw_success;
4120     // t1 contains the size of the frames to thaw, 0 if overflow or no more frames
4121     __ bnez(t1, thaw_success);
4122     __ j(RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry()));
4123     __ bind(thaw_success);
4124 
4125     // make room for the thawed frames
4126     __ sub(t0, sp, t1);
4127     __ andi(sp, t0, -16); // align
4128 
4129     if (return_barrier) {
4130       // save original return value -- again
4131       __ subi(sp, sp, 2 * wordSize);
4132       __ fsd(f10, Address(sp, 0 * wordSize));
4133       __ sd(x10, Address(sp, 1 * wordSize));
4134     }
4135 
4136     // If we want, we can templatize thaw by kind, and have three different entries
4137     __ mv(c_rarg1, kind);
4138 
4139     __ call_VM_leaf(Continuation::thaw_entry(), xthread, c_rarg1);
4140     __ mv(t1, x10); // x10 is the sp of the yielding frame
4141 
4142     if (return_barrier) {
4143       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
4144       __ ld(x10, Address(sp, 1 * wordSize));
4145       __ fld(f10, Address(sp, 0 * wordSize));
4146       __ addi(sp, sp, 2 * wordSize);
4147     } else {
4148       __ mv(x10, zr); // return 0 (success) from doYield
4149     }
4150 
4151     // we're now on the yield frame (which is in an address above us b/c sp has been pushed down)
4152     __ mv(fp, t1);
4153     __ subi(sp, t1, 2 * wordSize); // now pointing to fp spill
4154 
4155     if (return_barrier_exception) {
4156       __ ld(c_rarg1, Address(fp, -1 * wordSize)); // return address
4157       __ verify_oop(x10);
      __ mv(x9, x10); // save return value containing the exception oop in callee-saved x9
4159 
4160       __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), xthread, c_rarg1);
4161 
4162       // see OptoRuntime::generate_exception_blob: x10 -- exception oop, x13 -- exception pc
4163 
4164       __ mv(x11, x10); // the exception handler
      __ mv(x10, x9); // restore return value containing the exception oop
4166       __ verify_oop(x10);
4167 
4168       __ leave();
4169       __ mv(x13, ra);
4170       __ jr(x11); // the exception handler
4171     } else {
4172       // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
4173       __ leave();
4174       __ ret();
4175     }
4176 
4177     return start;
4178   }
4179 
4180   address generate_cont_thaw() {
4181     if (!Continuations::enabled()) return nullptr;
4182 
4183     StubGenStubId stub_id = StubGenStubId::cont_thaw_id;
4184     StubCodeMark mark(this, stub_id);
4185     address start = __ pc();
4186     generate_cont_thaw(Continuation::thaw_top);
4187     return start;
4188   }
4189 
4190   address generate_cont_returnBarrier() {
4191     if (!Continuations::enabled()) return nullptr;
4192 
4193     // TODO: will probably need multiple return barriers depending on return type
4194     StubGenStubId stub_id = StubGenStubId::cont_returnBarrier_id;
4195     StubCodeMark mark(this, stub_id);
4196     address start = __ pc();
4197 
4198     generate_cont_thaw(Continuation::thaw_return_barrier);
4199 
4200     return start;
4201   }
4202 
4203   address generate_cont_returnBarrier_exception() {
4204     if (!Continuations::enabled()) return nullptr;
4205 
4206     StubGenStubId stub_id = StubGenStubId::cont_returnBarrierExc_id;
4207     StubCodeMark mark(this, stub_id);
4208     address start = __ pc();
4209 
4210     generate_cont_thaw(Continuation::thaw_return_barrier_exception);
4211 
4212     return start;
4213   }
4214 
4215   address generate_cont_preempt_stub() {
4216     if (!Continuations::enabled()) return nullptr;
4217     StubGenStubId stub_id = StubGenStubId::cont_preempt_id;
4218     StubCodeMark mark(this, stub_id);
4219     address start = __ pc();
4220 
4221     __ reset_last_Java_frame(true);
4222 
4223     // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap.
4224     __ ld(sp, Address(xthread, JavaThread::cont_entry_offset()));
4225 
4226     Label preemption_cancelled;
4227     __ lbu(t0, Address(xthread, JavaThread::preemption_cancelled_offset()));
4228     __ bnez(t0, preemption_cancelled);
4229 
4230     // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount.
4231     SharedRuntime::continuation_enter_cleanup(_masm);
4232     __ leave();
4233     __ ret();
4234 
4235     // We acquired the monitor after freezing the frames so call thaw to continue execution.
4236     __ bind(preemption_cancelled);
4237     __ sb(zr, Address(xthread, JavaThread::preemption_cancelled_offset()));
4238     __ la(fp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size() + 2 * wordSize)));
4239     __ la(t1, ExternalAddress(ContinuationEntry::thaw_call_pc_address()));
4240     __ ld(t1, Address(t1));
4241     __ jr(t1);
4242 
4243     return start;
4244   }
4245 
4246 #if COMPILER2_OR_JVMCI
4247 
4248 #undef __
4249 #define __ this->
4250 
4251   class Sha2Generator : public MacroAssembler {
4252     StubCodeGenerator* _cgen;
4253    public:
4254       Sha2Generator(MacroAssembler* masm, StubCodeGenerator* cgen) : MacroAssembler(masm->code()), _cgen(cgen) {}
4255       address generate_sha256_implCompress(StubGenStubId stub_id) {
4256         return generate_sha2_implCompress(Assembler::e32, stub_id);
4257       }
4258       address generate_sha512_implCompress(StubGenStubId stub_id) {
4259         return generate_sha2_implCompress(Assembler::e64, stub_id);
4260       }
4261    private:
4262 
4263     void vleXX_v(Assembler::SEW vset_sew, VectorRegister vr, Register sr) {
4264       if (vset_sew == Assembler::e32) __ vle32_v(vr, sr);
4265       else                            __ vle64_v(vr, sr);
4266     }
4267 
4268     void vseXX_v(Assembler::SEW vset_sew, VectorRegister vr, Register sr) {
4269       if (vset_sew == Assembler::e32) __ vse32_v(vr, sr);
4270       else                            __ vse64_v(vr, sr);
4271     }
4272 
4273     // Overview of the logic in each "quad round".
4274     //
4275     // The code below repeats 16/20 times the logic implementing four rounds
    // of the SHA-256/512 core loop as documented by NIST. The 16/20 "quad rounds"
    // correspond to the 64/80 single rounds.
4278     //
4279     //    // Load four word (u32/64) constants (K[t+3], K[t+2], K[t+1], K[t+0])
4280     //    // Output:
4281     //    //   vTmp1 = {K[t+3], K[t+2], K[t+1], K[t+0]}
4282     //    vl1reXX.v vTmp1, ofs
4283     //
4284     //    // Increment word constant address by stride (16/32 bytes, 4*4B/8B, 128b/256b)
4285     //    addi ofs, ofs, 16/32
4286     //
4287     //    // Add constants to message schedule words:
4288     //    //  Input
4289     //    //    vTmp1 = {K[t+3], K[t+2], K[t+1], K[t+0]}
4290     //    //    vW0 = {W[t+3], W[t+2], W[t+1], W[t+0]}; // Vt0 = W[3:0];
4291     //    //  Output
4292     //    //    vTmp0 = {W[t+3]+K[t+3], W[t+2]+K[t+2], W[t+1]+K[t+1], W[t+0]+K[t+0]}
4293     //    vadd.vv vTmp0, vTmp1, vW0
4294     //
4295     //    //  2 rounds of working variables updates.
4296     //    //     vState1[t+4] <- vState1[t], vState0[t], vTmp0[t]
4297     //    //  Input:
4298     //    //    vState1 = {c[t],d[t],g[t],h[t]}   " = vState1[t] "
4299     //    //    vState0 = {a[t],b[t],e[t],f[t]}
4300     //    //    vTmp0 = {W[t+3]+K[t+3], W[t+2]+K[t+2], W[t+1]+K[t+1], W[t+0]+K[t+0]}
4301     //    //  Output:
4302     //    //    vState1 = {f[t+2],e[t+2],b[t+2],a[t+2]}  " = vState0[t+2] "
4303     //    //        = {h[t+4],g[t+4],d[t+4],c[t+4]}  " = vState1[t+4] "
4304     //    vsha2cl.vv vState1, vState0, vTmp0
4305     //
4306     //    //  2 rounds of working variables updates.
4307     //    //     vState0[t+4] <- vState0[t], vState0[t+2], vTmp0[t]
4308     //    //  Input
4309     //    //   vState0 = {a[t],b[t],e[t],f[t]}       " = vState0[t] "
4310     //    //       = {h[t+2],g[t+2],d[t+2],c[t+2]}   " = vState1[t+2] "
4311     //    //   vState1 = {f[t+2],e[t+2],b[t+2],a[t+2]}   " = vState0[t+2] "
4312     //    //   vTmp0 = {W[t+3]+K[t+3], W[t+2]+K[t+2], W[t+1]+K[t+1], W[t+0]+K[t+0]}
4313     //    //  Output:
4314     //    //   vState0 = {f[t+4],e[t+4],b[t+4],a[t+4]}   " = vState0[t+4] "
4315     //    vsha2ch.vv vState0, vState1, vTmp0
4316     //
4317     //    // Combine 2QW into 1QW
4318     //    //
4319     //    // To generate the next 4 words, "new_vW0"/"vTmp0" from vW0-vW3, vsha2ms needs
4320     //    //     vW0[0..3], vW1[0], vW2[1..3], vW3[0, 2..3]
4321     //    // and it can only take 3 vectors as inputs. Hence we need to combine
4322     //    // vW1[0] and vW2[1..3] in a single vector.
4323     //    //
4324     //    // vmerge Vt4, Vt1, Vt2, V0
4325     //    // Input
4326     //    //  V0 = mask // first word from vW2, 1..3 words from vW1
4327     //    //  vW2 = {Wt-8, Wt-7, Wt-6, Wt-5}
4328     //    //  vW1 = {Wt-12, Wt-11, Wt-10, Wt-9}
4329     //    // Output
4330     //    //  Vt4 = {Wt-12, Wt-7, Wt-6, Wt-5}
4331     //    vmerge.vvm vTmp0, vW2, vW1, v0
4332     //
4333     //    // Generate next Four Message Schedule Words (hence allowing for 4 more rounds)
4334     //    // Input
4335     //    //  vW0 = {W[t+ 3], W[t+ 2], W[t+ 1], W[t+ 0]}     W[ 3: 0]
4336     //    //  vW3 = {W[t+15], W[t+14], W[t+13], W[t+12]}     W[15:12]
4337     //    //  vTmp0 = {W[t+11], W[t+10], W[t+ 9], W[t+ 4]}     W[11: 9,4]
4338     //    // Output (next four message schedule words)
4339     //    //  vW0 = {W[t+19],  W[t+18],  W[t+17],  W[t+16]}  W[19:16]
4340     //    vsha2ms.vv vW0, vTmp0, vW3
4341     //
4342     // BEFORE
4343     //  vW0 - vW3 hold the message schedule words (initially the block words)
4344     //    vW0 = W[ 3: 0]   "oldest"
4345     //    vW1 = W[ 7: 4]
4346     //    vW2 = W[11: 8]
4347     //    vW3 = W[15:12]   "newest"
4348     //
    //  vState0 - vState1 hold the working state variables
4350     //    vState0 = {a[t],b[t],e[t],f[t]}   // initially {H5,H4,H1,H0}
4351     //    vState1 = {c[t],d[t],g[t],h[t]}   // initially {H7,H6,H3,H2}
4352     //
4353     // AFTER
4354     //  vW0 - vW3 hold the message schedule words (initially the block words)
4355     //    vW1 = W[ 7: 4]   "oldest"
4356     //    vW2 = W[11: 8]
4357     //    vW3 = W[15:12]
4358     //    vW0 = W[19:16]   "newest"
4359     //
4360     //  vState0 and vState1 hold the working state variables
4361     //    vState0 = {a[t+4],b[t+4],e[t+4],f[t+4]}
4362     //    vState1 = {c[t+4],d[t+4],g[t+4],h[t+4]}
4363     //
4364     //  The group of vectors vW0,vW1,vW2,vW3 is "rotated" by one in each quad-round,
4365     //  hence the uses of those vectors rotate in each round, and we get back to the
4366     //  initial configuration every 4 quad-rounds. We could avoid those changes at
4367     //  the cost of moving those vectors at the end of each quad-rounds.
4368     void sha2_quad_round(Assembler::SEW vset_sew, VectorRegister rot1, VectorRegister rot2, VectorRegister rot3, VectorRegister rot4,
4369                          Register scalarconst, VectorRegister vtemp, VectorRegister vtemp2, VectorRegister v_abef, VectorRegister v_cdgh,
4370                          bool gen_words = true, bool step_const = true) {
4371       __ vleXX_v(vset_sew, vtemp, scalarconst);
4372       if (step_const) {
4373         __ addi(scalarconst, scalarconst, vset_sew == Assembler::e32 ? 16 : 32);
4374       }
4375       __ vadd_vv(vtemp2, vtemp, rot1);
4376       __ vsha2cl_vv(v_cdgh, v_abef, vtemp2);
4377       __ vsha2ch_vv(v_abef, v_cdgh, vtemp2);
4378       if (gen_words) {
4379         __ vmerge_vvm(vtemp2, rot3, rot2);
4380         __ vsha2ms_vv(rot1, vtemp2, rot4);
4381       }
4382     }
4383 
4384     // Arguments:
4385     //
4386     // Inputs:
4387     //   c_rarg0   - byte[]  source+offset
4388     //   c_rarg1   - int[]   SHA.state
4389     //   c_rarg2   - int     offset
4390     //   c_rarg3   - int     limit
4391     //
4392     address generate_sha2_implCompress(Assembler::SEW vset_sew, StubGenStubId stub_id) {
4393       alignas(64) static const uint32_t round_consts_256[64] = {
4394         0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
4395         0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
4396         0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
4397         0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
4398         0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
4399         0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
4400         0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
4401         0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
4402         0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
4403         0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
4404         0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
4405         0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
4406         0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
4407         0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
4408         0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
4409         0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
4410       };
4411       alignas(64) static const uint64_t round_consts_512[80] = {
4412         0x428a2f98d728ae22l, 0x7137449123ef65cdl, 0xb5c0fbcfec4d3b2fl,
4413         0xe9b5dba58189dbbcl, 0x3956c25bf348b538l, 0x59f111f1b605d019l,
4414         0x923f82a4af194f9bl, 0xab1c5ed5da6d8118l, 0xd807aa98a3030242l,
4415         0x12835b0145706fbel, 0x243185be4ee4b28cl, 0x550c7dc3d5ffb4e2l,
4416         0x72be5d74f27b896fl, 0x80deb1fe3b1696b1l, 0x9bdc06a725c71235l,
4417         0xc19bf174cf692694l, 0xe49b69c19ef14ad2l, 0xefbe4786384f25e3l,
4418         0x0fc19dc68b8cd5b5l, 0x240ca1cc77ac9c65l, 0x2de92c6f592b0275l,
4419         0x4a7484aa6ea6e483l, 0x5cb0a9dcbd41fbd4l, 0x76f988da831153b5l,
4420         0x983e5152ee66dfabl, 0xa831c66d2db43210l, 0xb00327c898fb213fl,
4421         0xbf597fc7beef0ee4l, 0xc6e00bf33da88fc2l, 0xd5a79147930aa725l,
4422         0x06ca6351e003826fl, 0x142929670a0e6e70l, 0x27b70a8546d22ffcl,
4423         0x2e1b21385c26c926l, 0x4d2c6dfc5ac42aedl, 0x53380d139d95b3dfl,
4424         0x650a73548baf63del, 0x766a0abb3c77b2a8l, 0x81c2c92e47edaee6l,
4425         0x92722c851482353bl, 0xa2bfe8a14cf10364l, 0xa81a664bbc423001l,
4426         0xc24b8b70d0f89791l, 0xc76c51a30654be30l, 0xd192e819d6ef5218l,
4427         0xd69906245565a910l, 0xf40e35855771202al, 0x106aa07032bbd1b8l,
4428         0x19a4c116b8d2d0c8l, 0x1e376c085141ab53l, 0x2748774cdf8eeb99l,
4429         0x34b0bcb5e19b48a8l, 0x391c0cb3c5c95a63l, 0x4ed8aa4ae3418acbl,
4430         0x5b9cca4f7763e373l, 0x682e6ff3d6b2b8a3l, 0x748f82ee5defb2fcl,
4431         0x78a5636f43172f60l, 0x84c87814a1f0ab72l, 0x8cc702081a6439ecl,
4432         0x90befffa23631e28l, 0xa4506cebde82bde9l, 0xbef9a3f7b2c67915l,
4433         0xc67178f2e372532bl, 0xca273eceea26619cl, 0xd186b8c721c0c207l,
4434         0xeada7dd6cde0eb1el, 0xf57d4f7fee6ed178l, 0x06f067aa72176fbal,
4435         0x0a637dc5a2c898a6l, 0x113f9804bef90dael, 0x1b710b35131c471bl,
4436         0x28db77f523047d84l, 0x32caab7b40c72493l, 0x3c9ebe0a15c9bebcl,
4437         0x431d67c49c100d4cl, 0x4cc5d4becb3e42b6l, 0x597f299cfc657e2al,
4438         0x5fcb6fab3ad6faecl, 0x6c44198c4a475817l
4439       };
4440       const int const_add = vset_sew == Assembler::e32 ? 16 : 32;
4441 
4442       bool multi_block;
4443       switch (stub_id) {
4444       case sha256_implCompress_id:
4445         assert (vset_sew == Assembler::e32, "wrong macroassembler for stub");
4446         multi_block = false;
4447         break;
4448       case sha256_implCompressMB_id:
4449         assert (vset_sew == Assembler::e32, "wrong macroassembler for stub");
4450         multi_block = true;
4451         break;
4452       case sha512_implCompress_id:
4453         assert (vset_sew == Assembler::e64, "wrong macroassembler for stub");
4454         multi_block = false;
4455         break;
4456       case sha512_implCompressMB_id:
4457         assert (vset_sew == Assembler::e64, "wrong macroassembler for stub");
4458         multi_block = true;
4459         break;
4460       default:
4461         ShouldNotReachHere();
4462       };
4463       __ align(CodeEntryAlignment);
4464       StubCodeMark mark(_cgen, stub_id);
4465       address start = __ pc();
4466 
4467       Register buf   = c_rarg0;
4468       Register state = c_rarg1;
4469       Register ofs   = c_rarg2;
4470       Register limit = c_rarg3;
4471       Register consts =  t2; // caller saved
4472       Register state_c = x28; // caller saved
4473       VectorRegister vindex = v2;
4474       VectorRegister vW0 = v4;
4475       VectorRegister vW1 = v6;
4476       VectorRegister vW2 = v8;
4477       VectorRegister vW3 = v10;
4478       VectorRegister vState0 = v12;
4479       VectorRegister vState1 = v14;
4480       VectorRegister vHash0  = v16;
4481       VectorRegister vHash1  = v18;
4482       VectorRegister vTmp0   = v20;
4483       VectorRegister vTmp1   = v22;
4484 
4485       Label multi_block_loop;
4486 
4487       __ enter();
4488 
4489       address constant_table = vset_sew == Assembler::e32 ? (address)round_consts_256 : (address)round_consts_512;
4490       la(consts, ExternalAddress(constant_table));
4491 
4492       // Register use in this function:
4493       //
4494       // VECTORS
4495       //  vW0 - vW3 (512/1024 bits = 4*128/256 bits = 4*4*32/64 bits), hold the message
4496       //             schedule words (Wt). They start with the message block
4497       //             content (W0 to W15), then further words in the message
4498       //             schedule generated via vsha2ms from previous Wt.
4499       //   Initially:
4500       //     vW0 = W[  3:0] = { W3,  W2,  W1,  W0}
4501       //     vW1 = W[  7:4] = { W7,  W6,  W5,  W4}
4502       //     vW2 = W[ 11:8] = {W11, W10,  W9,  W8}
4503       //     vW3 = W[15:12] = {W15, W14, W13, W12}
4504       //
4505       //  vState0 - vState1 hold the working state variables (a, b, ..., h)
4506       //    vState0 = {f[t],e[t],b[t],a[t]}
4507       //    vState1 = {h[t],g[t],d[t],c[t]}
4508       //   Initially:
4509       //    vState0 = {H5i-1, H4i-1, H1i-1, H0i-1}
4510       //    vState1 = {H7i-1, H6i-1, H3i-1, H2i-1}
4511       //
4512       //  v0 = masks for vrgather/vmerge. Single value during the 16 rounds.
4513       //
4514       //  vTmp0 = temporary, Wt+Kt
4515       //  vTmp1 = temporary, Kt
4516       //
4517       //  vHash0/vHash1 = hold the initial values of the hash, byte-swapped.
4518       //
4519       // During most of the function the vector state is configured so that each
4520       // vector is interpreted as containing four 32/64-bit (e32/e64) elements (128/256 bits).
4521 
4522       // vsha2ch/vsha2cl uses EGW of 4*SEW.
4523       // SHA256 SEW = e32, EGW = 128-bits
4524       // SHA512 SEW = e64, EGW = 256-bits
4525       //
4526       // VLEN is required to be at least 128.
4527       // For the case of VLEN=128 and SHA512 we need LMUL=2 to work with 4*e64 (EGW = 256)
4528       //
4529       // m1/m2: LMUL=1, or 2 for the SHA-512/VLEN=128 case above
4530       // ta: tail agnostic (don't care about those lanes)
4531       // ma: mask agnostic (don't care about those lanes)
4532       // x0 is not written; we already know the number of vector elements.
4533 
4534       if (vset_sew == Assembler::e64 && MaxVectorSize == 16) { // SHA512 and VLEN = 128
4535         __ vsetivli(x0, 4, vset_sew, Assembler::m2, Assembler::ma, Assembler::ta);
4536       } else {
4537         __ vsetivli(x0, 4, vset_sew, Assembler::m1, Assembler::ma, Assembler::ta);
4538       }
4539 
4540       int64_t indexes = vset_sew == Assembler::e32 ? 0x00041014ul : 0x00082028ul;
4541       __ li(t0, indexes);
4542       __ vmv_v_x(vindex, t0);
4543 
4544       // Step over a and b so that state_c points to c.
4545       // const_add equals 4 state variables in bytes, so const_add/2 skips exactly two of them (a and b).
4546       __ addi(state_c, state, const_add/2);
4547 
4548       // Use index-load to get {f,e,b,a},{h,g,d,c}
4549       __ vluxei8_v(vState0, state, vindex);
4550       __ vluxei8_v(vState1, state_c, vindex);
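           // Note (illustration): the 'indexes' constant packs four byte offsets
           // (0x00, 0x04, 0x10, 0x14 for SHA-256; 0x00, 0x08, 0x20, 0x28 for SHA-512),
           // i.e. words 0, 1, 4 and 5 of the hash state. Relative to 'state' these are
           // a, b, e, f; relative to 'state_c' (state + 2 words) they are c, d, g, h,
           // which produces the {f,e,b,a} / {h,g,d,c} layouts described above.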
4551 
4552       __ bind(multi_block_loop);
4553 
4554       // Capture the initial H values in vHash0 and vHash1 to allow for computing
4555       // the resulting H', since H' = H+{a',b',c',...,h'}.
4556       __ vmv_v_v(vHash0, vState0);
4557       __ vmv_v_v(vHash1, vState1);
4558 
4559       // Load the 512/1024 bits of the message block into vW0-vW3 and perform
4560       // an endian swap on each 4/8-byte element.
4561       //
4562       // If Zvkb is not implemented one can use vrgather
4563       // with an index sequence to byte-swap.
4564       //  sequence = [3 2 1 0   7 6 5 4  11 10 9 8   15 14 13 12]
4565       //   <https://oeis.org/A004444> gives us "N ^ 3" as a nice formula to generate
4566       //  this sequence. 'vid' gives us the N.
4567       __ vleXX_v(vset_sew, vW0, buf);
4568       __ vrev8_v(vW0, vW0);
4569       __ addi(buf, buf, const_add);
4570       __ vleXX_v(vset_sew, vW1, buf);
4571       __ vrev8_v(vW1, vW1);
4572       __ addi(buf, buf, const_add);
4573       __ vleXX_v(vset_sew, vW2, buf);
4574       __ vrev8_v(vW2, vW2);
4575       __ addi(buf, buf, const_add);
4576       __ vleXX_v(vset_sew, vW3, buf);
4577       __ vrev8_v(vW3, vW3);
4578       __ addi(buf, buf, const_add);
4579 
4580       // Set v0 up for the vmerge that replaces the first word (idx==0)
4581       __ vid_v(v0);
4582       __ vmseq_vi(v0, v0, 0x0);  // v0.mask[i] = (i == 0 ? 1 : 0)
4583 
4584       VectorRegister rotation_regs[] = {vW0, vW1, vW2, vW3};
4585       int rot_pos = 0;
4586       // Quad-round #0 (+0, vW0->vW1->vW2->vW3) ... #11/#15 (+3, vW3->vW0->vW1->vW2) for SHA-256/SHA-512
4587       const int qr_end = vset_sew == Assembler::e32 ? 12 : 16;
4588       for (int i = 0; i < qr_end; i++) {
4589         sha2_quad_round(vset_sew,
4590                    rotation_regs[(rot_pos + 0) & 0x3],
4591                    rotation_regs[(rot_pos + 1) & 0x3],
4592                    rotation_regs[(rot_pos + 2) & 0x3],
4593                    rotation_regs[(rot_pos + 3) & 0x3],
4594                    consts,
4595                    vTmp1, vTmp0, vState0, vState1);
4596         ++rot_pos;
4597       }
4598       // Quad-round #12/#16 (+0, vW0->vW1->vW2->vW3) ... #15/#19 (+3, vW3->vW0->vW1->vW2) for SHA-256/SHA-512
4599       // Note that we stop generating new message schedule words (Wt, vW0-vW3)
4600       // as we have already generated all the words we end up consuming (i.e., W[63:60] / W[79:76]).
4601       const int qr_c_end = qr_end + 4;
4602       for (int i = qr_end; i < qr_c_end; i++) {
4603         sha2_quad_round(vset_sew,
4604                    rotation_regs[(rot_pos + 0) & 0x3],
4605                    rotation_regs[(rot_pos + 1) & 0x3],
4606                    rotation_regs[(rot_pos + 2) & 0x3],
4607                    rotation_regs[(rot_pos + 3) & 0x3],
4608                    consts,
4609                    vTmp1, vTmp0, vState0, vState1, false, i < (qr_c_end-1));
4610         ++rot_pos;
4611       }
4612 
4613       //--------------------------------------------------------------------------------
4614       // Compute the updated hash value H'
4615       //   H' = H + {h',g',...,b',a'}
4616       //      = {h,g,...,b,a} + {h',g',...,b',a'}
4617       //      = {h+h',g+g',...,b+b',a+a'}
4618 
4619       // H' = H+{a',b',c',...,h'}
4620       __ vadd_vv(vState0, vHash0, vState0);
4621       __ vadd_vv(vState1, vHash1, vState1);
4622 
4623       if (multi_block) {
4624         int total_adds = vset_sew == Assembler::e32 ? 240 : 608;
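             // Assumed rationale: each quad round except the last advances 'consts' by
             // const_add bytes, so 15 * 16 = 240 (SHA-256) / 19 * 32 = 608 (SHA-512)
             // rewinds it to the start of the round-constant table for the next block.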
4625         __ subi(consts, consts, total_adds);
4626         __ addi(ofs, ofs, vset_sew == Assembler::e32 ? 64 : 128);
4627         __ ble(ofs, limit, multi_block_loop);
4628         __ mv(c_rarg0, ofs); // return ofs
4629       }
4630 
4631       // Store H[0..7] = {a,b,c,d,e,f,g,h} from
4632       //  vState0 = {f,e,b,a}
4633       //  vState1 = {h,g,d,c}
4634       __ vsuxei8_v(vState0, state,   vindex);
4635       __ vsuxei8_v(vState1, state_c, vindex);
4636 
4637       __ leave();
4638       __ ret();
4639 
4640       return start;
4641     }
4642   };
4643 
4644 #undef __
4645 #define __ _masm->
4646 
4647   // Set of L registers that correspond to a contiguous memory area.
4648   // Each 64-bit register typically corresponds to 2 32-bit integers.
4649   template <uint L>
4650   class RegCache {
4651   private:
4652     MacroAssembler *_masm;
4653     Register _regs[L];
4654 
4655   public:
4656     RegCache(MacroAssembler *masm, RegSet rs): _masm(masm) {
4657       assert(rs.size() == L, "%u registers are used to cache %u 4-byte data", rs.size(), 2 * L);
4658       auto it = rs.begin();
4659       for (auto &r: _regs) {
4660         r = *it;
4661         ++it;
4662       }
4663     }
4664 
4665     // generate load for the i'th register
4666     void gen_load(uint i, Register base) {
4667       assert(i < L, "invalid i: %u", i);
4668       __ ld(_regs[i], Address(base, 8 * i));
4669     }
4670 
4671     // add i'th 32-bit integer to dest
4672     void add_u32(const Register dest, uint i, const Register rtmp = t0) {
4673       assert(i < 2 * L, "invalid i: %u", i);
4674 
4675       if (is_even(i)) {
4676         // Use the bottom 32 bits. No need to mask off the top 32 bits
4677         // as addw will do the right thing.
4678         __ addw(dest, dest, _regs[i / 2]);
4679       } else {
4680         // Use the top 32 bits by right-shifting them.
4681         __ srli(rtmp, _regs[i / 2], 32);
4682         __ addw(dest, dest, rtmp);
4683       }
4684     }
4685   };
4686 
4687   typedef RegCache<8> BufRegCache;
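       // Example (illustrative): BufRegCache caches a 64-byte block in 8 registers;
       // add_u32(dest, 5) adds bytes 20..23 of the block, i.e. the high half of _regs[2].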
4688 
4689   // a += value + x + ac;
4690   // a = Integer.rotateLeft(a, s) + b;
4691   void m5_FF_GG_HH_II_epilogue(BufRegCache& reg_cache,
4692                                Register a, Register b, Register c, Register d,
4693                                int k, int s, int t,
4694                                Register value) {
4695     // a += ac
4696     __ addw(a, a, t, t1);
4697 
4698     // a += x;
4699     reg_cache.add_u32(a, k);
4700     // a += value;
4701     __ addw(a, a, value);
4702 
4703     // a = Integer.rotateLeft(a, s) + b;
4704     __ rolw(a, a, s);
4705     __ addw(a, a, b);
4706   }
4707 
4708   // a += ((b & c) | ((~b) & d)) + x + ac;
4709   // a = Integer.rotateLeft(a, s) + b;
4710   void md5_FF(BufRegCache& reg_cache,
4711               Register a, Register b, Register c, Register d,
4712               int k, int s, int t,
4713               Register rtmp1, Register rtmp2) {
4714     // rtmp1 = b & c
4715     __ andr(rtmp1, b, c);
4716 
4717     // rtmp2 = (~b) & d
4718     __ andn(rtmp2, d, b);
4719 
4720     // rtmp1 = (b & c) | ((~b) & d)
4721     __ orr(rtmp1, rtmp1, rtmp2);
4722 
4723     m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1);
4724   }
4725 
4726   // a += ((b & d) | (c & (~d))) + x + ac;
4727   // a = Integer.rotateLeft(a, s) + b;
4728   void md5_GG(BufRegCache& reg_cache,
4729               Register a, Register b, Register c, Register d,
4730               int k, int s, int t,
4731               Register rtmp1, Register rtmp2) {
4732     // rtmp1 = b & d
4733     __ andr(rtmp1, b, d);
4734 
4735     // rtmp2 = c & (~d)
4736     __ andn(rtmp2, c, d);
4737 
4738     // rtmp1 = (b & d) | (c & (~d))
4739     __ orr(rtmp1, rtmp1, rtmp2);
4740 
4741     m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1);
4742   }
4743 
4744   // a += ((b ^ c) ^ d) + x + ac;
4745   // a = Integer.rotateLeft(a, s) + b;
4746   void md5_HH(BufRegCache& reg_cache,
4747               Register a, Register b, Register c, Register d,
4748               int k, int s, int t,
4749               Register rtmp1, Register rtmp2) {
4750     // rtmp1 = (b ^ c) ^ d
4751     __ xorr(rtmp2, b, c);
4752     __ xorr(rtmp1, rtmp2, d);
4753 
4754     m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1);
4755   }
4756 
4757   // a += (c ^ (b | (~d))) + x + ac;
4758   // a = Integer.rotateLeft(a, s) + b;
4759   void md5_II(BufRegCache& reg_cache,
4760               Register a, Register b, Register c, Register d,
4761               int k, int s, int t,
4762               Register rtmp1, Register rtmp2) {
4763     // rtmp1 = c ^ (b | (~d))
4764     __ orn(rtmp2, b, d);
4765     __ xorr(rtmp1, c, rtmp2);
4766 
4767     m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1);
4768   }
4769 
4770   // Arguments:
4771   //
4772   // Inputs:
4773   //   c_rarg0   - byte[]  source+offset
4774   //   c_rarg1   - int[]   SHA.state
4775   //   c_rarg2   - int     offset  (multi_block == True)
4776   //   c_rarg3   - int     limit   (multi_block == True)
4777   //
4778   // Registers:
4779   //    x0   zero  (zero)
4780   //    x1     ra  (return address)
4781   //    x2     sp  (stack pointer)
4782   //    x3     gp  (global pointer)
4783   //    x4     tp  (thread pointer)
4784   //    x5     t0  (tmp register)
4785   //    x6     t1  (tmp register)
4786   //    x7     t2  state0
4787   //    x8  fp/s0  (frame pointer)
4788   //    x9     s1
4789   //   x10     a0  rtmp1 / c_rarg0
4790   //   x11     a1  rtmp2 / c_rarg1
4791   //   x12     a2  a     / c_rarg2
4792   //   x13     a3  b     / c_rarg3
4793   //   x14     a4  c
4794   //   x15     a5  d
4795   //   x16     a6  buf
4796   //   x17     a7  state
4797   //   x18     s2  ofs     [saved-reg]  (multi_block == True)
4798   //   x19     s3  limit   [saved-reg]  (multi_block == True)
4799   //   x20     s4  state1  [saved-reg]
4800   //   x21     s5  state2  [saved-reg]
4801   //   x22     s6  state3  [saved-reg]
4802   //   x23     s7
4803   //   x24     s8  buf0    [saved-reg]
4804   //   x25     s9  buf1    [saved-reg]
4805   //   x26    s10  buf2    [saved-reg]
4806   //   x27    s11  buf3    [saved-reg]
4807   //   x28     t3  buf4
4808   //   x29     t4  buf5
4809   //   x30     t5  buf6
4810   //   x31     t6  buf7
4811   address generate_md5_implCompress(StubGenStubId stub_id) {
4812     __ align(CodeEntryAlignment);
4813     bool multi_block;
4814     switch (stub_id) {
4815     case md5_implCompress_id:
4816       multi_block = false;
4817       break;
4818     case md5_implCompressMB_id:
4819       multi_block = true;
4820       break;
4821     default:
4822       ShouldNotReachHere();
4823     };
4824     StubCodeMark mark(this, stub_id);
4825     address start = __ pc();
4826 
4827     // rotation constants
4828     const int S11 = 7;
4829     const int S12 = 12;
4830     const int S13 = 17;
4831     const int S14 = 22;
4832     const int S21 = 5;
4833     const int S22 = 9;
4834     const int S23 = 14;
4835     const int S24 = 20;
4836     const int S31 = 4;
4837     const int S32 = 11;
4838     const int S33 = 16;
4839     const int S34 = 23;
4840     const int S41 = 6;
4841     const int S42 = 10;
4842     const int S43 = 15;
4843     const int S44 = 21;
4844 
4845     const int64_t mask32 = 0xffffffff;
4846 
4847     Register buf_arg   = c_rarg0; // a0
4848     Register state_arg = c_rarg1; // a1
4849     Register ofs_arg   = c_rarg2; // a2
4850     Register limit_arg = c_rarg3; // a3
4851 
4852     // we'll copy the args to these registers to free up a0-a3
4853     // to use for other values manipulated by instructions
4854     // that can be compressed
4855     Register buf       = x16; // a6
4856     Register state     = x17; // a7
4857     Register ofs       = x18; // s2
4858     Register limit     = x19; // s3
4859 
4860     // using x12->15 to allow compressed instructions
4861     Register a         = x12; // a2
4862     Register b         = x13; // a3
4863     Register c         = x14; // a4
4864     Register d         = x15; // a5
4865 
4866     Register state0    =  x7; // t2
4867     Register state1    = x20; // s4
4868     Register state2    = x21; // s5
4869     Register state3    = x22; // s6
4870 
4871     // using x10->x11 to allow compressed instructions
4872     Register rtmp1     = x10; // a0
4873     Register rtmp2     = x11; // a1
4874 
4875     RegSet reg_cache_saved_regs = RegSet::of(x24, x25, x26, x27); // s8, s9, s10, s11
4876     RegSet reg_cache_regs;
4877     reg_cache_regs += reg_cache_saved_regs;
4878     reg_cache_regs += RegSet::of(t3, t4, t5, t6);
4879     BufRegCache reg_cache(_masm, reg_cache_regs);
4880 
4881     RegSet saved_regs;
4882     if (multi_block) {
4883       saved_regs += RegSet::of(ofs, limit);
4884     }
4885     saved_regs += RegSet::of(state1, state2, state3);
4886     saved_regs += reg_cache_saved_regs;
4887 
4888     __ push_reg(saved_regs, sp);
4889 
4890     __ mv(buf, buf_arg);
4891     __ mv(state, state_arg);
4892     if (multi_block) {
4893       __ mv(ofs, ofs_arg);
4894       __ mv(limit, limit_arg);
4895     }
4896 
4897     // to minimize the number of memory operations:
4898     // read the four 4-byte state values in pairs, with a single ld each,
4899     // and split each pair into 2 registers.
4900     //
4901     // Since the core MD5 algorithm works on 32-bit words, the code below
4902     // does not care about the contents of the upper 32 bits of state[x].
4903     // Based on this observation, we can apply a further optimization,
4904     // which is to simply ignore the upper 32 bits of state0/state2,
4905     // rather than setting the upper 32 bits of state0/state2 to zero
4906     // explicitly with extra instructions.
4907     __ ld(state0, Address(state));
4908     __ srli(state1, state0, 32);
4909     __ ld(state2, Address(state, 8));
4910     __ srli(state3, state2, 32);
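         // After these loads (little-endian layout):
         //   state0[31:0] = state[0] (A), state1[31:0] = state[1] (B),
         //   state2[31:0] = state[2] (C), state3[31:0] = state[3] (D);
         // the upper halves of state0/state2 still hold B/D but are ignored, as noted above.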
4911 
4912     Label md5_loop;
4913     __ BIND(md5_loop);
4914 
4915     __ mv(a, state0);
4916     __ mv(b, state1);
4917     __ mv(c, state2);
4918     __ mv(d, state3);
4919 
4920     // Round 1
4921     reg_cache.gen_load(0, buf);
4922     md5_FF(reg_cache, a, b, c, d,  0, S11, 0xd76aa478, rtmp1, rtmp2);
4923     md5_FF(reg_cache, d, a, b, c,  1, S12, 0xe8c7b756, rtmp1, rtmp2);
4924     reg_cache.gen_load(1, buf);
4925     md5_FF(reg_cache, c, d, a, b,  2, S13, 0x242070db, rtmp1, rtmp2);
4926     md5_FF(reg_cache, b, c, d, a,  3, S14, 0xc1bdceee, rtmp1, rtmp2);
4927     reg_cache.gen_load(2, buf);
4928     md5_FF(reg_cache, a, b, c, d,  4, S11, 0xf57c0faf, rtmp1, rtmp2);
4929     md5_FF(reg_cache, d, a, b, c,  5, S12, 0x4787c62a, rtmp1, rtmp2);
4930     reg_cache.gen_load(3, buf);
4931     md5_FF(reg_cache, c, d, a, b,  6, S13, 0xa8304613, rtmp1, rtmp2);
4932     md5_FF(reg_cache, b, c, d, a,  7, S14, 0xfd469501, rtmp1, rtmp2);
4933     reg_cache.gen_load(4, buf);
4934     md5_FF(reg_cache, a, b, c, d,  8, S11, 0x698098d8, rtmp1, rtmp2);
4935     md5_FF(reg_cache, d, a, b, c,  9, S12, 0x8b44f7af, rtmp1, rtmp2);
4936     reg_cache.gen_load(5, buf);
4937     md5_FF(reg_cache, c, d, a, b, 10, S13, 0xffff5bb1, rtmp1, rtmp2);
4938     md5_FF(reg_cache, b, c, d, a, 11, S14, 0x895cd7be, rtmp1, rtmp2);
4939     reg_cache.gen_load(6, buf);
4940     md5_FF(reg_cache, a, b, c, d, 12, S11, 0x6b901122, rtmp1, rtmp2);
4941     md5_FF(reg_cache, d, a, b, c, 13, S12, 0xfd987193, rtmp1, rtmp2);
4942     reg_cache.gen_load(7, buf);
4943     md5_FF(reg_cache, c, d, a, b, 14, S13, 0xa679438e, rtmp1, rtmp2);
4944     md5_FF(reg_cache, b, c, d, a, 15, S14, 0x49b40821, rtmp1, rtmp2);
4945 
4946     // Round 2
4947     md5_GG(reg_cache, a, b, c, d,  1, S21, 0xf61e2562, rtmp1, rtmp2);
4948     md5_GG(reg_cache, d, a, b, c,  6, S22, 0xc040b340, rtmp1, rtmp2);
4949     md5_GG(reg_cache, c, d, a, b, 11, S23, 0x265e5a51, rtmp1, rtmp2);
4950     md5_GG(reg_cache, b, c, d, a,  0, S24, 0xe9b6c7aa, rtmp1, rtmp2);
4951     md5_GG(reg_cache, a, b, c, d,  5, S21, 0xd62f105d, rtmp1, rtmp2);
4952     md5_GG(reg_cache, d, a, b, c, 10, S22, 0x02441453, rtmp1, rtmp2);
4953     md5_GG(reg_cache, c, d, a, b, 15, S23, 0xd8a1e681, rtmp1, rtmp2);
4954     md5_GG(reg_cache, b, c, d, a,  4, S24, 0xe7d3fbc8, rtmp1, rtmp2);
4955     md5_GG(reg_cache, a, b, c, d,  9, S21, 0x21e1cde6, rtmp1, rtmp2);
4956     md5_GG(reg_cache, d, a, b, c, 14, S22, 0xc33707d6, rtmp1, rtmp2);
4957     md5_GG(reg_cache, c, d, a, b,  3, S23, 0xf4d50d87, rtmp1, rtmp2);
4958     md5_GG(reg_cache, b, c, d, a,  8, S24, 0x455a14ed, rtmp1, rtmp2);
4959     md5_GG(reg_cache, a, b, c, d, 13, S21, 0xa9e3e905, rtmp1, rtmp2);
4960     md5_GG(reg_cache, d, a, b, c,  2, S22, 0xfcefa3f8, rtmp1, rtmp2);
4961     md5_GG(reg_cache, c, d, a, b,  7, S23, 0x676f02d9, rtmp1, rtmp2);
4962     md5_GG(reg_cache, b, c, d, a, 12, S24, 0x8d2a4c8a, rtmp1, rtmp2);
4963 
4964     // Round 3
4965     md5_HH(reg_cache, a, b, c, d,  5, S31, 0xfffa3942, rtmp1, rtmp2);
4966     md5_HH(reg_cache, d, a, b, c,  8, S32, 0x8771f681, rtmp1, rtmp2);
4967     md5_HH(reg_cache, c, d, a, b, 11, S33, 0x6d9d6122, rtmp1, rtmp2);
4968     md5_HH(reg_cache, b, c, d, a, 14, S34, 0xfde5380c, rtmp1, rtmp2);
4969     md5_HH(reg_cache, a, b, c, d,  1, S31, 0xa4beea44, rtmp1, rtmp2);
4970     md5_HH(reg_cache, d, a, b, c,  4, S32, 0x4bdecfa9, rtmp1, rtmp2);
4971     md5_HH(reg_cache, c, d, a, b,  7, S33, 0xf6bb4b60, rtmp1, rtmp2);
4972     md5_HH(reg_cache, b, c, d, a, 10, S34, 0xbebfbc70, rtmp1, rtmp2);
4973     md5_HH(reg_cache, a, b, c, d, 13, S31, 0x289b7ec6, rtmp1, rtmp2);
4974     md5_HH(reg_cache, d, a, b, c,  0, S32, 0xeaa127fa, rtmp1, rtmp2);
4975     md5_HH(reg_cache, c, d, a, b,  3, S33, 0xd4ef3085, rtmp1, rtmp2);
4976     md5_HH(reg_cache, b, c, d, a,  6, S34, 0x04881d05, rtmp1, rtmp2);
4977     md5_HH(reg_cache, a, b, c, d,  9, S31, 0xd9d4d039, rtmp1, rtmp2);
4978     md5_HH(reg_cache, d, a, b, c, 12, S32, 0xe6db99e5, rtmp1, rtmp2);
4979     md5_HH(reg_cache, c, d, a, b, 15, S33, 0x1fa27cf8, rtmp1, rtmp2);
4980     md5_HH(reg_cache, b, c, d, a,  2, S34, 0xc4ac5665, rtmp1, rtmp2);
4981 
4982     // Round 4
4983     md5_II(reg_cache, a, b, c, d,  0, S41, 0xf4292244, rtmp1, rtmp2);
4984     md5_II(reg_cache, d, a, b, c,  7, S42, 0x432aff97, rtmp1, rtmp2);
4985     md5_II(reg_cache, c, d, a, b, 14, S43, 0xab9423a7, rtmp1, rtmp2);
4986     md5_II(reg_cache, b, c, d, a,  5, S44, 0xfc93a039, rtmp1, rtmp2);
4987     md5_II(reg_cache, a, b, c, d, 12, S41, 0x655b59c3, rtmp1, rtmp2);
4988     md5_II(reg_cache, d, a, b, c,  3, S42, 0x8f0ccc92, rtmp1, rtmp2);
4989     md5_II(reg_cache, c, d, a, b, 10, S43, 0xffeff47d, rtmp1, rtmp2);
4990     md5_II(reg_cache, b, c, d, a,  1, S44, 0x85845dd1, rtmp1, rtmp2);
4991     md5_II(reg_cache, a, b, c, d,  8, S41, 0x6fa87e4f, rtmp1, rtmp2);
4992     md5_II(reg_cache, d, a, b, c, 15, S42, 0xfe2ce6e0, rtmp1, rtmp2);
4993     md5_II(reg_cache, c, d, a, b,  6, S43, 0xa3014314, rtmp1, rtmp2);
4994     md5_II(reg_cache, b, c, d, a, 13, S44, 0x4e0811a1, rtmp1, rtmp2);
4995     md5_II(reg_cache, a, b, c, d,  4, S41, 0xf7537e82, rtmp1, rtmp2);
4996     md5_II(reg_cache, d, a, b, c, 11, S42, 0xbd3af235, rtmp1, rtmp2);
4997     md5_II(reg_cache, c, d, a, b,  2, S43, 0x2ad7d2bb, rtmp1, rtmp2);
4998     md5_II(reg_cache, b, c, d, a,  9, S44, 0xeb86d391, rtmp1, rtmp2);
4999 
5000     __ addw(state0, state0, a);
5001     __ addw(state1, state1, b);
5002     __ addw(state2, state2, c);
5003     __ addw(state3, state3, d);
5004 
5005     if (multi_block) {
5006       __ addi(buf, buf, 64);
5007       __ addi(ofs, ofs, 64);
5008       // if (ofs <= limit) goto md5_loop
5009       __ bge(limit, ofs, md5_loop);
5010       __ mv(c_rarg0, ofs); // return ofs
5011     }
5012 
5013     // to minimize the number of memory operations:
5014     // write back the 4 state 4-byte values in pairs, with a single sd
5015     __ mv(t0, mask32);
5016     __ andr(state0, state0, t0);
5017     __ slli(state1, state1, 32);
5018     __ orr(state0, state0, state1);
5019     __ sd(state0, Address(state));
5020     __ andr(state2, state2, t0);
5021     __ slli(state3, state3, 32);
5022     __ orr(state2, state2, state3);
5023     __ sd(state2, Address(state, 8));
5024 
5025     __ pop_reg(saved_regs, sp);
5026     __ ret();
5027 
5028     return (address) start;
5029   }
5030 
5031   /**
5032    * Perform the quarter round calculations on values contained within four vector registers.
5033    *
5034    * @param aVec the SIMD register containing only the "a" values
5035    * @param bVec the SIMD register containing only the "b" values
5036    * @param cVec the SIMD register containing only the "c" values
5037    * @param dVec the SIMD register containing only the "d" values
5038    * @param tmp_vr temporary vector register that holds intermediate values.
5039    */
5040   void chacha20_quarter_round(VectorRegister aVec, VectorRegister bVec,
5041                           VectorRegister cVec, VectorRegister dVec, VectorRegister tmp_vr) {
5042     // a += b, d ^= a, d <<<= 16
5043     __ vadd_vv(aVec, aVec, bVec);
5044     __ vxor_vv(dVec, dVec, aVec);
5045     __ vrole32_vi(dVec, 16, tmp_vr);
5046 
5047     // c += d, b ^= c, b <<<= 12
5048     __ vadd_vv(cVec, cVec, dVec);
5049     __ vxor_vv(bVec, bVec, cVec);
5050     __ vrole32_vi(bVec, 12, tmp_vr);
5051 
5052     // a += b, d ^= a, d <<<= 8
5053     __ vadd_vv(aVec, aVec, bVec);
5054     __ vxor_vv(dVec, dVec, aVec);
5055     __ vrole32_vi(dVec, 8, tmp_vr);
5056 
5057     // c += d, b ^= c, b <<<= 7
5058     __ vadd_vv(cVec, cVec, dVec);
5059     __ vxor_vv(bVec, bVec, cVec);
5060     __ vrole32_vi(bVec, 7, tmp_vr);
5061   }
5062 
5063   /**
5064    * int com.sun.crypto.provider.ChaCha20Cipher.implChaCha20Block(int[] initState, byte[] result)
5065    *
5066    *  Input arguments:
5067    *  c_rarg0   - state, the starting state
5068    *  c_rarg1   - key_stream, the array that will hold the result of the ChaCha20 block function
5069    *
5070    *  Implementation Note:
5071    *   Parallelization is achieved by loading individual state elements into vectors for N blocks.
5072    *   N depends on the vector register length.
5073    */
5074   address generate_chacha20Block() {
5075     Label L_Rounds;
5076 
5077     __ align(CodeEntryAlignment);
5078     StubGenStubId stub_id = StubGenStubId::chacha20Block_id;
5079     StubCodeMark mark(this, stub_id);
5080     address start = __ pc();
5081     __ enter();
5082 
5083     const int states_len = 16;
5084     const int step = 4;
5085     const Register state = c_rarg0;
5086     const Register key_stream = c_rarg1;
5087     const Register tmp_addr = t0;
5088     const Register length = t1;
5089 
5090     // Organize vector registers in an array that facilitates
5091     // putting repetitive opcodes into loop structures below.
5092     const VectorRegister work_vrs[16] = {
5093       v0, v1, v2,  v3,  v4,  v5,  v6,  v7,
5094       v8, v9, v10, v11, v12, v13, v14, v15
5095     };
5096     const VectorRegister tmp_vr = v16;
5097     const VectorRegister counter_vr = v17;
5098 
5099     {
5100       // Put 16 here, as com.sun.crypto.provider.ChaCha20Cipher.KS_MAX_LEN is 1024
5101       // at the Java level.
5102       __ vsetivli(length, 16, Assembler::e32, Assembler::m1);
5103     }
5104 
5105     // Load from source state.
5106     // Every element in source state is duplicated to all elements in the corresponding vector.
5107     __ mv(tmp_addr, state);
5108     for (int i = 0; i < states_len; i += 1) {
5109       __ vlse32_v(work_vrs[i], tmp_addr, zr);
5110       __ addi(tmp_addr, tmp_addr, step);
5111     }
5112     // Adjust counter for every individual block.
5113     __ vid_v(counter_vr);
5114     __ vadd_vv(work_vrs[12], work_vrs[12], counter_vr);
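         // i.e. lane i of work_vrs[12] becomes counter + i, so the i'th block of the
         // generated key stream uses its own block counter.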
5115 
5116     // Perform 10 iterations of the 8 quarter round set
5117     {
5118       const Register loop = t2; // share t2 with other non-overlapping usages.
5119       __ mv(loop, 10);
5120       __ BIND(L_Rounds);
5121 
5122       chacha20_quarter_round(work_vrs[0], work_vrs[4], work_vrs[8],  work_vrs[12], tmp_vr);
5123       chacha20_quarter_round(work_vrs[1], work_vrs[5], work_vrs[9],  work_vrs[13], tmp_vr);
5124       chacha20_quarter_round(work_vrs[2], work_vrs[6], work_vrs[10], work_vrs[14], tmp_vr);
5125       chacha20_quarter_round(work_vrs[3], work_vrs[7], work_vrs[11], work_vrs[15], tmp_vr);
5126 
5127       chacha20_quarter_round(work_vrs[0], work_vrs[5], work_vrs[10], work_vrs[15], tmp_vr);
5128       chacha20_quarter_round(work_vrs[1], work_vrs[6], work_vrs[11], work_vrs[12], tmp_vr);
5129       chacha20_quarter_round(work_vrs[2], work_vrs[7], work_vrs[8],  work_vrs[13], tmp_vr);
5130       chacha20_quarter_round(work_vrs[3], work_vrs[4], work_vrs[9],  work_vrs[14], tmp_vr);
5131 
5132       __ subi(loop, loop, 1);
5133       __ bnez(loop, L_Rounds);
5134     }
5135 
5136     // Add the original state back into the final working state.
5137     // We do this by first duplicating every element in source state array to the corresponding
5138     // vector, then adding it to the post-loop working state.
5139     __ mv(tmp_addr, state);
5140     for (int i = 0; i < states_len; i += 1) {
5141       __ vlse32_v(tmp_vr, tmp_addr, zr);
5142       __ addi(tmp_addr, tmp_addr, step);
5143       __ vadd_vv(work_vrs[i], work_vrs[i], tmp_vr);
5144     }
5145     // Add the counter overlay onto work_vrs[12] at the end.
5146     __ vadd_vv(work_vrs[12], work_vrs[12], counter_vr);
5147 
5148     // Store result to key stream.
5149     {
5150       const Register stride = t2; // share t2 with other non-overlapping usages.
5151       // Every block occupies 64 bytes, so we use 64 as stride of the vector store.
5152       __ mv(stride, 64);
5153       for (int i = 0; i < states_len; i += 1) {
5154         __ vsse32_v(work_vrs[i], key_stream, stride);
5155         __ addi(key_stream, key_stream, step);
5156       }
5157     }
5158 
5159     // Return the length of the generated key_stream in bytes (one 64-byte block per vector lane, i.e. length * 64)
5160     __ slli(c_rarg0, length, 6);
5161 
5162     __ leave();
5163     __ ret();
5164 
5165     return (address) start;
5166   }
5167 
5168 
5169   // ------------------------ SHA-1 intrinsic ------------------------
5170 
5171   // K't =
5172   //    5a827999, 0  <= t <= 19
5173   //    6ed9eba1, 20 <= t <= 39
5174   //    8f1bbcdc, 40 <= t <= 59
5175   //    ca62c1d6, 60 <= t <= 79
5176   void sha1_prepare_k(Register cur_k, int round) {
5177     assert(round >= 0 && round < 80, "must be");
5178 
5179     static const int64_t ks[] = {0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6};
5180     if ((round % 20) == 0) {
5181       __ mv(cur_k, ks[round/20]);
5182     }
5183   }
5184 
5185   // W't =
5186   //    M't,                                      0 <=  t <= 15
5187   //    ROTL'1(W't-3 ^ W't-8 ^ W't-14 ^ W't-16),  16 <= t <= 79
5188   void sha1_prepare_w(Register cur_w, Register ws[], Register buf, int round) {
5189     assert(round >= 0 && round < 80, "must be");
5190 
5191     if (round < 16) {
5192       // in the first 16 rounds, in ws[], every register contains 2 W't, e.g.
5193       //   in ws[0], high part contains W't-0, low part contains W't-1,
5194       //   in ws[1], high part contains W't-2, low part contains W't-3,
5195       //   ...
5196       //   in ws[7], high part contains W't-14, low part contains W't-15.
5197 
5198       if ((round % 2) == 0) {
5199         __ ld(ws[round/2], Address(buf, (round/2) * 8));
5200         // reverse bytes, as SHA-1 is defined in big-endian.
5201         __ revb(ws[round/2], ws[round/2]);
5202         __ srli(cur_w, ws[round/2], 32);
5203       } else {
5204         __ mv(cur_w, ws[round/2]);
5205       }
5206 
5207       return;
5208     }
5209 
5210     if ((round % 2) == 0) {
5211       int idx = 16;
5212       // W't = ROTL'1(W't-3 ^ W't-8 ^ W't-14 ^ W't-16),  16 <= t <= 79
5213       __ srli(t1, ws[(idx-8)/2], 32);
5214       __ xorr(t0, ws[(idx-3)/2], t1);
5215 
5216       __ srli(t1, ws[(idx-14)/2], 32);
5217       __ srli(cur_w, ws[(idx-16)/2], 32);
5218       __ xorr(cur_w, cur_w, t1);
5219 
5220       __ xorr(cur_w, cur_w, t0);
5221       __ rolw(cur_w, cur_w, 1, t0);
5222 
5223       // copy the cur_w value to ws[8].
5224       // now, valid w't values are at:
5225       //  w0:       ws[0]'s lower 32 bits
5226       //  w1 ~ w14: ws[1] ~ ws[7]
5227       //  w15:      ws[8]'s higher 32 bits
5228       __ slli(ws[idx/2], cur_w, 32);
5229 
5230       return;
5231     }
5232 
5233     int idx = 17;
5234     // W't = ROTL'1(W't-3 ^ W't-8 ^ W't-14 ^ W't-16),  16 <= t <= 79
5235     __ srli(t1, ws[(idx-3)/2], 32);
5236     __ xorr(t0, t1, ws[(idx-8)/2]);
5237 
5238     __ xorr(cur_w, ws[(idx-16)/2], ws[(idx-14)/2]);
5239 
5240     __ xorr(cur_w, cur_w, t0);
5241     __ rolw(cur_w, cur_w, 1, t0);
5242 
5243     // copy the cur_w value to ws[8]
5244     __ zext(cur_w, cur_w, 32);
5245     __ orr(ws[idx/2], ws[idx/2], cur_w);
5246 
5247     // shift the w't registers, so they start from ws[0] again.
5248     // now, valid w't values are at:
5249     //  w0 ~ w15: ws[0] ~ ws[7]
5250     Register ws_0 = ws[0];
5251     for (int i = 0; i < 16/2; i++) {
5252       ws[i] = ws[i+1];
5253     }
5254     ws[8] = ws_0;
5255   }
5256 
5257   // f't(x, y, z) =
5258   //    Ch(x, y, z)     = (x & y) ^ (~x & z)            , 0  <= t <= 19
5259   //    Parity(x, y, z) = x ^ y ^ z                     , 20 <= t <= 39
5260   //    Maj(x, y, z)    = (x & y) ^ (x & z) ^ (y & z)   , 40 <= t <= 59
5261   //    Parity(x, y, z) = x ^ y ^ z                     , 60 <= t <= 79
5262   void sha1_f(Register dst, Register x, Register y, Register z, int round) {
5263     assert(round >= 0 && round < 80, "must be");
5264     assert_different_registers(dst, x, y, z, t0, t1);
5265 
5266     if (round < 20) {
5267       // (x & y) ^ (~x & z)
5268       __ andr(t0, x, y);
5269       __ andn(dst, z, x);
5270       __ xorr(dst, dst, t0);
5271     } else if (round >= 40 && round < 60) {
5272       // (x & y) ^ (x & z) ^ (y & z)
5273       __ andr(t0, x, y);
5274       __ andr(t1, x, z);
5275       __ andr(dst, y, z);
5276       __ xorr(dst, dst, t0);
5277       __ xorr(dst, dst, t1);
5278     } else {
5279       // x ^ y ^ z
5280       __ xorr(dst, x, y);
5281       __ xorr(dst, dst, z);
5282     }
5283   }
5284 
5285   // T = ROTL'5(a) + f't(b, c, d) + e + K't + W't
5286   // e = d
5287   // d = c
5288   // c = ROTL'30(b)
5289   // b = a
5290   // a = T
5291   void sha1_process_round(Register a, Register b, Register c, Register d, Register e,
5292                           Register cur_k, Register cur_w, Register tmp, int round) {
5293     assert(round >= 0 && round < 80, "must be");
5294     assert_different_registers(a, b, c, d, e, cur_w, cur_k, tmp, t0);
5295 
5296     // T = ROTL'5(a) + f't(b, c, d) + e + K't + W't
5297 
5298     // cur_w will be recalculated at the beginning of each round,
5299     // so, we can reuse it as a temp register here.
5300     Register tmp2 = cur_w;
5301 
5302     // reuse e as a temporary register, as we will mv new value into it later
5303     Register tmp3 = e;
5304     __ add(tmp2, cur_k, tmp2);
5305     __ add(tmp3, tmp3, tmp2);
5306     __ rolw(tmp2, a, 5, t0);
5307 
5308     sha1_f(tmp, b, c, d, round);
5309 
5310     __ add(tmp2, tmp2, tmp);
5311     __ add(tmp2, tmp2, tmp3);
5312 
5313     // e = d
5314     // d = c
5315     // c = ROTL'30(b)
5316     // b = a
5317     // a = T
5318     __ mv(e, d);
5319     __ mv(d, c);
5320 
5321     __ rolw(c, b, 30);
5322     __ mv(b, a);
5323     __ mv(a, tmp2);
5324   }
5325 
5326   // H(i)0 = a + H(i-1)0
5327   // H(i)1 = b + H(i-1)1
5328   // H(i)2 = c + H(i-1)2
5329   // H(i)3 = d + H(i-1)3
5330   // H(i)4 = e + H(i-1)4
5331   void sha1_calculate_im_hash(Register a, Register b, Register c, Register d, Register e,
5332                               Register prev_ab, Register prev_cd, Register prev_e) {
5333     assert_different_registers(a, b, c, d, e, prev_ab, prev_cd, prev_e);
5334 
5335     __ add(a, a, prev_ab);
5336     __ srli(prev_ab, prev_ab, 32);
5337     __ add(b, b, prev_ab);
5338 
5339     __ add(c, c, prev_cd);
5340     __ srli(prev_cd, prev_cd, 32);
5341     __ add(d, d, prev_cd);
5342 
5343     __ add(e, e, prev_e);
5344   }
5345 
5346   void sha1_preserve_prev_abcde(Register a, Register b, Register c, Register d, Register e,
5347                                 Register prev_ab, Register prev_cd, Register prev_e) {
5348     assert_different_registers(a, b, c, d, e, prev_ab, prev_cd, prev_e, t0);
5349 
5350     __ slli(t0, b, 32);
5351     __ zext(prev_ab, a, 32);
5352     __ orr(prev_ab, prev_ab, t0);
5353 
5354     __ slli(t0, d, 32);
5355     __ zext(prev_cd, c, 32);
5356     __ orr(prev_cd, prev_cd, t0);
5357 
5358     __ mv(prev_e, e);
5359   }
5360 
5361   // Intrinsic for:
5362   //   void sun.security.provider.SHA.implCompress0(byte[] buf, int ofs)
5363   //   void sun.security.provider.DigestBase.implCompressMultiBlock0(byte[] b, int ofs, int limit)
5364   //
5365   // Arguments:
5366   //
5367   // Inputs:
5368   //   c_rarg0: byte[]  src array + offset
5369   //   c_rarg1: int[]   SHA.state
5370   //   - - - - - - below are only for implCompressMultiBlock0 - - - - - -
5371   //   c_rarg2: int     offset
5372   //   c_rarg3: int     limit
5373   //
5374   // Outputs:
5375   //   - - - - - - below are only for implCompressMultiBlock0 - - - - - -
5376   //   c_rarg0: int offset, when (multi_block == true)
5377   //
5378   address generate_sha1_implCompress(StubGenStubId stub_id) {
5379       bool multi_block;
5380       switch (stub_id) {
5381       case sha1_implCompress_id:
5382         multi_block = false;
5383         break;
5384       case sha1_implCompressMB_id:
5385         multi_block = true;
5386         break;
5387       default:
5388         ShouldNotReachHere();
5389       };
5390     __ align(CodeEntryAlignment);
5391     StubCodeMark mark(this, stub_id);
5392 
5393     address start = __ pc();
5394     __ enter();
5395 
5396     RegSet saved_regs = RegSet::range(x18, x27);
5397     if (multi_block) {
5398       // use x9 as src below.
5399       saved_regs += RegSet::of(x9);
5400     }
5401     __ push_reg(saved_regs, sp);
5402 
5403     // c_rarg0 - c_rarg3: x10 - x13
5404     Register buf    = c_rarg0;
5405     Register state  = c_rarg1;
5406     Register offset = c_rarg2;
5407     Register limit  = c_rarg3;
5408     // use src to contain the original start point of the array.
5409     Register src    = x9;
5410 
5411     if (multi_block) {
5412       __ sub(limit, limit, offset);
5413       __ add(limit, limit, buf);
5414       __ sub(src, buf, offset);
5415     }
5416 
5417     // [args-reg]:  x14 - x17
5418     // [temp-reg]:  x28 - x31
5419     // [saved-reg]: x18 - x27
5420 
5421     // h0/1/2/3/4
5422     const Register a = x14, b = x15, c = x16, d = x17, e = x28;
5423     // w0, w1, ... w15
5424     // put two adjacent w's in one register:
5425     //    one in the high word, the other in the low word
5426     // at different rounds (even or odd), the w't values reside in different slots of ws[].
5427     // w0 ~ w15, either reside in
5428     //    ws[0] ~ ws[7], where
5429     //      w0 at higher 32 bits of ws[0],
5430     //      w1 at lower 32 bits of ws[0],
5431     //      ...
5432     //      w14 at higher 32 bits of ws[7],
5433     //      w15 at lower 32 bits of ws[7].
5434     // or, reside in
5435     //    w0:       ws[0]'s lower 32 bits
5436     //    w1 ~ w14: ws[1] ~ ws[7]
5437     //    w15:      ws[8]'s higher 32 bits
5438     Register ws[9] = {x29, x30, x31, x18,
5439                       x19, x20, x21, x22,
5440                       x23}; // auxiliary register for calculating w's value
5441     // current k't's value
5442     const Register cur_k = x24;
5443     // current w't's value
5444     const Register cur_w = x25;
5445     // values of a, b, c, d, e in the previous round
5446     const Register prev_ab = x26, prev_cd = x27;
5447     const Register prev_e = offset; // reuse offset/c_rarg2
5448 
5449     // load 5 words state into a, b, c, d, e.
5450     //
5451     // To minimize the number of memory operations, we apply the following
5452     // optimization: read the 4-byte state values (a/b/c/d) in pairs,
5453     // with a single ld each, and split each pair into 2 registers.
5454     //
5455     // Since the core SHA-1 algorithm works on 32-bit words, the code below
5456     // does not care about the contents of the upper 32 bits of a/b/c/d/e.
5457     // Based on this observation, we can apply a further optimization,
5458     // which is to simply ignore the upper 32 bits of a/c/e, rather than
5459     // setting the upper 32 bits of a/c/e to zero explicitly with extra
5460     // instructions.
5461     __ ld(a, Address(state, 0));
5462     __ srli(b, a, 32);
5463     __ ld(c, Address(state, 8));
5464     __ srli(d, c, 32);
5465     __ lw(e, Address(state, 16));
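         // After these loads: a = state[0], b = state[1], c = state[2], d = state[3],
         // e = state[4]; the upper halves of a and c still hold b and d but are ignored.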
5466 
5467     Label L_sha1_loop;
5468     if (multi_block) {
5469       __ BIND(L_sha1_loop);
5470     }
5471 
5472     sha1_preserve_prev_abcde(a, b, c, d, e, prev_ab, prev_cd, prev_e);
5473 
5474     for (int round = 0; round < 80; round++) {
5475       // prepare K't value
5476       sha1_prepare_k(cur_k, round);
5477 
5478       // prepare W't value
5479       sha1_prepare_w(cur_w, ws, buf, round);
5480 
5481       // one round process
5482       sha1_process_round(a, b, c, d, e, cur_k, cur_w, t2, round);
5483     }
5484 
5485     // compute the intermediate hash value
5486     sha1_calculate_im_hash(a, b, c, d, e, prev_ab, prev_cd, prev_e);
5487 
5488     if (multi_block) {
5489       int64_t block_bytes = 16 * 4;
5490       __ addi(buf, buf, block_bytes);
5491 
5492       __ bge(limit, buf, L_sha1_loop, true);
5493     }
5494 
5495     // store back the state.
5496     __ zext(a, a, 32);
5497     __ slli(b, b, 32);
5498     __ orr(a, a, b);
5499     __ sd(a, Address(state, 0));
5500     __ zext(c, c, 32);
5501     __ slli(d, d, 32);
5502     __ orr(c, c, d);
5503     __ sd(c, Address(state, 8));
5504     __ sw(e, Address(state, 16));
5505 
5506     // return offset
5507     if (multi_block) {
5508       __ sub(c_rarg0, buf, src);
5509     }
5510 
5511     __ pop_reg(saved_regs, sp);
5512 
5513     __ leave();
5514     __ ret();
5515 
5516     return (address) start;
5517   }
5518 
5519   /**
5520    * vector registers:
5521    *   input VectorRegister's:  inputV1-V3, for m2 they could be v2, v4, v6, for m1 they could be v1, v2, v3
5522    *   index VectorRegister's:  idxV1-V4, for m2 they could be v8, v10, v12, v14, for m1 they could be v4, v5, v6, v7
5523    *   output VectorRegister's: outputV1-V4, for m2 they could be v16, v18, v20, v22, for m1 they could be v8, v9, v10, v11
5524    *
5525    * NOTE: each field will occupy a vector register group
5526    */
5527   void base64_vector_encode_round(Register src, Register dst, Register codec,
5528                     Register size, Register stepSrc, Register stepDst,
5529                     VectorRegister inputV1, VectorRegister inputV2, VectorRegister inputV3,
5530                     VectorRegister idxV1, VectorRegister idxV2, VectorRegister idxV3, VectorRegister idxV4,
5531                     VectorRegister outputV1, VectorRegister outputV2, VectorRegister outputV3, VectorRegister outputV4,
5532                     Assembler::LMUL lmul) {
5533     // set vector register type/len
5534     __ vsetvli(x0, size, Assembler::e8, lmul);
5535 
5536     // segmented load src into v registers: mem(src) => vr(3)
5537     __ vlseg3e8_v(inputV1, src);
5538 
5539     // src = src + register_group_len_bytes * 3
5540     __ add(src, src, stepSrc);
5541 
5542     // encoding
5543     //   1. compute index into lookup table: vr(3) => vr(4)
5544     __ vsrl_vi(idxV1, inputV1, 2);
5545 
5546     __ vsrl_vi(idxV2, inputV2, 2);
5547     __ vsll_vi(inputV1, inputV1, 6);
5548     __ vor_vv(idxV2, idxV2, inputV1);
5549     __ vsrl_vi(idxV2, idxV2, 2);
5550 
5551     __ vsrl_vi(idxV3, inputV3, 4);
5552     __ vsll_vi(inputV2, inputV2, 4);
5553     __ vor_vv(idxV3, inputV2, idxV3);
5554     __ vsrl_vi(idxV3, idxV3, 2);
5555 
5556     __ vsll_vi(idxV4, inputV3, 2);
5557     __ vsrl_vi(idxV4, idxV4, 2);
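         // Scalar sketch of the lane-wise computation above (illustrative only), where
         // b0/b1/b2 are the three input bytes of one group:
         //   idx1 = b0 >> 2;
         //   idx2 = ((b0 & 0x3) << 4) | (b1 >> 4);
         //   idx3 = ((b1 & 0xf) << 2) | (b2 >> 6);
         //   idx4 = b2 & 0x3f;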
5558 
5559     //   2. indexed load: vr(4) => vr(4)
5560     __ vluxei8_v(outputV1, codec, idxV1);
5561     __ vluxei8_v(outputV2, codec, idxV2);
5562     __ vluxei8_v(outputV3, codec, idxV3);
5563     __ vluxei8_v(outputV4, codec, idxV4);
5564 
5565     // segmented store encoded data in v registers back to dst: vr(4) => mem(dst)
5566     __ vsseg4e8_v(outputV1, dst);
5567 
5568     // dst = dst + register_group_len_bytes * 4
5569     __ add(dst, dst, stepDst);
5570   }
5571 
5572   /**
5573    *  void j.u.Base64.Encoder.encodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL)
5574    *
5575    *  Input arguments:
5576    *  c_rarg0   - src, source array
5577    *  c_rarg1   - sp, src start offset
5578    *  c_rarg2   - sl, src end offset
5579    *  c_rarg3   - dst, dest array
5580    *  c_rarg4   - dp, dst start offset
5581    *  c_rarg5   - isURL, Base64 or URL character set
5582    */
5583   address generate_base64_encodeBlock() {
5584     alignas(64) static const char toBase64[64] = {
5585       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
5586       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
5587       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
5588       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
5589       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
5590     };
5591 
5592     alignas(64) static const char toBase64URL[64] = {
5593       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
5594       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
5595       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
5596       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
5597       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
5598     };
5599 
5600     __ align(CodeEntryAlignment);
5601     StubGenStubId stub_id = StubGenStubId::base64_encodeBlock_id;
5602     StubCodeMark mark(this, stub_id);
5603     address start = __ pc();
5604     __ enter();
5605 
5606     Register src    = c_rarg0;
5607     Register soff   = c_rarg1;
5608     Register send   = c_rarg2;
5609     Register dst    = c_rarg3;
5610     Register doff   = c_rarg4;
5611     Register isURL  = c_rarg5;
5612 
5613     Register codec  = c_rarg6;
5614     Register length = c_rarg7; // total length of src data in bytes
5615 
5616     Label ProcessData, Exit;
5617 
5618     // length should be a multiple of 3
5619     __ sub(length, send, soff);
5620     // real src/dst to process data
5621     __ add(src, src, soff);
5622     __ add(dst, dst, doff);
5623 
5624     // load the codec base address
5625     __ la(codec, ExternalAddress((address) toBase64));
5626     __ beqz(isURL, ProcessData);
5627     __ la(codec, ExternalAddress((address) toBase64URL));
5628     __ BIND(ProcessData);
5629 
5630     // vector version
5631     if (UseRVV) {
5632       Label ProcessM2, ProcessM1, ProcessScalar;
5633 
5634       Register size      = soff;
5635       Register stepSrcM1 = send;
5636       Register stepSrcM2 = doff;
5637       Register stepDst   = isURL;
5638 
5639       __ mv(size, MaxVectorSize * 2);
5640       __ mv(stepSrcM1, MaxVectorSize * 3);
5641       __ slli(stepSrcM2, stepSrcM1, 1);
5642       __ mv(stepDst, MaxVectorSize * 2 * 4);
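           // Assuming MaxVectorSize is the vector register length in bytes, an m2 register
           // group holds 2 * MaxVectorSize e8 elements; each round consumes 3 source groups
           // and produces 4 destination groups, hence the 3:4 ratio of stepSrc to stepDst.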
5643 
5644       __ blt(length, stepSrcM2, ProcessM1);
5645 
5646       __ BIND(ProcessM2);
5647       base64_vector_encode_round(src, dst, codec,
5648                     size, stepSrcM2, stepDst,
5649                     v2, v4, v6,         // inputs
5650                     v8, v10, v12, v14,  // indexes
5651                     v16, v18, v20, v22, // outputs
5652                     Assembler::m2);
5653 
5654       __ sub(length, length, stepSrcM2);
5655       __ bge(length, stepSrcM2, ProcessM2);
5656 
5657       __ BIND(ProcessM1);
5658       __ blt(length, stepSrcM1, ProcessScalar);
5659 
5660       __ srli(size, size, 1);
5661       __ srli(stepDst, stepDst, 1);
5662       base64_vector_encode_round(src, dst, codec,
5663                     size, stepSrcM1, stepDst,
5664                     v1, v2, v3,         // inputs
5665                     v4, v5, v6, v7,     // indexes
5666                     v8, v9, v10, v11,   // outputs
5667                     Assembler::m1);
5668       __ sub(length, length, stepSrcM1);
5669 
5670       __ BIND(ProcessScalar);
5671     }
5672 
5673     // scalar version
5674     {
5675       Register byte1 = soff, byte0 = send, byte2 = doff;
5676       Register combined24Bits = isURL;
5677 
5678       __ beqz(length, Exit);
5679 
5680       Label ScalarLoop;
5681       __ BIND(ScalarLoop);
5682       {
5683         // plain:   [byte0[7:0] : byte1[7:0] : byte2[7:0]] =>
5684         // encoded: [byte0[7:2] : byte0[1:0]+byte1[7:4] : byte1[3:0]+byte2[7:6] : byte2[5:0]]
5685 
5686         // load 3 bytes src data
5687         __ lbu(byte0, Address(src, 0));
5688         __ lbu(byte1, Address(src, 1));
5689         __ lbu(byte2, Address(src, 2));
5690         __ addi(src, src, 3);
5691 
5692         // construct 24 bits from 3 bytes
5693         __ slliw(byte0, byte0, 16);
5694         __ slliw(byte1, byte1, 8);
5695         __ orr(combined24Bits, byte0, byte1);
5696         __ orr(combined24Bits, combined24Bits, byte2);
5697 
5698         // get codec index and encode (i.e., load from codec by index)
5699         __ slliw(byte0, combined24Bits, 8);
5700         __ srliw(byte0, byte0, 26);
5701         __ add(byte0, codec, byte0);
5702         __ lbu(byte0, byte0);
5703 
5704         __ slliw(byte1, combined24Bits, 14);
5705         __ srliw(byte1, byte1, 26);
5706         __ add(byte1, codec, byte1);
5707         __ lbu(byte1, byte1);
5708 
5709         __ slliw(byte2, combined24Bits, 20);
5710         __ srliw(byte2, byte2, 26);
5711         __ add(byte2, codec, byte2);
5712         __ lbu(byte2, byte2);
5713 
5714         __ andi(combined24Bits, combined24Bits, 0x3f);
5715         __ add(combined24Bits, codec, combined24Bits);
5716         __ lbu(combined24Bits, combined24Bits);
5717 
5718         // store 4 bytes encoded data
5719         __ sb(byte0, Address(dst, 0));
5720         __ sb(byte1, Address(dst, 1));
5721         __ sb(byte2, Address(dst, 2));
5722         __ sb(combined24Bits, Address(dst, 3));
5723 
5724         __ subi(length, length, 3);
5725         __ addi(dst, dst, 4);
5726         // loop back
5727         __ bnez(length, ScalarLoop);
5728       }
5729     }
5730 
5731     __ BIND(Exit);
5732 
5733     __ leave();
5734     __ ret();
5735 
5736     return (address) start;
5737   }
5738 
5739   /**
5740    * vector registers:
5741    * input VectorRegister's:  inputV1-V4, e.g. v2, v4, v6, v8
5742    * index VectorRegister's:  idxV1-V4, e.g. v10, v12, v14, v16
5743    * output VectorRegister's: outputV1-V3, e.g. v18, v20, v22 (the concrete registers depend on the LMUL in use; see the call sites)
5744    *
5745    * NOTE: each field will occupy a single vector register group
5746    */
5747   void base64_vector_decode_round(Register src, Register dst, Register codec,
5748                     Register size, Register stepSrc, Register stepDst, Register failedIdx,
5749                     VectorRegister inputV1, VectorRegister inputV2, VectorRegister inputV3, VectorRegister inputV4,
5750                     VectorRegister idxV1, VectorRegister idxV2, VectorRegister idxV3, VectorRegister idxV4,
5751                     VectorRegister outputV1, VectorRegister outputV2, VectorRegister outputV3,
5752                     Assembler::LMUL lmul) {
5753     // set vector register type/len
5754     __ vsetvli(x0, size, Assembler::e8, lmul, Assembler::ma, Assembler::ta);
5755 
5756     // segmented load src into v registers: mem(src) => vr(4)
5757     __ vlseg4e8_v(inputV1, src);
5758 
5759     // src = src + register_group_len_bytes * 4
5760     __ add(src, src, stepSrc);
5761 
5762     // decoding
5763     //   1. indexed load: vr(4) => vr(4)
5764     __ vluxei8_v(idxV1, codec, inputV1);
5765     __ vluxei8_v(idxV2, codec, inputV2);
5766     __ vluxei8_v(idxV3, codec, inputV3);
5767     __ vluxei8_v(idxV4, codec, inputV4);
5768 
5769     //   2. check for invalid input data
5770     __ vor_vv(outputV1, idxV1, idxV2);
5771     __ vor_vv(outputV2, idxV3, idxV4);
5772     __ vor_vv(outputV1, outputV1, outputV2);
5773     __ vmseq_vi(v0, outputV1, -1);
5774     __ vfirst_m(failedIdx, v0);
5775     Label NoFailure, FailureAtIdx0;
5776     // vfirst_m returns -1 when no mask bit is set, so failedIdx < 0 means no invalid byte was found
5777     __ bltz(failedIdx, NoFailure);
5778     // when the first element (at index 0) is invalid, there is no data left to process
5779     __ beqz(failedIdx, FailureAtIdx0);
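         // An invalid byte was found at element index failedIdx (> 0): shrink vl so only
         // the valid prefix is decoded and stored, and set stepDst = failedIdx * 3, since
         // each element position corresponds to 4 input characters producing 3 output bytes.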
5780     __ vsetvli(x0, failedIdx, Assembler::e8, lmul, Assembler::mu, Assembler::tu);
5781     __ slli(stepDst, failedIdx, 1);
5782     __ add(stepDst, failedIdx, stepDst);
5783     __ BIND(NoFailure);
5784 
5785     //   3. compute the decoded data: vr(4) => vr(3)
5786     __ vsll_vi(idxV1, idxV1, 2);
5787     __ vsrl_vi(outputV1, idxV2, 4);
5788     __ vor_vv(outputV1, outputV1, idxV1);
5789 
5790     __ vsll_vi(idxV2, idxV2, 4);
5791     __ vsrl_vi(outputV2, idxV3, 2);
5792     __ vor_vv(outputV2, outputV2, idxV2);
5793 
5794     __ vsll_vi(idxV3, idxV3, 6);
5795     __ vor_vv(outputV3, idxV4, idxV3);
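         // Scalar sketch of the lane-wise recombination above (illustrative only), where
         // s0..s3 are the four decoded 6-bit values of one group:
         //   out0 = (s0 << 2)         | (s1 >> 4);
         //   out1 = ((s1 & 0xf) << 4) | (s2 >> 2);
         //   out2 = ((s2 & 0x3) << 6) | s3;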
5796 
5797     // segmented store decoded data in v registers back to dst: vr(3) => mem(dst)
5798     __ vsseg3e8_v(outputV1, dst);
5799 
5800     // dst = dst + register_group_len_bytes * 3
5801     __ add(dst, dst, stepDst);
5802     __ BIND(FailureAtIdx0);
5803   }
5804 
5805   /**
5806    * int j.u.Base64.Decoder.decodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL, boolean isMIME)
5807    *
5808    *  Input arguments:
5809    *  c_rarg0   - src, source array
5810    *  c_rarg1   - sp, src start offset
5811    *  c_rarg2   - sl, src end offset
5812    *  c_rarg3   - dst, dest array
5813    *  c_rarg4   - dp, dst start offset
5814    *  c_rarg5   - isURL, Base64 or URL character set
5815    *  c_rarg6   - isMIME, Decoding MIME block
5816    */
5817   address generate_base64_decodeBlock() {
5818 
5819     static const uint8_t fromBase64[256] = {
5820         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5821         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5822         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
5823         52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
5824         255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
5825         15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u, 255u,
5826         255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
5827         41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
5828         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5829         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5830         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5831         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5832         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5833         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5834         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5835         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5836     };
5837 
5838     static const uint8_t fromBase64URL[256] = {
5839         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5840         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5841         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
5842         52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
5843         255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
5844         15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,  63u,
5845         255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
5846         41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
5847         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5848         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5849         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5850         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5851         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5852         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5853         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5854         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5855     };
5856 
5857     __ align(CodeEntryAlignment);
5858     StubGenStubId stub_id = StubGenStubId::base64_decodeBlock_id;
5859     StubCodeMark mark(this, stub_id);
5860     address start = __ pc();
5861     __ enter();
5862 
5863     Register src    = c_rarg0;
5864     Register soff   = c_rarg1;
5865     Register send   = c_rarg2;
5866     Register dst    = c_rarg3;
5867     Register doff   = c_rarg4;
5868     Register isURL  = c_rarg5;
5869     Register isMIME = c_rarg6;
5870 
5871     Register codec     = c_rarg7;
5872     Register dstBackup = t6;
5873     Register length    = t3;     // total length of src data in bytes
5874 
5875     Label ProcessData, Exit;
5876     Label ProcessScalar, ScalarLoop;
5877 
    // The length passed in (send - soff) is guaranteed to be > 4.
    // This intrinsic only processes data whose length is a multiple of 4;
    // the Java level does not guarantee that, so round the length down explicitly.
5881     __ sub(length, send, soff);
5882     __ andi(length, length, -4);
5883     // real src/dst to process data
5884     __ add(src, src, soff);
5885     __ add(dst, dst, doff);
5886     // backup of dst, used to calculate the return value at exit
5887     __ mv(dstBackup, dst);
5888 
5889     // load the codec base address
5890     __ la(codec, ExternalAddress((address) fromBase64));
5891     __ beqz(isURL, ProcessData);
5892     __ la(codec, ExternalAddress((address) fromBase64URL));
5893     __ BIND(ProcessData);
5894 
5895     // vector version
5896     if (UseRVV) {
      // In the MIME case there is a default line-length limit of 76, which can be
      // smaller than (send - soff), so for MIME we go straight to the scalar code path.
5900       __ bnez(isMIME, ScalarLoop);
5901 
5902       Label ProcessM1, ProcessM2;
5903 
5904       Register failedIdx = soff;
5905       Register stepSrcM1 = send;
5906       Register stepSrcM2 = doff;
5907       Register stepDst   = isURL;
5908       Register size      = t4;
5909 
5910       __ mv(size, MaxVectorSize * 2);
5911       __ mv(stepSrcM1, MaxVectorSize * 4);
5912       __ slli(stepSrcM2, stepSrcM1, 1);
5913       __ mv(stepDst, MaxVectorSize * 2 * 3);
5914 
5915       __ blt(length, stepSrcM2, ProcessM1);
5916 
5917 
5918       // Assembler::m2
5919       __ BIND(ProcessM2);
5920       base64_vector_decode_round(src, dst, codec,
5921                     size, stepSrcM2, stepDst, failedIdx,
5922                     v2, v4, v6, v8,      // inputs
5923                     v10, v12, v14, v16,  // indexes
5924                     v18, v20, v22,       // outputs
5925                     Assembler::m2);
5926       __ sub(length, length, stepSrcM2);
5927 
5928       // error check
      // a negative failedIdx can only be -1, meaning no decode failure was found
5930       __ bgez(failedIdx, Exit);
5931 
5932       __ bge(length, stepSrcM2, ProcessM2);
5933 
5934 
5935       // Assembler::m1
5936       __ BIND(ProcessM1);
5937       __ blt(length, stepSrcM1, ProcessScalar);
5938 
5939       __ srli(size, size, 1);
5940       __ srli(stepDst, stepDst, 1);
5941       base64_vector_decode_round(src, dst, codec,
5942                     size, stepSrcM1, stepDst, failedIdx,
5943                     v1, v2, v3, v4,      // inputs
5944                     v5, v6, v7, v8,      // indexes
5945                     v9, v10, v11,        // outputs
5946                     Assembler::m1);
5947       __ sub(length, length, stepSrcM1);
5948 
5949       // error check
      // a negative failedIdx can only be -1, meaning no decode failure was found
5951       __ bgez(failedIdx, Exit);
5952 
5953       __ BIND(ProcessScalar);
5954       __ beqz(length, Exit);
5955     }
5956 
5957     // scalar version
5958     {
5959       Register byte0 = soff, byte1 = send, byte2 = doff, byte3 = isURL;
5960       Register combined32Bits = t4;
5961 
      // encoded:   [byte0[5:0] : byte1[5:0] : byte2[5:0] : byte3[5:0]] =>
5963       // plain:     [byte0[5:0]+byte1[5:4] : byte1[3:0]+byte2[5:2] : byte2[1:0]+byte3[5:0]]
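      // A C-level sketch of one scalar iteration (illustrative only; it mirrors, but is not,
      // the generated code; `codec` is the signed decode table loaded above):
      //   int b0 = (int8_t) codec[src[0]], b1 = (int8_t) codec[src[1]],
      //       b2 = (int8_t) codec[src[2]], b3 = (int8_t) codec[src[3]];
      //   int combined = (b0 << 18) | (b1 << 12) | (b2 << 6) | b3; // < 0 iff any lookup hit 255 (invalid)
      //   dst[0] = (byte)(combined >> 16);
      //   dst[1] = (byte)(combined >> 8);
      //   dst[2] = (byte) combined;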
5964       __ BIND(ScalarLoop);
5965 
5966       // load 4 bytes encoded src data
5967       __ lbu(byte0, Address(src, 0));
5968       __ lbu(byte1, Address(src, 1));
5969       __ lbu(byte2, Address(src, 2));
5970       __ lbu(byte3, Address(src, 3));
5971       __ addi(src, src, 4);
5972 
      // get codec index and decode (i.e. load from codec by index)
5974       __ add(byte0, codec, byte0);
5975       __ add(byte1, codec, byte1);
5976       __ lb(byte0, Address(byte0, 0));
5977       __ lb(byte1, Address(byte1, 0));
5978       __ add(byte2, codec, byte2);
5979       __ add(byte3, codec, byte3);
5980       __ lb(byte2, Address(byte2, 0));
5981       __ lb(byte3, Address(byte3, 0));
5982       __ slliw(byte0, byte0, 18);
5983       __ slliw(byte1, byte1, 12);
5984       __ orr(byte0, byte0, byte1);
5985       __ orr(byte0, byte0, byte3);
5986       __ slliw(byte2, byte2, 6);
      // For performance, `combined32Bits` is constructed to serve two purposes at once:
      //  1. the error check below
      //  2. the decode below
5990       __ orr(combined32Bits, byte0, byte2);
5991 
5992       // error check
5993       __ bltz(combined32Bits, Exit);
5994 
5995       // store 3 bytes decoded data
5996       __ sraiw(byte0, combined32Bits, 16);
5997       __ sraiw(byte1, combined32Bits, 8);
5998       __ sb(byte0, Address(dst, 0));
5999       __ sb(byte1, Address(dst, 1));
6000       __ sb(combined32Bits, Address(dst, 2));
6001 
6002       __ subi(length, length, 4);
6003       __ addi(dst, dst, 3);
6004       // loop back
6005       __ bnez(length, ScalarLoop);
6006     }
6007 
6008     __ BIND(Exit);
6009     __ sub(c_rarg0, dst, dstBackup);
6010 
6011     __ leave();
6012     __ ret();
6013 
6014     return (address) start;
6015   }
6016 
6017   void adler32_process_bytes(Register buff, Register s1, Register s2, VectorRegister vtable,
6018     VectorRegister vzero, VectorRegister vbytes, VectorRegister vs1acc, VectorRegister vs2acc,
6019     Register temp0, Register temp1, Register temp2,  Register temp3,
6020     VectorRegister vtemp1, VectorRegister vtemp2, int step, Assembler::LMUL lmul) {
6021 
6022     assert((lmul == Assembler::m4 && step == 64) ||
6023            (lmul == Assembler::m2 && step == 32) ||
6024            (lmul == Assembler::m1 && step == 16),
6025            "LMUL should be aligned with step: m4 and 64, m2 and 32 or m1 and 16");
    // This function calculates the Adler32 checksum with a 64-, 32- or 16-byte step, using LMUL=m4, m2 or m1 respectively.
    // The results are in v12, v13, ..., v22, v23. The example below is for the 64-byte step case.
6028     // We use b1, b2, ..., b64 to denote the 64 bytes loaded in each iteration.
6029     // In non-vectorized code, we update s1 and s2 as:
6030     //   s1 <- s1 + b1
6031     //   s2 <- s2 + s1
6032     //   s1 <- s1 + b2
    //   s2 <- s2 + s1
6034     //   ...
6035     //   s1 <- s1 + b64
6036     //   s2 <- s2 + s1
6037     // Putting above assignments together, we have:
6038     //   s1_new = s1 + b1 + b2 + ... + b64
6039     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b64) =
6040     //          = s2 + s1 * 64 + (b1 * 64 + b2 * 63 + ... + b64 * 1) =
6041     //          = s2 + s1 * 64 + (b1, b2, ... b64) dot (64, 63, ... 1)
6042 
6043     __ mv(temp3, step);
6044     // Load data
6045     __ vsetvli(temp0, temp3, Assembler::e8, lmul);
6046     __ vle8_v(vbytes, buff);
6047     __ addi(buff, buff, step);
6048 
    // Upper bound for the reduction sum of s1_new:
    // 0xFF * 64 = 0x3FC0, so:
    // 1. Need to do vector-widening reduction sum
    // 2. It is safe to perform sign-extension during vmv.x.s with 16-bit elements
6053     __ vwredsumu_vs(vs1acc, vbytes, vzero);
6054     // Multiplication for s2_new
6055     __ vwmulu_vv(vs2acc, vtable, vbytes);
6056 
    // s2 = s2 + s1 * step  (s1 shifted left by log2(step))
6058     __ slli(temp1, s1, exact_log2(step));
6059     __ add(s2, s2, temp1);
6060 
6061     // Summing up calculated results for s2_new
6062     if (MaxVectorSize > 16) {
6063       __ vsetvli(temp0, temp3, Assembler::e16, lmul);
6064     } else {
      // For vlen == 16, half of the vector-widening multiplication result lands in the
      // successor of the vs2acc register group, so we double the vector register group
      // width in order to reduction-sum all of it
6068       Assembler::LMUL lmulx2 = (lmul == Assembler::m1) ? Assembler::m2 :
6069                                (lmul == Assembler::m2) ? Assembler::m4 : Assembler::m8;
6070       __ vsetvli(temp0, temp3, Assembler::e16, lmulx2);
6071     }
6072     // Upper bound for reduction sum:
6073     // 0xFF * (64 + 63 + ... + 2 + 1) = 0x817E0 max for whole register group, so:
6074     // 1. Need to do vector-widening reduction sum
    // 2. It is safe to perform sign-extension during vmv.x.s with 32-bit elements
6076     __ vwredsumu_vs(vtemp1, vs2acc, vzero);
6077 
6078     // Extracting results for:
6079     // s1_new
6080     __ vmv_x_s(temp0, vs1acc);
6081     __ add(s1, s1, temp0);
6082     // s2_new
6083     __ vsetvli(temp0, temp3, Assembler::e32, Assembler::m1);
6084     __ vmv_x_s(temp1, vtemp1);
6085     __ add(s2, s2, temp1);
6086   }
6087 
6088   /***
6089    *  int java.util.zip.Adler32.updateBytes(int adler, byte[] b, int off, int len)
6090    *
6091    *  Arguments:
6092    *
6093    *  Inputs:
6094    *   c_rarg0   - int   adler
6095    *   c_rarg1   - byte* buff (b + off)
6096    *   c_rarg2   - int   len
6097    *
6098    *  Output:
6099    *   c_rarg0   - int adler result
6100    */
6101   address generate_updateBytesAdler32() {
6102     __ align(CodeEntryAlignment);
6103     StubGenStubId stub_id = StubGenStubId::updateBytesAdler32_id;
6104     StubCodeMark mark(this, stub_id);
6105     address start = __ pc();
6106 
6107     Label L_nmax, L_nmax_loop, L_nmax_loop_entry, L_by16, L_by16_loop,
6108       L_by16_loop_unroll, L_by1_loop, L_do_mod, L_combine, L_by1;
6109 
6110     // Aliases
6111     Register adler  = c_rarg0;
6112     Register s1     = c_rarg0;
6113     Register s2     = c_rarg3;
6114     Register buff   = c_rarg1;
6115     Register len    = c_rarg2;
6116     Register nmax  = c_rarg4;
6117     Register base  = c_rarg5;
6118     Register count = c_rarg6;
6119     Register temp0 = t3;
6120     Register temp1 = t4;
6121     Register temp2 = t5;
6122     Register temp3 = t6;
6123 
6124     VectorRegister vzero = v31;
6125     VectorRegister vbytes = v8; // group: v8, v9, v10, v11
6126     VectorRegister vs1acc = v12; // group: v12, v13, v14, v15
6127     VectorRegister vs2acc = v16; // group: v16, v17, v18, v19, v20, v21, v22, v23
6128     VectorRegister vtable_64 = v24; // group: v24, v25, v26, v27
6129     VectorRegister vtable_32 = v4; // group: v4, v5
6130     VectorRegister vtable_16 = v30;
6131     VectorRegister vtemp1 = v28;
6132     VectorRegister vtemp2 = v29;
6133 
6134     // Max number of bytes we can process before having to take the mod
6135     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
6136     const uint64_t BASE = 0xfff1;
6137     const uint64_t NMAX = 0x15B0;
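
    // Reference structure of the scalar algorithm being vectorized below (illustrative sketch):
    //   while (len >= NMAX) {
    //     for (int i = 0; i < NMAX; i++) { s1 += *buff++; s2 += s1; }
    //     s1 %= BASE; s2 %= BASE; len -= NMAX;
    //   }
    //   // ... then handle the remaining (< NMAX) bytes and take a final mod BASE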
6138 
    // Loop steps
6140     int step_64 = 64;
6141     int step_32 = 32;
6142     int step_16 = 16;
6143     int step_1  = 1;
6144 
6145     __ enter(); // Required for proper stackwalking of RuntimeStub frame
6146     __ mv(temp1, 64);
6147     __ vsetvli(temp0, temp1, Assembler::e8, Assembler::m4);
6148 
6149     // Generating accumulation coefficients for further calculations
6150     // vtable_64:
6151     __ vid_v(vtemp1);
6152     __ vrsub_vx(vtable_64, vtemp1, temp1);
6153     // vtable_64 group now contains { 0x40, 0x3f, 0x3e, ..., 0x3, 0x2, 0x1 }
6154 
6155     // vtable_32:
6156     __ mv(temp1, 32);
6157     __ vsetvli(temp0, temp1, Assembler::e8, Assembler::m2);
6158     __ vid_v(vtemp1);
6159     __ vrsub_vx(vtable_32, vtemp1, temp1);
6160     // vtable_32 group now contains { 0x20, 0x1f, 0x1e, ..., 0x3, 0x2, 0x1 }
6161 
6162     __ vsetivli(temp0, 16, Assembler::e8, Assembler::m1);
6163     // vtable_16:
6164     __ mv(temp1, 16);
6165     __ vid_v(vtemp1);
6166     __ vrsub_vx(vtable_16, vtemp1, temp1);
6167     // vtable_16 now contains { 0x10, 0xf, 0xe, ..., 0x3, 0x2, 0x1 }
6168 
6169     __ vmv_v_i(vzero, 0);
6170 
6171     __ mv(base, BASE);
6172     __ mv(nmax, NMAX);
6173 
6174     // s1 is initialized to the lower 16 bits of adler
6175     // s2 is initialized to the upper 16 bits of adler
6176     __ srliw(s2, adler, 16); // s2 = ((adler >> 16) & 0xffff)
6177     __ zext(s1, adler, 16); // s1 = (adler & 0xffff)
6178 
    // The pipelined loop needs at least 16 elements per iteration.
    // It would check this itself, but it is more efficient to skip straight to the cleanup loop.
6181     __ mv(temp0, step_16);
6182     __ bgeu(len, temp0, L_nmax);
6183     __ beqz(len, L_combine);
6184 
6185     // Jumping to L_by1_loop
6186     __ subi(len, len, step_1);
6187     __ j(L_by1_loop);
6188 
6189   __ bind(L_nmax);
6190     __ sub(len, len, nmax);
6191     __ subi(count, nmax, 16);
6192     __ bltz(len, L_by16);
6193 
6194   // Align L_nmax loop by 64
6195   __ bind(L_nmax_loop_entry);
6196     __ subi(count, count, 32);
6197 
6198   __ bind(L_nmax_loop);
6199     adler32_process_bytes(buff, s1, s2, vtable_64, vzero,
6200       vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
6201       vtemp1, vtemp2, step_64, Assembler::m4);
6202     __ subi(count, count, step_64);
6203     __ bgtz(count, L_nmax_loop);
6204 
    // 48 bytes of the NMAX block remain: process them as one 32-byte and one 16-byte step
6206     adler32_process_bytes(buff, s1, s2, vtable_32, vzero,
6207       vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
6208       vtemp1, vtemp2, step_32, Assembler::m2);
6209     adler32_process_bytes(buff, s1, s2, vtable_16, vzero,
6210       vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
6211       vtemp1, vtemp2, step_16, Assembler::m1);
6212 
6213     // s1 = s1 % BASE
6214     __ remuw(s1, s1, base);
6215     // s2 = s2 % BASE
6216     __ remuw(s2, s2, base);
6217 
6218     __ sub(len, len, nmax);
6219     __ subi(count, nmax, 16);
6220     __ bgez(len, L_nmax_loop_entry);
6221 
6222   __ bind(L_by16);
6223     __ add(len, len, count);
6224     __ bltz(len, L_by1);
6225     // Trying to unroll
6226     __ mv(temp3, step_64);
6227     __ blt(len, temp3, L_by16_loop);
6228 
6229   __ bind(L_by16_loop_unroll);
6230     adler32_process_bytes(buff, s1, s2, vtable_64, vzero,
6231       vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
6232       vtemp1, vtemp2, step_64, Assembler::m4);
6233     __ subi(len, len, step_64);
    // temp3 still holds 64 (step_64) at this point
6235     __ bge(len, temp3, L_by16_loop_unroll);
6236 
6237   __ bind(L_by16_loop);
6238     adler32_process_bytes(buff, s1, s2, vtable_16, vzero,
6239       vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
6240       vtemp1, vtemp2, step_16, Assembler::m1);
6241     __ subi(len, len, step_16);
6242     __ bgez(len, L_by16_loop);
6243 
6244   __ bind(L_by1);
6245     __ addi(len, len, 15);
6246     __ bltz(len, L_do_mod);
6247 
6248   __ bind(L_by1_loop);
6249     __ lbu(temp0, Address(buff, 0));
6250     __ addi(buff, buff, step_1);
6251     __ add(s1, temp0, s1);
6252     __ add(s2, s2, s1);
6253     __ subi(len, len, step_1);
6254     __ bgez(len, L_by1_loop);
6255 
6256   __ bind(L_do_mod);
6257     // s1 = s1 % BASE
6258     __ remuw(s1, s1, base);
6259     // s2 = s2 % BASE
6260     __ remuw(s2, s2, base);
6261 
6262     // Combine lower bits and higher bits
6263     // adler = s1 | (s2 << 16)
6264   __ bind(L_combine);
6265     __ slli(s2, s2, 16);
6266     __ orr(s1, s1, s2);
6267 
6268     __ leave(); // Required for proper stackwalking of RuntimeStub frame
6269     __ ret();
6270 
6271     return start;
6272   }
6273 
6274 #endif // COMPILER2_OR_JVMCI
6275 
6276   // x10 = input (float16)
6277   // f10 = result (float)
6278   // t1  = temporary register
6279   address generate_float16ToFloat() {
6280     __ align(CodeEntryAlignment);
6281     StubGenStubId stub_id = StubGenStubId::hf2f_id;
6282     StubCodeMark mark(this, stub_id);
6283     address entry = __ pc();
6284     BLOCK_COMMENT("float16ToFloat:");
6285 
6286     FloatRegister dst = f10;
6287     Register src = x10;
6288     Label NaN_SLOW;
6289 
6290     assert(VM_Version::supports_float16_float_conversion(), "must");
6291 
    // On riscv, NaN needs special handling as fcvt does not work correctly in that case,
    // while Inf does not: fcvt handles it correctly. We nevertheless let the slow path
    // handle both NaN and Inf: both are rare, and making the slow path handle only the
    // NaN case would sacrifice performance in the normal (non-NaN, non-Inf) cases.
6298 
6299     // check whether it's a NaN or +/- Inf.
6300     __ mv(t0, 0x7c00);
6301     __ andr(t1, src, t0);
6302     // jump to stub processing NaN and Inf cases.
6303     __ beq(t0, t1, NaN_SLOW);
6304 
    // non-NaN and non-Inf cases: just use the built-in instructions.
6306     __ fmv_h_x(dst, src);
6307     __ fcvt_s_h(dst, dst);
6308     __ ret();
6309 
6310     __ bind(NaN_SLOW);
    // The following instructions mainly target NaN, which riscv's fcvt does not
    // handle well, but the same code also works for Inf.
6313 
6314     // construct a NaN in 32 bits from the NaN in 16 bits,
6315     // we need the payloads of non-canonical NaNs to be preserved.
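    // Sketch of the bit manipulation below (illustrative; h is the sign-extended 16-bit input):
    //   f_bits = (h << 13) | 0x7f800000;  // exponent all ones; the half payload lands in the
    //                                     // top of the float mantissa; the sign bit comes
    //                                     // from the sign-extended shift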
6316     __ mv(t1, 0x7f800000);
6317     // sign-bit was already set via sign-extension if necessary.
6318     __ slli(t0, src, 13);
6319     __ orr(t1, t0, t1);
6320     __ fmv_w_x(dst, t1);
6321 
6322     __ ret();
6323     return entry;
6324   }
6325 
6326   // f10 = input (float)
6327   // x10 = result (float16)
6328   // f11 = temporary float register
6329   // t1  = temporary register
6330   address generate_floatToFloat16() {
6331     __ align(CodeEntryAlignment);
6332     StubGenStubId stub_id = StubGenStubId::f2hf_id;
6333     StubCodeMark mark(this, stub_id);
6334     address entry = __ pc();
6335     BLOCK_COMMENT("floatToFloat16:");
6336 
6337     Register dst = x10;
6338     FloatRegister src = f10, ftmp = f11;
6339     Label NaN_SLOW;
6340 
6341     assert(VM_Version::supports_float16_float_conversion(), "must");
6342 
    // On riscv, NaN needs special handling as fcvt does not work correctly in that case.
6344 
6345     // check whether it's a NaN.
    // use feq instead of fclass as a performance optimization.
6347     __ feq_s(t0, src, src);
6348     // jump to stub processing NaN cases.
6349     __ beqz(t0, NaN_SLOW);
6350 
6351     // non-NaN cases, just use built-in instructions.
6352     __ fcvt_h_s(ftmp, src);
6353     __ fmv_x_h(dst, ftmp);
6354     __ ret();
6355 
6356     __ bind(NaN_SLOW);
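    // Sketch of the slow path below (illustrative; f is the raw 32-bit float bit pattern,
    // and only the low 16 bits of the result are the float16 value):
    //   payload = (f >> 13) & 0x3ff;         // top 10 mantissa bits of the float NaN
    //   result  = sign | 0x7c00 | payload;   // half NaN with sign and payload preserved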
6357     __ fmv_x_w(dst, src);
6358 
6359     // preserve the payloads of non-canonical NaNs.
6360     __ srai(dst, dst, 13);
6361     // preserve the sign bit.
6362     __ srai(t1, dst, 13);
6363     __ slli(t1, t1, 10);
6364     __ mv(t0, 0x3ff);
6365     __ orr(t1, t1, t0);
6366 
6367     // get the result by merging sign bit and payloads of preserved non-canonical NaNs.
6368     __ andr(dst, dst, t1);
6369 
6370     __ ret();
6371     return entry;
6372   }
6373 
6374 #ifdef COMPILER2
6375 
6376 static const int64_t right_2_bits = right_n_bits(2);
6377 static const int64_t right_3_bits = right_n_bits(3);
6378 
6379   // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
6380   // are represented as long[5], with BITS_PER_LIMB = 26.
6381   // Pack five 26-bit limbs into three 64-bit registers.
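  // Equivalent packing in plain C (illustrative sketch; limb[] is the Java long[5] layout):
  //   dest0 = limb[0] | (limb[1] << 26) | (limb[2] << 52);          // bits   0..63
  //   dest1 = (limb[2] >> 12) | (limb[3] << 14) | (limb[4] << 40);  // bits  64..127
  //   dest2 = limb[4] >> 24;                                        // bits 128..129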
6382   void poly1305_pack_26(Register dest0, Register dest1, Register dest2, Register src, Register tmp1, Register tmp2) {
6383     assert_different_registers(dest0, dest1, dest2, src, tmp1, tmp2);
6384 
6385     // The goal is to have 128-bit value in dest2:dest1:dest0
6386     __ ld(dest0, Address(src, 0));    // 26 bits in dest0
6387 
6388     __ ld(tmp1, Address(src, sizeof(jlong)));
6389     __ slli(tmp1, tmp1, 26);
6390     __ add(dest0, dest0, tmp1);       // 52 bits in dest0
6391 
6392     __ ld(tmp2, Address(src, 2 * sizeof(jlong)));
6393     __ slli(tmp1, tmp2, 52);
6394     __ add(dest0, dest0, tmp1);       // dest0 is full
6395 
    __ srli(dest1, tmp2, 12);         // 14 bits in dest1
6397 
6398     __ ld(tmp1, Address(src, 3 * sizeof(jlong)));
6399     __ slli(tmp1, tmp1, 14);
    __ add(dest1, dest1, tmp1);       // 40 bits in dest1
6401 
6402     __ ld(tmp1, Address(src, 4 * sizeof(jlong)));
6403     __ slli(tmp2, tmp1, 40);
6404     __ add(dest1, dest1, tmp2);       // dest1 is full
6405 
6406     if (dest2->is_valid()) {
6407       __ srli(tmp1, tmp1, 24);
6408       __ mv(dest2, tmp1);               // 2 bits in dest2
6409     } else {
6410 #ifdef ASSERT
6411       Label OK;
6412       __ srli(tmp1, tmp1, 24);
6413       __ beq(zr, tmp1, OK);           // 2 bits
6414       __ stop("high bits of Poly1305 integer should be zero");
6415       __ should_not_reach_here();
6416       __ bind(OK);
6417 #endif
6418     }
6419   }
6420 
6421   // As above, but return only a 128-bit integer, packed into two
6422   // 64-bit registers.
6423   void poly1305_pack_26(Register dest0, Register dest1, Register src, Register tmp1, Register tmp2) {
6424     poly1305_pack_26(dest0, dest1, noreg, src, tmp1, tmp2);
6425   }
6426 
  // U_2:U_1:U_0 += (U_2 >> 2) * 5
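  // Because 2^130 = 5 (mod 2^130 - 5), the part of the accumulator at or above bit 130
  // (i.e. U_2 >> 2) can be folded back in multiplied by 5; below this is done as x + (x << 2).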
6428   void poly1305_reduce(Register U_2, Register U_1, Register U_0, Register tmp1, Register tmp2) {
6429     assert_different_registers(U_2, U_1, U_0, tmp1, tmp2);
6430 
6431     // First, U_2:U_1:U_0 += (U_2 >> 2)
6432     __ srli(tmp1, U_2, 2);
6433     __ cad(U_0, U_0, tmp1, tmp2); // Add tmp1 to U_0 with carry output to tmp2
6434     __ andi(U_2, U_2, right_2_bits); // Clear U_2 except for the lowest two bits
6435     __ cad(U_1, U_1, tmp2, tmp2); // Add carry to U_1 with carry output to tmp2
6436     __ add(U_2, U_2, tmp2);
6437 
6438     // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2
6439     __ slli(tmp1, tmp1, 2);
6440     __ cad(U_0, U_0, tmp1, tmp2); // Add tmp1 to U_0 with carry output to tmp2
6441     __ cad(U_1, U_1, tmp2, tmp2); // Add carry to U_1 with carry output to tmp2
6442     __ add(U_2, U_2, tmp2);
6443   }
6444 
6445   // Poly1305, RFC 7539
6446   // void com.sun.crypto.provider.Poly1305.processMultipleBlocks(byte[] input, int offset, int length, long[] aLimbs, long[] rLimbs)
6447 
6448   // Arguments:
6449   //    c_rarg0:   input_start -- where the input is stored
6450   //    c_rarg1:   length
6451   //    c_rarg2:   acc_start -- where the output will be stored
6452   //    c_rarg3:   r_start -- where the randomly generated 128-bit key is stored
6453 
6454   // See https://loup-vaillant.fr/tutorials/poly1305-design for a
6455   // description of the tricks used to simplify and accelerate this
6456   // computation.
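  // Per 16-byte block, the math implemented in the loop below is (illustrative sketch):
  //   n   = le128(block) + 2^128;             // the "+1" added to S_2 in the loop
  //   acc = ((acc + n) * r) mod (2^130 - 5);  // only partially reduced each iteration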
6457 
6458   address generate_poly1305_processBlocks() {
6459     __ align(CodeEntryAlignment);
6460     StubGenStubId stub_id = StubGenStubId::poly1305_processBlocks_id;
6461     StubCodeMark mark(this, stub_id);
6462     address start = __ pc();
6463     __ enter();
6464     Label here;
6465 
6466     RegSet saved_regs = RegSet::range(x18, x21);
6467     RegSetIterator<Register> regs = (RegSet::range(x14, x31) - RegSet::range(x22, x27)).begin();
6468     __ push_reg(saved_regs, sp);
6469 
6470     // Arguments
6471     const Register input_start = c_rarg0, length = c_rarg1, acc_start = c_rarg2, r_start = c_rarg3;
6472 
6473     // R_n is the 128-bit randomly-generated key, packed into two
6474     // registers. The caller passes this key to us as long[5], with
6475     // BITS_PER_LIMB = 26.
6476     const Register R_0 = *regs, R_1 = *++regs;
6477     poly1305_pack_26(R_0, R_1, r_start, t1, t2);
6478 
6479     // RR_n is (R_n >> 2) * 5
6480     const Register RR_0 = *++regs, RR_1 = *++regs;
6481     __ srli(t1, R_0, 2);
6482     __ shadd(RR_0, t1, t1, t2, 2);
6483     __ srli(t1, R_1, 2);
6484     __ shadd(RR_1, t1, t1, t2, 2);
6485 
6486     // U_n is the current checksum
6487     const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
6488     poly1305_pack_26(U_0, U_1, U_2, acc_start, t1, t2);
6489 
6490     static constexpr int BLOCK_LENGTH = 16;
6491     Label DONE, LOOP;
6492 
6493     __ mv(t1, BLOCK_LENGTH);
6494     __ blt(length, t1, DONE); {
6495       __ bind(LOOP);
6496 
6497       // S_n is to be the sum of U_n and the next block of data
6498       const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
6499       __ ld(S_0, Address(input_start, 0));
6500       __ ld(S_1, Address(input_start, wordSize));
6501 
6502       __ cad(S_0, S_0, U_0, t1); // Add U_0 to S_0 with carry output to t1
6503       __ cadc(S_1, S_1, U_1, t1); // Add U_1 with carry to S_1 with carry output to t1
6504       __ add(S_2, U_2, t1);
6505 
6506       __ addi(S_2, S_2, 1);
6507 
6508       const Register U_0HI = *++regs, U_1HI = *++regs;
6509 
6510       // NB: this logic depends on some of the special properties of
6511       // Poly1305 keys. In particular, because we know that the top
6512       // four bits of R_0 and R_1 are zero, we can add together
6513       // partial products without any risk of needing to propagate a
6514       // carry out.
6515       __ wide_mul(U_0, U_0HI, S_0, R_0);
6516       __ wide_madd(U_0, U_0HI, S_1, RR_1, t1, t2);
6517       __ wide_madd(U_0, U_0HI, S_2, RR_0, t1, t2);
6518 
6519       __ wide_mul(U_1, U_1HI, S_0, R_1);
6520       __ wide_madd(U_1, U_1HI, S_1, R_0, t1, t2);
6521       __ wide_madd(U_1, U_1HI, S_2, RR_1, t1, t2);
6522 
6523       __ andi(U_2, R_0, right_2_bits);
6524       __ mul(U_2, S_2, U_2);
6525 
6526       // Partial reduction mod 2**130 - 5
6527       __ cad(U_1, U_1, U_0HI, t1); // Add U_0HI to U_1 with carry output to t1
6528       __ adc(U_2, U_2, U_1HI, t1);
6529       // Sum is now in U_2:U_1:U_0.
6530 
      // U_2:U_1:U_0 += (U_2 >> 2) * 5
6532       poly1305_reduce(U_2, U_1, U_0, t1, t2);
6533 
6534       __ subi(length, length, BLOCK_LENGTH);
6535       __ addi(input_start, input_start, BLOCK_LENGTH);
6536       __ mv(t1, BLOCK_LENGTH);
6537       __ bge(length, t1, LOOP);
6538     }
6539 
6540     // Further reduce modulo 2^130 - 5
6541     poly1305_reduce(U_2, U_1, U_0, t1, t2);
6542 
6543     // Unpack the sum into five 26-bit limbs and write to memory.
    // The lowest 26 bits form the first limb
6545     __ slli(t1, U_0, 38); // Take lowest 26 bits
6546     __ srli(t1, t1, 38);
6547     __ sd(t1, Address(acc_start)); // First 26-bit limb
6548 
    // Bits 27-52 of U_0 are the second limb
    __ slli(t1, U_0, 12); // Take the next 26 bits (bits 27-52)
6551     __ srli(t1, t1, 38);
6552     __ sd(t1, Address(acc_start, sizeof (jlong))); // Second 26-bit limb
6553 
6554     // Getting 53-64 bits of U_0 and 1-14 bits of U_1 in one register
6555     __ srli(t1, U_0, 52);
6556     __ slli(t2, U_1, 50);
6557     __ srli(t2, t2, 38);
6558     __ add(t1, t1, t2);
6559     __ sd(t1, Address(acc_start, 2 * sizeof (jlong))); // Third 26-bit limb
6560 
6561     // Storing 15-40 bits of U_1
6562     __ slli(t1, U_1, 24); // Already used up 14 bits
6563     __ srli(t1, t1, 38); // Clear all other bits from t1
6564     __ sd(t1, Address(acc_start, 3 * sizeof (jlong))); // Fourth 26-bit limb
6565 
6566     // Storing 41-64 bits of U_1 and first three bits from U_2 in one register
6567     __ srli(t1, U_1, 40);
6568     __ andi(t2, U_2, right_3_bits);
6569     __ slli(t2, t2, 24);
6570     __ add(t1, t1, t2);
6571     __ sd(t1, Address(acc_start, 4 * sizeof (jlong))); // Fifth 26-bit limb
6572 
6573     __ bind(DONE);
6574     __ pop_reg(saved_regs, sp);
6575     __ leave(); // Required for proper stackwalking
6576     __ ret();
6577 
6578     return start;
6579   }
6580 
6581 #endif // COMPILER2
6582 
6583   /**
6584    *  Arguments:
6585    *
6586    * Inputs:
6587    *   c_rarg0   - int crc
6588    *   c_rarg1   - byte* buf
6589    *   c_rarg2   - int length
6590    *
6591    * Output:
6592    *   c_rarg0   - int crc result
6593    */
6594   address generate_updateBytesCRC32() {
6595     assert(UseCRC32Intrinsics, "what are we doing here?");
6596 
6597     __ align(CodeEntryAlignment);
6598     StubGenStubId stub_id = StubGenStubId::updateBytesCRC32_id;
6599     StubCodeMark mark(this, stub_id);
6600 
6601     address start = __ pc();
6602 
6603     // input parameters
6604     const Register crc    = c_rarg0;  // crc
6605     const Register buf    = c_rarg1;  // source java byte array address
6606     const Register len    = c_rarg2;  // length
6607 
6608     BLOCK_COMMENT("Entry:");
6609     __ enter(); // required for proper stackwalking of RuntimeStub frame
6610 
6611     __ kernel_crc32(crc, buf, len,
6612                     c_rarg3, c_rarg4, c_rarg5, c_rarg6, // tmp's for tables
6613                     c_rarg7, t2, t3, t4, t5, t6);       // misc tmps
6614 
6615     __ leave(); // required for proper stackwalking of RuntimeStub frame
6616     __ ret();
6617 
6618     return start;
6619   }
6620 
6621   // exception handler for upcall stubs
6622   address generate_upcall_stub_exception_handler() {
6623     StubGenStubId stub_id = StubGenStubId::upcall_stub_exception_handler_id;
6624     StubCodeMark mark(this, stub_id);
6625     address start = __ pc();
6626 
6627     // Native caller has no idea how to handle exceptions,
6628     // so we just crash here. Up to callee to catch exceptions.
    __ verify_oop(x10); // the exception oop is returned in a0
6630     __ rt_call(CAST_FROM_FN_PTR(address, UpcallLinker::handle_uncaught_exception));
6631     __ should_not_reach_here();
6632 
6633     return start;
6634   }
6635 
6636   // load Method* target of MethodHandle
6637   // j_rarg0 = jobject receiver
6638   // xmethod = Method* result
6639   address generate_upcall_stub_load_target() {
6640 
6641     StubGenStubId stub_id = StubGenStubId::upcall_stub_load_target_id;
6642     StubCodeMark mark(this, stub_id);
6643     address start = __ pc();
6644 
6645     __ resolve_global_jobject(j_rarg0, t0, t1);
    // Load target method from receiver
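    // (i.e. follow MethodHandle.form -> LambdaForm.vmentry -> MemberName.method -> ResolvedMethodName.vmtarget)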
6647     __ load_heap_oop(xmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), t0, t1);
6648     __ load_heap_oop(xmethod, Address(xmethod, java_lang_invoke_LambdaForm::vmentry_offset()), t0, t1);
6649     __ load_heap_oop(xmethod, Address(xmethod, java_lang_invoke_MemberName::method_offset()), t0, t1);
6650     __ access_load_at(T_ADDRESS, IN_HEAP, xmethod,
6651                       Address(xmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()),
6652                       noreg, noreg);
6653     __ sd(xmethod, Address(xthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized
6654 
6655     __ ret();
6656 
6657     return start;
6658   }
6659 
6660 #undef __
6661 
6662   // Initialization
6663   void generate_preuniverse_stubs() {
6664     // preuniverse stubs are not needed for riscv
6665   }
6666 
6667   void generate_initial_stubs() {
    // Generates the initial stubs and initializes the entry points
6669 
    // Entry points that exist on all platforms. Note: This is code
6671     // that could be shared among different platforms - however the
6672     // benefit seems to be smaller than the disadvantage of having a
6673     // much more complicated generator structure. See also comment in
6674     // stubRoutines.hpp.
6675 
6676     StubRoutines::_forward_exception_entry = generate_forward_exception();
6677 
6678     if (UnsafeMemoryAccess::_table == nullptr) {
6679       UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
6680     }
6681 
6682     StubRoutines::_call_stub_entry =
6683       generate_call_stub(StubRoutines::_call_stub_return_address);
6684 
6685     // is referenced by megamorphic call
6686     StubRoutines::_catch_exception_entry = generate_catch_exception();
6687 
6688     if (UseCRC32Intrinsics) {
6689       // set table address before stub generation which use it
6690       StubRoutines::_crc_table_adr = (address)StubRoutines::riscv::_crc_table;
6691       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
6692     }
6693 
6694     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
6695         vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
6696       StubRoutines::_hf2f = generate_float16ToFloat();
6697       StubRoutines::_f2hf = generate_floatToFloat16();
6698     }
6699   }
6700 
6701   void generate_continuation_stubs() {
6702     // Continuation stubs:
6703     StubRoutines::_cont_thaw             = generate_cont_thaw();
6704     StubRoutines::_cont_returnBarrier    = generate_cont_returnBarrier();
6705     StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
6706     StubRoutines::_cont_preempt_stub     = generate_cont_preempt_stub();
6707   }
6708 
6709   void generate_final_stubs() {
6710     // support for verify_oop (must happen after universe_init)
6711     if (VerifyOops) {
6712       StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
6713     }
6714 
6715     // arraycopy stubs used by compilers
6716     generate_arraycopy_stubs();
6717 
6718     StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
6719 
6720 #ifdef COMPILER2
6721     if (UseSecondarySupersTable) {
6722       StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
6723       if (!InlineSecondarySupersTest) {
6724         generate_lookup_secondary_supers_table_stub();
6725       }
6726     }
6727 #endif // COMPILER2
6728 
6729     StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
6730     StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();
6731 
6732     StubRoutines::riscv::set_completed();
6733   }
6734 
6735   void generate_compiler_stubs() {
6736 #ifdef COMPILER2
6737     if (UseMulAddIntrinsic) {
6738       StubRoutines::_mulAdd = generate_mulAdd();
6739     }
6740 
6741     if (UseMultiplyToLenIntrinsic) {
6742       StubRoutines::_multiplyToLen = generate_multiplyToLen();
6743     }
6744 
6745     if (UseSquareToLenIntrinsic) {
6746       StubRoutines::_squareToLen = generate_squareToLen();
6747     }
6748 
6749     if (UseMontgomeryMultiplyIntrinsic) {
6750       StubGenStubId stub_id = StubGenStubId::montgomeryMultiply_id;
6751       StubCodeMark mark(this, stub_id);
6752       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
6753       StubRoutines::_montgomeryMultiply = g.generate_multiply();
6754     }
6755 
6756     if (UseMontgomerySquareIntrinsic) {
6757       StubGenStubId stub_id = StubGenStubId::montgomerySquare_id;
6758       StubCodeMark mark(this, stub_id);
6759       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
6760       StubRoutines::_montgomerySquare = g.generate_square();
6761     }
6762 
6763     if (UseAESIntrinsics) {
6764       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
6765       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
6766     }
6767 
6768     if (UsePoly1305Intrinsics) {
6769       StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
6770     }
6771 
6772     if (UseRVV) {
6773       StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift();
6774       StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
6775     }
6776 
6777     if (UseSHA256Intrinsics) {
6778       Sha2Generator sha2(_masm, this);
6779       StubRoutines::_sha256_implCompress   = sha2.generate_sha256_implCompress(StubGenStubId::sha256_implCompress_id);
6780       StubRoutines::_sha256_implCompressMB = sha2.generate_sha256_implCompress(StubGenStubId::sha256_implCompressMB_id);
6781     }
6782 
6783     if (UseSHA512Intrinsics) {
6784       Sha2Generator sha2(_masm, this);
6785       StubRoutines::_sha512_implCompress   = sha2.generate_sha512_implCompress(StubGenStubId::sha512_implCompress_id);
6786       StubRoutines::_sha512_implCompressMB = sha2.generate_sha512_implCompress(StubGenStubId::sha512_implCompressMB_id);
6787     }
6788 
6789     if (UseMD5Intrinsics) {
6790       StubRoutines::_md5_implCompress   = generate_md5_implCompress(StubGenStubId::md5_implCompress_id);
6791       StubRoutines::_md5_implCompressMB = generate_md5_implCompress(StubGenStubId::md5_implCompressMB_id);
6792     }
6793 
6794     if (UseChaCha20Intrinsics) {
6795       StubRoutines::_chacha20Block = generate_chacha20Block();
6796     }
6797 
6798     if (UseSHA1Intrinsics) {
6799       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(StubGenStubId::sha1_implCompress_id);
6800       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(StubGenStubId::sha1_implCompressMB_id);
6801     }
6802 
6803     if (UseBASE64Intrinsics) {
6804       StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
6805       StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
6806     }
6807 
6808     if (UseAdler32Intrinsics) {
6809       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
6810     }
6811 
6812     generate_compare_long_strings();
6813 
6814     generate_string_indexof_stubs();
6815 
6816 #endif // COMPILER2
6817   }
6818 
6819  public:
6820   StubGenerator(CodeBuffer* code, StubGenBlobId blob_id) : StubCodeGenerator(code, blob_id) {
6821     switch(blob_id) {
6822     case preuniverse_id:
6823       generate_preuniverse_stubs();
6824       break;
6825     case initial_id:
6826       generate_initial_stubs();
6827       break;
6828      case continuation_id:
6829       generate_continuation_stubs();
6830       break;
6831     case compiler_id:
6832       generate_compiler_stubs();
6833       break;
6834     case final_id:
6835       generate_final_stubs();
6836       break;
6837     default:
6838       fatal("unexpected blob id: %d", blob_id);
6839       break;
6840     };
6841   }
6842 }; // end class declaration
6843 
6844 void StubGenerator_generate(CodeBuffer* code, StubGenBlobId blob_id) {
6845   StubGenerator g(code, blob_id);
6846 }