1 /*
   2  * Copyright (c) 2003, 2026, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved.
   4  * Copyright (c) 2020, 2025, Huawei Technologies Co., Ltd. All rights reserved.
   5  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   6  *
   7  * This code is free software; you can redistribute it and/or modify it
   8  * under the terms of the GNU General Public License version 2 only, as
   9  * published by the Free Software Foundation.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  *
  25  */
  26 
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "compiler/oopMap.hpp"
  30 #include "gc/shared/barrierSet.hpp"
  31 #include "gc/shared/barrierSetAssembler.hpp"
  32 #include "interpreter/interpreter.hpp"
  33 #include "memory/universe.hpp"
  34 #include "nativeInst_riscv.hpp"
  35 #include "oops/instanceOop.hpp"
  36 #include "oops/method.hpp"
  37 #include "oops/objArrayKlass.hpp"
  38 #include "oops/oop.inline.hpp"
  39 #include "prims/methodHandles.hpp"
  40 #include "prims/upcallLinker.hpp"
  41 #include "runtime/continuation.hpp"
  42 #include "runtime/continuationEntry.inline.hpp"
  43 #include "runtime/frame.inline.hpp"
  44 #include "runtime/handles.inline.hpp"
  45 #include "runtime/javaThread.hpp"
  46 #include "runtime/sharedRuntime.hpp"
  47 #include "runtime/stubCodeGenerator.hpp"
  48 #include "runtime/stubRoutines.hpp"
  49 #include "utilities/align.hpp"
  50 #include "utilities/powerOfTwo.hpp"
  51 #ifdef COMPILER2
  52 #include "opto/runtime.hpp"
  53 #endif
  54 
  55 // Declaration and definition of StubGenerator (no .hpp file).
  56 // For a more detailed description of the stub routine structure
  57 // see the comment in stubRoutines.hpp
  58 
// Shorthand used by the stub generators in this file: "__ foo(...)"
// expands to "_masm->foo(...)".
#undef __
#define __ _masm->

// BLOCK_COMMENT(str) forwards str to block_comment() on _masm; in PRODUCT
// builds it expands to nothing.
#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

// BIND(label) is written as "__ BIND(label);": it binds the label and then
// issues BLOCK_COMMENT("<label>:"), using the label's source name.
// n.b. the expansion is two statements, so it must not be used as the
// unbraced body of an if/else/loop.
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  69 
  70 // Stub Code definitions
  71 
  72 class StubGenerator: public StubCodeGenerator {
  73  private:
  74 
#ifdef PRODUCT
  // PRODUCT builds: inc_counter_np(counter) expands to a no-op expression.
#define inc_counter_np(counter) ((void)0)
#else
  // Emits an incrementw on the uint located at the address of 'counter'.
  void inc_counter_np_(uint& counter) {
    __ incrementw(ExternalAddress((address)&counter));
  }
  // Non-PRODUCT builds: emit a block comment naming the counter, then the
  // increment itself.
  // n.b. this expansion is two statements and already ends in ';', so it
  // must not be used as the unbraced body of an if/else/loop.
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif
  85 
  86   // Call stubs are used to call Java from C
  87   //
  88   // Arguments:
  89   //    c_rarg0:   call wrapper address                   address
  90   //    c_rarg1:   result                                 address
  91   //    c_rarg2:   result type                            BasicType
  92   //    c_rarg3:   method                                 Method*
  93   //    c_rarg4:   (interpreter) entry point              address
  94   //    c_rarg5:   parameters                             intptr_t*
  95   //    c_rarg6:   parameter size (in words)              int
  96   //    c_rarg7:   thread                                 Thread*
  97   //
  98   // There is no return from the stub itself as any Java result
  99   // is written to result
 100   //
 101   // we save x1 (ra) as the return PC at the base of the frame and
 102   // link x8 (fp) below it as the frame pointer installing sp (x2)
 103   // into fp.
 104   //
 105   // we save x10-x17, which accounts for all the c arguments.
 106   //
 107   // TODO: strictly do we need to save them all? they are treated as
 108   // volatile by C so could we omit saving the ones we are going to
 109   // place in global registers (thread? method?) or those we only use
 110   // during setup of the Java call?
 111   //
  // we don't need to save x5 (t0), which the C calling convention treats
  // as a caller-saved temporary register.
 114   //
 115   // we don't need to save x6-x7 and x28-x31 which both C and Java treat as
 116   // volatile
 117   //
 118   // we save x9, x18-x27, f8-f9, and f18-f27 which Java uses as temporary
 119   // registers and C expects to be callee-save
 120   //
 121   // so the stub frame looks like this when we enter Java code
 122   //
 123   //     [ return_from_Java     ] <--- sp
 124   //     [ argument word n      ]
 125   //      ...
 126   // -35 [ argument word 1      ]
 127   // -34 [ saved FRM in Floating-point Control and Status Register ] <--- sp_after_call
 128   // -33 [ saved f27            ]
 129   // -32 [ saved f26            ]
 130   // -31 [ saved f25            ]
 131   // -30 [ saved f24            ]
 132   // -29 [ saved f23            ]
 133   // -28 [ saved f22            ]
 134   // -27 [ saved f21            ]
 135   // -26 [ saved f20            ]
 136   // -25 [ saved f19            ]
 137   // -24 [ saved f18            ]
 138   // -23 [ saved f9             ]
 139   // -22 [ saved f8             ]
 140   // -21 [ saved x27            ]
 141   // -20 [ saved x26            ]
 142   // -19 [ saved x25            ]
 143   // -18 [ saved x24            ]
 144   // -17 [ saved x23            ]
 145   // -16 [ saved x22            ]
 146   // -15 [ saved x21            ]
 147   // -14 [ saved x20            ]
 148   // -13 [ saved x19            ]
 149   // -12 [ saved x18            ]
 150   // -11 [ saved x9             ]
 151   // -10 [ call wrapper   (x10) ]
 152   //  -9 [ result         (x11) ]
 153   //  -8 [ result type    (x12) ]
 154   //  -7 [ method         (x13) ]
 155   //  -6 [ entry point    (x14) ]
 156   //  -5 [ parameters     (x15) ]
 157   //  -4 [ parameter size (x16) ]
 158   //  -3 [ thread         (x17) ]
 159   //  -2 [ saved fp       (x8)  ]
 160   //  -1 [ saved ra       (x1)  ]
 161   //   0 [                      ] <--- fp == saved sp (x2)
 162 
 163   // Call stub stack layout word offsets from fp
 164   enum call_stub_layout {
 165     sp_after_call_off  = -34,
 166 
 167     frm_off            = sp_after_call_off,
 168     f27_off            = -33,
 169     f26_off            = -32,
 170     f25_off            = -31,
 171     f24_off            = -30,
 172     f23_off            = -29,
 173     f22_off            = -28,
 174     f21_off            = -27,
 175     f20_off            = -26,
 176     f19_off            = -25,
 177     f18_off            = -24,
 178     f9_off             = -23,
 179     f8_off             = -22,
 180 
 181     x27_off            = -21,
 182     x26_off            = -20,
 183     x25_off            = -19,
 184     x24_off            = -18,
 185     x23_off            = -17,
 186     x22_off            = -16,
 187     x21_off            = -15,
 188     x20_off            = -14,
 189     x19_off            = -13,
 190     x18_off            = -12,
 191     x9_off             = -11,
 192 
 193     call_wrapper_off   = -10,
 194     result_off         = -9,
 195     result_type_off    = -8,
 196     method_off         = -7,
 197     entry_point_off    = -6,
 198     parameters_off     = -5,
 199     parameter_size_off = -4,
 200     thread_off         = -3,
 201     fp_f               = -2,
 202     retaddr_off        = -1,
 203   };
 204 
 205   address generate_call_stub(address& return_address) {
 206     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 207            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 208            "adjust this code");
 209 
 210     StubId stub_id = StubId::stubgen_call_stub_id;
 211     StubCodeMark mark(this, stub_id);
 212     address start = __ pc();
 213 
 214     const Address sp_after_call (fp, sp_after_call_off  * wordSize);
 215 
 216     const Address frm_save      (fp, frm_off           * wordSize);
 217     const Address call_wrapper  (fp, call_wrapper_off   * wordSize);
 218     const Address result        (fp, result_off         * wordSize);
 219     const Address result_type   (fp, result_type_off    * wordSize);
 220     const Address method        (fp, method_off         * wordSize);
 221     const Address entry_point   (fp, entry_point_off    * wordSize);
 222     const Address parameters    (fp, parameters_off     * wordSize);
 223     const Address parameter_size(fp, parameter_size_off * wordSize);
 224 
 225     const Address thread        (fp, thread_off         * wordSize);
 226 
 227     const Address f27_save      (fp, f27_off            * wordSize);
 228     const Address f26_save      (fp, f26_off            * wordSize);
 229     const Address f25_save      (fp, f25_off            * wordSize);
 230     const Address f24_save      (fp, f24_off            * wordSize);
 231     const Address f23_save      (fp, f23_off            * wordSize);
 232     const Address f22_save      (fp, f22_off            * wordSize);
 233     const Address f21_save      (fp, f21_off            * wordSize);
 234     const Address f20_save      (fp, f20_off            * wordSize);
 235     const Address f19_save      (fp, f19_off            * wordSize);
 236     const Address f18_save      (fp, f18_off            * wordSize);
 237     const Address f9_save       (fp, f9_off             * wordSize);
 238     const Address f8_save       (fp, f8_off             * wordSize);
 239 
 240     const Address x27_save      (fp, x27_off            * wordSize);
 241     const Address x26_save      (fp, x26_off            * wordSize);
 242     const Address x25_save      (fp, x25_off            * wordSize);
 243     const Address x24_save      (fp, x24_off            * wordSize);
 244     const Address x23_save      (fp, x23_off            * wordSize);
 245     const Address x22_save      (fp, x22_off            * wordSize);
 246     const Address x21_save      (fp, x21_off            * wordSize);
 247     const Address x20_save      (fp, x20_off            * wordSize);
 248     const Address x19_save      (fp, x19_off            * wordSize);
 249     const Address x18_save      (fp, x18_off            * wordSize);
 250 
 251     const Address x9_save       (fp, x9_off             * wordSize);
 252 
 253     // stub code
 254 
 255     address riscv_entry = __ pc();
 256 
 257     // set up frame and move sp to end of save area
 258     __ enter();
 259     __ addi(sp, fp, sp_after_call_off * wordSize);
 260 
 261     // save register parameters and Java temporary/global registers
 262     // n.b. we save thread even though it gets installed in
 263     // xthread because we want to sanity check tp later
 264     __ sd(c_rarg7, thread);
 265     __ sw(c_rarg6, parameter_size);
 266     __ sd(c_rarg5, parameters);
 267     __ sd(c_rarg4, entry_point);
 268     __ sd(c_rarg3, method);
 269     __ sd(c_rarg2, result_type);
 270     __ sd(c_rarg1, result);
 271     __ sd(c_rarg0, call_wrapper);
 272 
 273     __ sd(x9, x9_save);
 274 
 275     __ sd(x18, x18_save);
 276     __ sd(x19, x19_save);
 277     __ sd(x20, x20_save);
 278     __ sd(x21, x21_save);
 279     __ sd(x22, x22_save);
 280     __ sd(x23, x23_save);
 281     __ sd(x24, x24_save);
 282     __ sd(x25, x25_save);
 283     __ sd(x26, x26_save);
 284     __ sd(x27, x27_save);
 285 
 286     __ fsd(f8,  f8_save);
 287     __ fsd(f9,  f9_save);
 288     __ fsd(f18, f18_save);
 289     __ fsd(f19, f19_save);
 290     __ fsd(f20, f20_save);
 291     __ fsd(f21, f21_save);
 292     __ fsd(f22, f22_save);
 293     __ fsd(f23, f23_save);
 294     __ fsd(f24, f24_save);
 295     __ fsd(f25, f25_save);
 296     __ fsd(f26, f26_save);
 297     __ fsd(f27, f27_save);
 298 
 299     __ frrm(t0);
 300     __ sd(t0, frm_save);
 301     // Set frm to the state we need. We do want Round to Nearest. We
 302     // don't want non-IEEE rounding modes.
 303     Label skip_fsrmi;
 304     guarantee(__ RoundingMode::rne == 0, "must be");
 305     __ beqz(t0, skip_fsrmi);
 306     __ fsrmi(__ RoundingMode::rne);
 307     __ bind(skip_fsrmi);
 308 
 309     // install Java thread in global register now we have saved
 310     // whatever value it held
 311     __ mv(xthread, c_rarg7);
 312 
 313     // And method
 314     __ mv(xmethod, c_rarg3);
 315 
 316     // set up the heapbase register
 317     __ reinit_heapbase();
 318 
 319 #ifdef ASSERT
 320     // make sure we have no pending exceptions
 321     {
 322       Label L;
 323       __ ld(t0, Address(xthread, in_bytes(Thread::pending_exception_offset())));
 324       __ beqz(t0, L);
 325       __ stop("StubRoutines::call_stub: entered with pending exception");
 326       __ BIND(L);
 327     }
 328 #endif
 329     // pass parameters if any
 330     __ mv(esp, sp);
 331     __ slli(t0, c_rarg6, LogBytesPerWord);
 332     __ sub(t0, sp, t0); // Move SP out of the way
 333     __ andi(sp, t0, -2 * wordSize);
 334 
 335     BLOCK_COMMENT("pass parameters if any");
 336     Label parameters_done;
 337     // parameter count is still in c_rarg6
 338     // and parameter pointer identifying param 1 is in c_rarg5
 339     __ beqz(c_rarg6, parameters_done);
 340 
 341     address loop = __ pc();
 342     __ ld(t0, Address(c_rarg5, 0));
 343     __ addi(c_rarg5, c_rarg5, wordSize);
 344     __ subi(c_rarg6, c_rarg6, 1);
 345     __ push_reg(t0);
 346     __ bgtz(c_rarg6, loop);
 347 
 348     __ BIND(parameters_done);
 349 
 350     // call Java entry -- passing methdoOop, and current sp
 351     //      xmethod: Method*
 352     //      x19_sender_sp: sender sp
 353     BLOCK_COMMENT("call Java function");
 354     __ mv(x19_sender_sp, sp);
 355     __ jalr(c_rarg4);
 356 
 357     // save current address for use by exception handling code
 358 
 359     return_address = __ pc();
 360 
 361     // store result depending on type (everything that is not
 362     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 363     // n.b. this assumes Java returns an integral result in x10
 364     // and a floating result in j_farg0
 365     __ ld(j_rarg2, result);
 366     Label is_long, is_float, is_double, exit;
 367     __ ld(j_rarg1, result_type);
 368     __ mv(t0, (u1)T_OBJECT);
 369     __ beq(j_rarg1, t0, is_long);
 370     __ mv(t0, (u1)T_LONG);
 371     __ beq(j_rarg1, t0, is_long);
 372     __ mv(t0, (u1)T_FLOAT);
 373     __ beq(j_rarg1, t0, is_float);
 374     __ mv(t0, (u1)T_DOUBLE);
 375     __ beq(j_rarg1, t0, is_double);
 376 
 377     // handle T_INT case
 378     __ sw(x10, Address(j_rarg2));
 379 
 380     __ BIND(exit);
 381 
 382     // pop parameters
 383     __ addi(esp, fp, sp_after_call_off * wordSize);
 384 
 385 #ifdef ASSERT
 386     // verify that threads correspond
 387     {
 388       Label L, S;
 389       __ ld(t0, thread);
 390       __ bne(xthread, t0, S);
 391       __ get_thread(t0);
 392       __ beq(xthread, t0, L);
 393       __ BIND(S);
 394       __ stop("StubRoutines::call_stub: threads must correspond");
 395       __ BIND(L);
 396     }
 397 #endif
 398 
 399     __ pop_cont_fastpath(xthread);
 400 
 401     // restore callee-save registers
 402     __ fld(f27, f27_save);
 403     __ fld(f26, f26_save);
 404     __ fld(f25, f25_save);
 405     __ fld(f24, f24_save);
 406     __ fld(f23, f23_save);
 407     __ fld(f22, f22_save);
 408     __ fld(f21, f21_save);
 409     __ fld(f20, f20_save);
 410     __ fld(f19, f19_save);
 411     __ fld(f18, f18_save);
 412     __ fld(f9,  f9_save);
 413     __ fld(f8,  f8_save);
 414 
 415     __ ld(x27, x27_save);
 416     __ ld(x26, x26_save);
 417     __ ld(x25, x25_save);
 418     __ ld(x24, x24_save);
 419     __ ld(x23, x23_save);
 420     __ ld(x22, x22_save);
 421     __ ld(x21, x21_save);
 422     __ ld(x20, x20_save);
 423     __ ld(x19, x19_save);
 424     __ ld(x18, x18_save);
 425 
 426     __ ld(x9, x9_save);
 427 
 428     // restore frm
 429     Label skip_fsrm;
 430     __ ld(t0, frm_save);
 431     __ frrm(t1);
 432     __ beq(t0, t1, skip_fsrm);
 433     __ fsrm(t0);
 434     __ bind(skip_fsrm);
 435 
 436     __ ld(c_rarg0, call_wrapper);
 437     __ ld(c_rarg1, result);
 438     __ ld(c_rarg2, result_type);
 439     __ ld(c_rarg3, method);
 440     __ ld(c_rarg4, entry_point);
 441     __ ld(c_rarg5, parameters);
 442     __ ld(c_rarg6, parameter_size);
 443     __ ld(c_rarg7, thread);
 444 
 445     // leave frame and return to caller
 446     __ leave();
 447     __ ret();
 448 
 449     // handle return types different from T_INT
 450 
 451     __ BIND(is_long);
 452     __ sd(x10, Address(j_rarg2, 0));
 453     __ j(exit);
 454 
 455     __ BIND(is_float);
 456     __ fsw(j_farg0, Address(j_rarg2, 0), t0);
 457     __ j(exit);
 458 
 459     __ BIND(is_double);
 460     __ fsd(j_farg0, Address(j_rarg2, 0), t0);
 461     __ j(exit);
 462 
 463     return start;
 464   }
 465 
 466   // Return point for a Java call if there's an exception thrown in
 467   // Java code.  The exception is caught and transformed into a
 468   // pending exception stored in JavaThread that can be tested from
 469   // within the VM.
 470   //
 471   // Note: Usually the parameters are removed by the callee. In case
 472   // of an exception crossing an activation frame boundary, that is
 473   // not the case if the callee is compiled code => need to setup the
 474   // sp.
 475   //
 476   // x10: exception oop
 477 
 478   address generate_catch_exception() {
 479     StubId stub_id = StubId::stubgen_catch_exception_id;
 480     StubCodeMark mark(this, stub_id);
 481     address start = __ pc();
 482 
 483     // same as in generate_call_stub():
 484     const Address thread(fp, thread_off * wordSize);
 485 
 486 #ifdef ASSERT
 487     // verify that threads correspond
 488     {
 489       Label L, S;
 490       __ ld(t0, thread);
 491       __ bne(xthread, t0, S);
 492       __ get_thread(t0);
 493       __ beq(xthread, t0, L);
 494       __ bind(S);
 495       __ stop("StubRoutines::catch_exception: threads must correspond");
 496       __ bind(L);
 497     }
 498 #endif
 499 
 500     // set pending exception
 501     __ verify_oop(x10);
 502 
 503     __ sd(x10, Address(xthread, Thread::pending_exception_offset()));
 504     __ mv(t0, (address)__FILE__);
 505     __ sd(t0, Address(xthread, Thread::exception_file_offset()));
 506     __ mv(t0, (int)__LINE__);
 507     __ sw(t0, Address(xthread, Thread::exception_line_offset()));
 508 
 509     // complete return to VM
 510     assert(StubRoutines::_call_stub_return_address != nullptr,
 511            "_call_stub_return_address must have been generated before");
 512     __ j(RuntimeAddress(StubRoutines::_call_stub_return_address));
 513 
 514     return start;
 515   }
 516 
 517   // Continuation point for runtime calls returning with a pending
 518   // exception.  The pending exception check happened in the runtime
 519   // or native call stub.  The pending exception in Thread is
 520   // converted into a Java-level exception.
 521   //
 522   // Contract with Java-level exception handlers:
 523   // x10: exception
 524   // x13: throwing pc
 525   //
 526   // NOTE: At entry of this stub, exception-pc must be in RA !!
 527 
  // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no prolog
 530 
 531   address generate_forward_exception() {
 532     StubId stub_id = StubId::stubgen_forward_exception_id;
 533     StubCodeMark mark(this, stub_id);
 534     address start = __ pc();
 535 
 536     // Upon entry, RA points to the return address returning into
 537     // Java (interpreted or compiled) code; i.e., the return address
 538     // becomes the throwing pc.
 539     //
 540     // Arguments pushed before the runtime call are still on the stack
 541     // but the exception handler will reset the stack pointer ->
 542     // ignore them.  A potential result in registers can be ignored as
 543     // well.
 544 
 545 #ifdef ASSERT
 546     // make sure this code is only executed if there is a pending exception
 547     {
 548       Label L;
 549       __ ld(t0, Address(xthread, Thread::pending_exception_offset()));
 550       __ bnez(t0, L);
 551       __ stop("StubRoutines::forward exception: no pending exception (1)");
 552       __ bind(L);
 553     }
 554 #endif
 555 
 556     // compute exception handler into x9
 557 
 558     // call the VM to find the handler address associated with the
 559     // caller address. pass thread in x10 and caller pc (ret address)
 560     // in x11. n.b. the caller pc is in ra, unlike x86 where it is on
 561     // the stack.
 562     __ mv(c_rarg1, ra);
 563     // ra will be trashed by the VM call so we move it to x9
 564     // (callee-saved) because we also need to pass it to the handler
 565     // returned by this call.
 566     __ mv(x9, ra);
 567     BLOCK_COMMENT("call exception_handler_for_return_address");
 568     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 569                          SharedRuntime::exception_handler_for_return_address),
 570                     xthread, c_rarg1);
 571     // we should not really care that ra is no longer the callee
 572     // address. we saved the value the handler needs in x9 so we can
 573     // just copy it to x13. however, the C2 handler will push its own
 574     // frame and then calls into the VM and the VM code asserts that
 575     // the PC for the frame above the handler belongs to a compiled
 576     // Java method. So, we restore ra here to satisfy that assert.
 577     __ mv(ra, x9);
 578     // setup x10 & x13 & clear pending exception
 579     __ mv(x13, x9);
 580     __ mv(x9, x10);
 581     __ ld(x10, Address(xthread, Thread::pending_exception_offset()));
 582     __ sd(zr, Address(xthread, Thread::pending_exception_offset()));
 583 
 584 #ifdef ASSERT
 585     // make sure exception is set
 586     {
 587       Label L;
 588       __ bnez(x10, L);
 589       __ stop("StubRoutines::forward exception: no pending exception (2)");
 590       __ bind(L);
 591     }
 592 #endif
 593 
 594     // continue at exception handler
 595     // x10: exception
 596     // x13: throwing pc
 597     // x9: exception handler
 598     __ verify_oop(x10);
 599     __ jr(x9);
 600 
 601     return start;
 602   }
 603 
 604   // Non-destructive plausibility checks for oops
 605   //
 606   // Arguments:
 607   //    x10: oop to verify
 608   //    t0: error message
 609   //
 610   // Stack after saving c_rarg3:
 611   //    [tos + 0]: saved c_rarg3
 612   //    [tos + 1]: saved c_rarg2
 613   //    [tos + 2]: saved ra
 614   //    [tos + 3]: saved t1
 615   //    [tos + 4]: saved x10
 616   //    [tos + 5]: saved t0
 617   address generate_verify_oop() {
 618 
 619     StubId stub_id = StubId::stubgen_verify_oop_id;
 620     StubCodeMark mark(this, stub_id);
 621     address start = __ pc();
 622 
 623     Label exit, error;
 624 
 625     __ push_reg(RegSet::of(c_rarg2, c_rarg3), sp); // save c_rarg2 and c_rarg3
 626 
 627     __ la(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 628     __ ld(c_rarg3, Address(c_rarg2));
 629     __ addi(c_rarg3, c_rarg3, 1);
 630     __ sd(c_rarg3, Address(c_rarg2));
 631 
 632     // object is in x10
 633     // make sure object is 'reasonable'
 634     __ beqz(x10, exit); // if obj is null it is OK
 635 
 636     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
 637     bs_asm->check_oop(_masm, x10, c_rarg2, c_rarg3, error);
 638 
 639     // return if everything seems ok
 640     __ bind(exit);
 641 
 642     __ pop_reg(RegSet::of(c_rarg2, c_rarg3), sp);  // pop c_rarg2 and c_rarg3
 643     __ ret();
 644 
 645     // handle errors
 646     __ bind(error);
 647     __ pop_reg(RegSet::of(c_rarg2, c_rarg3), sp); // pop c_rarg2 and c_rarg3
 648 
 649     __ push_reg(RegSet::range(x0, x31), sp);
 650     // debug(char* msg, int64_t pc, int64_t regs[])
 651     __ mv(c_rarg0, t0);             // pass address of error message
 652     __ mv(c_rarg1, ra);             // pass return address
 653     __ mv(c_rarg2, sp);             // pass address of regs on stack
 654 #ifndef PRODUCT
 655     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 656 #endif
 657     BLOCK_COMMENT("call MacroAssembler::debug");
 658     __ rt_call(CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 659     __ ebreak();
 660 
 661     return start;
 662   }
 663 
 664   // The inner part of zero_words().
 665   //
 666   // Inputs:
 667   // x28: the HeapWord-aligned base address of an array to zero.
 668   // x29: the count in HeapWords, x29 > 0.
 669   //
 670   // Returns x28 and x29, adjusted for the caller to clear.
 671   // x28: the base address of the tail of words left to clear.
 672   // x29: the number of words in the tail.
 673   //      x29 < MacroAssembler::zero_words_block_size.
 674 
 675   address generate_zero_blocks() {
 676     Label done;
 677 
 678     const Register base = x28, cnt = x29, tmp1 = x30, tmp2 = x31;
 679 
 680     __ align(CodeEntryAlignment);
 681     StubId stub_id = StubId::stubgen_zero_blocks_id;
 682     StubCodeMark mark(this, stub_id);
 683     address start = __ pc();
 684 
 685     if (UseBlockZeroing) {
 686       int zicboz_block_size = VM_Version::zicboz_block_size.value();
 687       // Ensure count >= 2 * zicboz_block_size so that it still deserves
 688       // a cbo.zero after alignment.
 689       Label small;
 690       int low_limit = MAX2(2 * zicboz_block_size, (int)BlockZeroingLowLimit) / wordSize;
 691       __ mv(tmp1, low_limit);
 692       __ blt(cnt, tmp1, small);
 693       __ zero_dcache_blocks(base, cnt, tmp1, tmp2);
 694       __ bind(small);
 695     }
 696 
 697     {
 698       // Clear the remaining blocks.
 699       Label loop;
 700       __ mv(tmp1, MacroAssembler::zero_words_block_size);
 701       __ blt(cnt, tmp1, done);
 702       __ bind(loop);
 703       for (int i = 0; i < MacroAssembler::zero_words_block_size; i++) {
 704         __ sd(zr, Address(base, i * wordSize));
 705       }
 706       __ addi(base, base, MacroAssembler::zero_words_block_size * wordSize);
 707       __ subi(cnt, cnt, MacroAssembler::zero_words_block_size);
 708       __ bge(cnt, tmp1, loop);
 709       __ bind(done);
 710     }
 711 
 712     __ ret();
 713 
 714     return start;
 715   }
 716 
 717   typedef enum {
 718     copy_forwards = 1,
 719     copy_backwards = -1
 720   } copy_direction;
 721 
 722   // Bulk copy of blocks of 8 words.
 723   //
 724   // count is a count of words.
 725   //
 726   // Precondition: count >= 8
 727   //
 728   // Postconditions:
 729   //
 730   // The least significant bit of count contains the remaining count
 731   // of words to copy.  The rest of count is trash.
 732   //
 733   // s and d are adjusted to point to the remaining words to copy
 734   //
 735   address generate_copy_longs(StubId stub_id, Register s, Register d, Register count) {
 736     BasicType type;
 737     copy_direction direction;
 738     switch (stub_id) {
 739     case StubId::stubgen_copy_byte_f_id:
 740       direction = copy_forwards;
 741       type = T_BYTE;
 742       break;
 743     case StubId::stubgen_copy_byte_b_id:
 744       direction = copy_backwards;
 745       type = T_BYTE;
 746       break;
 747     default:
 748       ShouldNotReachHere();
 749     }
 750     int unit = wordSize * direction;
 751     int bias = wordSize;
 752 
 753     const Register tmp_reg0 = x13, tmp_reg1 = x14, tmp_reg2 = x15, tmp_reg3 = x16,
 754       tmp_reg4 = x17, tmp_reg5 = x7, tmp_reg6 = x28, tmp_reg7 = x29;
 755 
 756     const Register stride = x30;
 757 
 758     assert_different_registers(t0, tmp_reg0, tmp_reg1, tmp_reg2, tmp_reg3,
 759       tmp_reg4, tmp_reg5, tmp_reg6, tmp_reg7);
 760     assert_different_registers(s, d, count, t0);
 761 
 762     Label again, drain;
 763     StubCodeMark mark(this, stub_id);
 764     __ align(CodeEntryAlignment);
 765     address start = __ pc();
 766 
 767     if (direction == copy_forwards) {
 768       __ sub(s, s, bias);
 769       __ sub(d, d, bias);
 770     }
 771 
 772 #ifdef ASSERT
 773     // Make sure we are never given < 8 words
 774     {
 775       Label L;
 776 
 777       __ mv(t0, 8);
 778       __ bge(count, t0, L);
 779       __ stop("genrate_copy_longs called with < 8 words");
 780       __ bind(L);
 781     }
 782 #endif
 783 
 784     __ ld(tmp_reg0, Address(s, 1 * unit));
 785     __ ld(tmp_reg1, Address(s, 2 * unit));
 786     __ ld(tmp_reg2, Address(s, 3 * unit));
 787     __ ld(tmp_reg3, Address(s, 4 * unit));
 788     __ ld(tmp_reg4, Address(s, 5 * unit));
 789     __ ld(tmp_reg5, Address(s, 6 * unit));
 790     __ ld(tmp_reg6, Address(s, 7 * unit));
 791     __ ld(tmp_reg7, Address(s, 8 * unit));
 792     __ addi(s, s, 8 * unit);
 793 
 794     __ subi(count, count, 16);
 795     __ bltz(count, drain);
 796 
 797     __ bind(again);
 798 
 799     __ sd(tmp_reg0, Address(d, 1 * unit));
 800     __ sd(tmp_reg1, Address(d, 2 * unit));
 801     __ sd(tmp_reg2, Address(d, 3 * unit));
 802     __ sd(tmp_reg3, Address(d, 4 * unit));
 803     __ sd(tmp_reg4, Address(d, 5 * unit));
 804     __ sd(tmp_reg5, Address(d, 6 * unit));
 805     __ sd(tmp_reg6, Address(d, 7 * unit));
 806     __ sd(tmp_reg7, Address(d, 8 * unit));
 807 
 808     __ ld(tmp_reg0, Address(s, 1 * unit));
 809     __ ld(tmp_reg1, Address(s, 2 * unit));
 810     __ ld(tmp_reg2, Address(s, 3 * unit));
 811     __ ld(tmp_reg3, Address(s, 4 * unit));
 812     __ ld(tmp_reg4, Address(s, 5 * unit));
 813     __ ld(tmp_reg5, Address(s, 6 * unit));
 814     __ ld(tmp_reg6, Address(s, 7 * unit));
 815     __ ld(tmp_reg7, Address(s, 8 * unit));
 816 
 817     __ addi(s, s, 8 * unit);
 818     __ addi(d, d, 8 * unit);
 819 
 820     __ subi(count, count, 8);
 821     __ bgez(count, again);
 822 
 823     // Drain
 824     __ bind(drain);
 825 
 826     __ sd(tmp_reg0, Address(d, 1 * unit));
 827     __ sd(tmp_reg1, Address(d, 2 * unit));
 828     __ sd(tmp_reg2, Address(d, 3 * unit));
 829     __ sd(tmp_reg3, Address(d, 4 * unit));
 830     __ sd(tmp_reg4, Address(d, 5 * unit));
 831     __ sd(tmp_reg5, Address(d, 6 * unit));
 832     __ sd(tmp_reg6, Address(d, 7 * unit));
 833     __ sd(tmp_reg7, Address(d, 8 * unit));
 834     __ addi(d, d, 8 * unit);
 835 
 836     {
 837       Label L1, L2;
 838       __ test_bit(t0, count, 2);
 839       __ beqz(t0, L1);
 840 
 841       __ ld(tmp_reg0, Address(s, 1 * unit));
 842       __ ld(tmp_reg1, Address(s, 2 * unit));
 843       __ ld(tmp_reg2, Address(s, 3 * unit));
 844       __ ld(tmp_reg3, Address(s, 4 * unit));
 845       __ addi(s, s, 4 * unit);
 846 
 847       __ sd(tmp_reg0, Address(d, 1 * unit));
 848       __ sd(tmp_reg1, Address(d, 2 * unit));
 849       __ sd(tmp_reg2, Address(d, 3 * unit));
 850       __ sd(tmp_reg3, Address(d, 4 * unit));
 851       __ addi(d, d, 4 * unit);
 852 
 853       __ bind(L1);
 854 
 855       if (direction == copy_forwards) {
 856         __ addi(s, s, bias);
 857         __ addi(d, d, bias);
 858       }
 859 
 860       __ test_bit(t0, count, 1);
 861       __ beqz(t0, L2);
 862       if (direction == copy_backwards) {
 863         __ addi(s, s, 2 * unit);
 864         __ ld(tmp_reg0, Address(s));
 865         __ ld(tmp_reg1, Address(s, wordSize));
 866         __ addi(d, d, 2 * unit);
 867         __ sd(tmp_reg0, Address(d));
 868         __ sd(tmp_reg1, Address(d, wordSize));
 869       } else {
 870         __ ld(tmp_reg0, Address(s));
 871         __ ld(tmp_reg1, Address(s, wordSize));
 872         __ addi(s, s, 2 * unit);
 873         __ sd(tmp_reg0, Address(d));
 874         __ sd(tmp_reg1, Address(d, wordSize));
 875         __ addi(d, d, 2 * unit);
 876       }
 877       __ bind(L2);
 878     }
 879 
 880     __ ret();
 881 
 882     return start;
 883   }
 884 
 885   typedef void (MacroAssembler::*copy_insn)(Register Rd, const Address &adr, Register temp);
 886 
 887   void copy_memory_v(Register s, Register d, Register count, int step) {
 888     bool is_backward = step < 0;
 889     int granularity = g_uabs(step);
 890 
 891     const Register src = x30, dst = x31, vl = x14, cnt = x15, tmp1 = x16, tmp2 = x17;
 892     assert_different_registers(s, d, cnt, vl, tmp1, tmp2);
 893     Assembler::SEW sew = Assembler::elembytes_to_sew(granularity);
 894     Label loop_forward, loop_backward, done;
 895 
 896     __ mv(dst, d);
 897     __ mv(src, s);
 898     __ mv(cnt, count);
 899 
 900     __ bind(loop_forward);
 901     __ vsetvli(vl, cnt, sew, Assembler::m8);
 902     if (is_backward) {
 903       __ bne(vl, cnt, loop_backward);
 904     }
 905 
 906     __ vlex_v(v0, src, sew);
 907     __ sub(cnt, cnt, vl);
 908     if (sew != Assembler::e8) {
 909       // when sew == e8 (e.g., elem size is 1 byte), slli R, R, 0 is a nop and unnecessary
 910       __ slli(vl, vl, sew);
 911     }
 912     __ add(src, src, vl);
 913 
 914     __ vsex_v(v0, dst, sew);
 915     __ add(dst, dst, vl);
 916     __ bnez(cnt, loop_forward);
 917 
 918     if (is_backward) {
 919       __ j(done);
 920 
 921       __ bind(loop_backward);
 922       __ sub(t0, cnt, vl);
 923       if (sew != Assembler::e8) {
 924         // when sew == e8 (e.g., elem size is 1 byte), slli R, R, 0 is a nop and unnecessary
 925         __ slli(t0, t0, sew);
 926       }
 927       __ add(tmp1, s, t0);
 928       __ vlex_v(v0, tmp1, sew);
 929       __ add(tmp2, d, t0);
 930       __ vsex_v(v0, tmp2, sew);
 931       __ sub(cnt, cnt, vl);
 932       __ bnez(cnt, loop_forward);
 933       __ bind(done);
 934     }
 935   }
 936 
  // All-singing all-dancing memory copy.
  //
  // Copy count units of memory from s to d.  The size of a unit is
  // step, which can be positive or negative depending on the direction
  // of copy.
  //
  // Parameters:
  //   decorators - access decorators handed to the barrier set assembler
  //   type       - element type handed to the barrier set assembler
  //   is_aligned - when true the alignment fix-up path is not emitted and
  //                the 32-byte / 8-byte loops are entered directly
  //   s, d       - source / destination address registers
  //   count      - number of units to copy; zero is allowed
  //   step       - unit size in bytes; a negative value selects a backward
  //                (high-to-low) copy
  //
  // When UseRVV is set and either the type is not a reference type or the
  // barrier set assembler reports RVV arraycopy support, the work is
  // delegated to copy_memory_v. Otherwise the emitted code works in
  // x13-x17, x30, x31 and t0 and hands x28, x29 and t2 to the barrier set
  // assembler as temporaries.
  //
  void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
                   Register s, Register d, Register count, int step) {
    BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
    if (UseRVV && (!is_reference_type(type) || bs_asm->supports_rvv_arraycopy())) {
      return copy_memory_v(s, d, count, step);
    }

    bool is_backwards = step < 0;
    int granularity = g_uabs(step);

    const Register src = x30, dst = x31, cnt = x15, tmp3 = x16, tmp4 = x17, tmp5 = x14, tmp6 = x13;
    const Register gct1 = x28, gct2 = x29, gct3 = t2;

    Label same_aligned;
    Label copy_big, copy32_loop, copy8_loop, copy_small, done;

    // The size of copy32_loop body increases significantly with ZGC GC barriers.
    // Need conditional far branches to reach a point beyond the loop in this case.
    bool is_far = UseZGC;

    // Nothing to do for an empty copy.
    __ beqz(count, done, is_far);
    // From here on cnt holds the number of BYTES still to be copied.
    __ slli(cnt, count, exact_log2(granularity));
    if (is_backwards) {
      // A backward copy starts just past the end of both ranges and
      // pre-decrements before every access.
      __ add(src, s, cnt);
      __ add(dst, d, cnt);
    } else {
      __ mv(src, s);
      __ mv(dst, d);
    }

    if (is_aligned) {
      // Pick the widest loop the byte count allows.
      __ subi(t0, cnt, 32);
      __ bgez(t0, copy32_loop);
      __ subi(t0, cnt, 8);
      __ bgez(t0, copy8_loop, is_far);
      __ j(copy_small);
    } else {
      // Fewer than 16 bytes: copy unit by unit.
      __ mv(t0, 16);
      __ blt(cnt, t0, copy_small, is_far);

      // src and dst differ in their low three bits, so they can never be
      // 8-byte aligned at the same time: copy unit by unit.
      __ xorr(t0, src, dst);
      __ andi(t0, t0, 0b111);
      __ bnez(t0, copy_small, is_far);

      // Copy single units until src (and therefore dst) is 8-byte aligned.
      __ bind(same_aligned);
      __ andi(t0, src, 0b111);
      __ beqz(t0, copy_big);
      if (is_backwards) {
        __ addi(src, src, step);
        __ addi(dst, dst, step);
      }
      bs_asm->copy_load_at(_masm, decorators, type, granularity, tmp3, Address(src), gct1);
      bs_asm->copy_store_at(_masm, decorators, type, granularity, Address(dst), tmp3, gct1, gct2, gct3);
      if (!is_backwards) {
        __ addi(src, src, step);
        __ addi(dst, dst, step);
      }
      __ subi(cnt, cnt, granularity);
      __ beqz(cnt, done, is_far);
      __ j(same_aligned);

      // Aligned now: skip the 32-byte loop when fewer than 32 bytes remain.
      __ bind(copy_big);
      __ mv(t0, 32);
      __ blt(cnt, t0, copy8_loop, is_far);
    }

    // Main loop: 32 bytes (four 8-byte accesses) per iteration.
    __ bind(copy32_loop);
    if (is_backwards) {
      __ subi(src, src, wordSize * 4);
      __ subi(dst, dst, wordSize * 4);
    }
    // we first load 32 bytes, then write it, so the direction here doesn't matter
    bs_asm->copy_load_at(_masm, decorators, type, 8, tmp3, Address(src),     gct1);
    bs_asm->copy_load_at(_masm, decorators, type, 8, tmp4, Address(src, 8),  gct1);
    bs_asm->copy_load_at(_masm, decorators, type, 8, tmp5, Address(src, 16), gct1);
    bs_asm->copy_load_at(_masm, decorators, type, 8, tmp6, Address(src, 24), gct1);

    bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst),     tmp3, gct1, gct2, gct3);
    bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst, 8),  tmp4, gct1, gct2, gct3);
    bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst, 16), tmp5, gct1, gct2, gct3);
    bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst, 24), tmp6, gct1, gct2, gct3);

    if (!is_backwards) {
      __ addi(src, src, wordSize * 4);
      __ addi(dst, dst, wordSize * 4);
    }
    // t0 = (cnt after this iteration) - 32; it is computed from the old cnt
    // before cnt itself is updated.
    __ subi(t0, cnt, 32 + wordSize * 4);
    __ subi(cnt, cnt, wordSize * 4);
    __ bgez(t0, copy32_loop); // cnt >= 32, do next loop

    __ beqz(cnt, done); // if that's all - done

    __ subi(t0, cnt, 8); // if not - copy the remainder
    __ bltz(t0, copy_small); // cnt < 8, go to copy_small, else fall through to copy8_loop

    // 8 bytes per iteration.
    __ bind(copy8_loop);
    if (is_backwards) {
      __ subi(src, src, wordSize);
      __ subi(dst, dst, wordSize);
    }
    bs_asm->copy_load_at(_masm, decorators, type, 8, tmp3, Address(src), gct1);
    bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst), tmp3, gct1, gct2, gct3);

    if (!is_backwards) {
      __ addi(src, src, wordSize);
      __ addi(dst, dst, wordSize);
    }
    // Same pattern as above: t0 = (cnt after this iteration) - 8.
    __ subi(t0, cnt, 8 + wordSize);
    __ subi(cnt, cnt, wordSize);
    __ bgez(t0, copy8_loop); // cnt >= 8, do next loop

    __ beqz(cnt, done); // if that's all - done

    // One unit (granularity bytes) per iteration until cnt is used up.
    __ bind(copy_small);
    if (is_backwards) {
      __ addi(src, src, step);
      __ addi(dst, dst, step);
    }

    bs_asm->copy_load_at(_masm, decorators, type, granularity, tmp3, Address(src), gct1);
    bs_asm->copy_store_at(_masm, decorators, type, granularity, Address(dst), tmp3, gct1, gct2, gct3);

    if (!is_backwards) {
      __ addi(src, src, step);
      __ addi(dst, dst, step);
    }
    __ subi(cnt, cnt, granularity);
    __ bgtz(cnt, copy_small);

    __ bind(done);
  }
1074 
1075   // Scan over array at a for count oops, verifying each one.
1076   // Preserves a and count, clobbers t0 and t1.
1077   void verify_oop_array(size_t size, Register a, Register count, Register temp) {
1078     Label loop, end;
1079     __ mv(t1, zr);
1080     __ slli(t0, count, exact_log2(size));
1081     __ bind(loop);
1082     __ bgeu(t1, t0, end);
1083 
1084     __ add(temp, a, t1);
1085     if (size == (size_t)wordSize) {
1086       __ ld(temp, Address(temp, 0));
1087       __ verify_oop(temp);
1088     } else {
1089       __ lwu(temp, Address(temp, 0));
1090       __ decode_heap_oop(temp); // calls verify_oop
1091     }
1092     __ add(t1, t1, size);
1093     __ j(loop);
1094     __ bind(end);
1095   }
1096 
1097   // Arguments:
1098   //   stub_id - is used to name the stub and identify all details of
1099   //             how to perform the copy.
1100   //
1101   //   nopush_entry - is assigned to the stub's post push entry point
1102   //                  unless it is null
1103   //
1104   // Inputs:
1105   //   c_rarg0   - source array address
1106   //   c_rarg1   - destination array address
1107   //   c_rarg2   - element count, treated as ssize_t, can be zero
1108   //
1109   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1110   // the hardware handle it.  The two dwords within qwords that span
1111   // cache line boundaries will still be loaded and stored atomically.
1112   //
1113   // Side Effects: nopush_entry is set to the (post push) entry point
1114   //               so it can be used by the corresponding conjoint
1115   //               copy method
1116   //
1117   address generate_disjoint_copy(StubId stub_id, address* nopush_entry) {
1118     size_t size;
1119     bool aligned;
1120     bool is_oop;
1121     bool dest_uninitialized;
1122     switch (stub_id) {
1123     case StubId::stubgen_jbyte_disjoint_arraycopy_id:
1124       size = sizeof(jbyte);
1125       aligned = false;
1126       is_oop = false;
1127       dest_uninitialized = false;
1128       break;
1129     case StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id:
1130       size = sizeof(jbyte);
1131       aligned = true;
1132       is_oop = false;
1133       dest_uninitialized = false;
1134       break;
1135     case StubId::stubgen_jshort_disjoint_arraycopy_id:
1136       size = sizeof(jshort);
1137       aligned = false;
1138       is_oop = false;
1139       dest_uninitialized = false;
1140       break;
1141     case StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id:
1142       size = sizeof(jshort);
1143       aligned = true;
1144       is_oop = false;
1145       dest_uninitialized = false;
1146       break;
1147     case StubId::stubgen_jint_disjoint_arraycopy_id:
1148       size = sizeof(jint);
1149       aligned = false;
1150       is_oop = false;
1151       dest_uninitialized = false;
1152       break;
1153     case StubId::stubgen_arrayof_jint_disjoint_arraycopy_id:
1154       size = sizeof(jint);
1155       aligned = true;
1156       is_oop = false;
1157       dest_uninitialized = false;
1158       break;
1159     case StubId::stubgen_jlong_disjoint_arraycopy_id:
1160       // since this is always aligned we can (should!) use the same
1161       // stub as for case arrayof_jlong_disjoint_arraycopy
1162       ShouldNotReachHere();
1163       break;
1164     case StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id:
1165       size = sizeof(jlong);
1166       aligned = true;
1167       is_oop = false;
1168       dest_uninitialized = false;
1169       break;
1170     case StubId::stubgen_oop_disjoint_arraycopy_id:
1171       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1172       aligned = !UseCompressedOops;
1173       is_oop = true;
1174       dest_uninitialized = false;
1175       break;
1176     case StubId::stubgen_arrayof_oop_disjoint_arraycopy_id:
1177       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1178       aligned = !UseCompressedOops;
1179       is_oop = true;
1180       dest_uninitialized = false;
1181       break;
1182     case StubId::stubgen_oop_disjoint_arraycopy_uninit_id:
1183       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1184       aligned = !UseCompressedOops;
1185       is_oop = true;
1186       dest_uninitialized = true;
1187       break;
1188     case StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id:
1189       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1190       aligned = !UseCompressedOops;
1191       is_oop = true;
1192       dest_uninitialized = true;
1193       break;
1194     default:
1195       ShouldNotReachHere();
1196       break;
1197     }
1198 
1199     const Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1200     RegSet saved_reg = RegSet::of(s, d, count);
1201     __ align(CodeEntryAlignment);
1202     StubCodeMark mark(this, stub_id);
1203     address start = __ pc();
1204     __ enter();
1205 
1206     if (nopush_entry != nullptr) {
1207      *nopush_entry = __ pc();
1208       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1209       BLOCK_COMMENT("Entry:");
1210     }
1211 
1212     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1213     if (dest_uninitialized) {
1214       decorators |= IS_DEST_UNINITIALIZED;
1215     }
1216     if (aligned) {
1217       decorators |= ARRAYCOPY_ALIGNED;
1218     }
1219 
1220     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1221     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
1222 
1223     if (is_oop) {
1224       // save regs before copy_memory
1225       __ push_reg(RegSet::of(d, count), sp);
1226     }
1227 
1228     {
1229       // UnsafeMemoryAccess page error: continue after unsafe access
1230       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1231       UnsafeMemoryAccessMark umam(this, add_entry, true);
1232       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
1233     }
1234 
1235     if (is_oop) {
1236       __ pop_reg(RegSet::of(d, count), sp);
1237       if (VerifyOops) {
1238         verify_oop_array(size, d, count, t2);
1239       }
1240     }
1241 
1242     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, t0);
1243 
1244     __ leave();
1245     __ mv(x10, zr); // return 0
1246     __ ret();
1247     return start;
1248   }
1249 
1250   // Arguments:
1251   //   stub_id - is used to name the stub and identify all details of
1252   //             how to perform the copy.
1253   //
1254   //   nooverlap_target - identifes the (post push) entry for the
1255   //             corresponding disjoint copy routine which can be
1256   //             jumped to if the ranges do not actually overlap
1257   //
1258   //   nopush_entry - is assigned to the stub's post push entry point
1259   //                 unless it is null
1260   //
1261   // Inputs:
1262   //   c_rarg0   - source array address
1263   //   c_rarg1   - destination array address
1264   //   c_rarg2   - element count, treated as ssize_t, can be zero
1265   //
1266   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1267   // the hardware handle it.  The two dwords within qwords that span
1268   // cache line boundaries will still be loaded and stored atomically.
1269   //
1270   // Side Effects:
1271   //   nopush_entry is set to the no-overlap entry point so it can be
1272   //   used by some other conjoint copy method
1273   //
1274   address generate_conjoint_copy(StubId stub_id, address nooverlap_target, address *nopush_entry) {
1275     const Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1276     RegSet saved_regs = RegSet::of(s, d, count);
1277     int size;
1278     bool aligned;
1279     bool is_oop;
1280     bool dest_uninitialized;
1281     switch (stub_id) {
1282     case StubId::stubgen_jbyte_arraycopy_id:
1283       size = sizeof(jbyte);
1284       aligned = false;
1285       is_oop = false;
1286       dest_uninitialized = false;
1287       break;
1288     case StubId::stubgen_arrayof_jbyte_arraycopy_id:
1289       size = sizeof(jbyte);
1290       aligned = true;
1291       is_oop = false;
1292       dest_uninitialized = false;
1293       break;
1294     case StubId::stubgen_jshort_arraycopy_id:
1295       size = sizeof(jshort);
1296       aligned = false;
1297       is_oop = false;
1298       dest_uninitialized = false;
1299       break;
1300     case StubId::stubgen_arrayof_jshort_arraycopy_id:
1301       size = sizeof(jshort);
1302       aligned = true;
1303       is_oop = false;
1304       dest_uninitialized = false;
1305       break;
1306     case StubId::stubgen_jint_arraycopy_id:
1307       size = sizeof(jint);
1308       aligned = false;
1309       is_oop = false;
1310       dest_uninitialized = false;
1311       break;
1312     case StubId::stubgen_arrayof_jint_arraycopy_id:
1313       size = sizeof(jint);
1314       aligned = true;
1315       is_oop = false;
1316       dest_uninitialized = false;
1317       break;
1318     case StubId::stubgen_jlong_arraycopy_id:
1319       // since this is always aligned we can (should!) use the same
1320       // stub as for case arrayof_jlong_disjoint_arraycopy
1321       ShouldNotReachHere();
1322       break;
1323     case StubId::stubgen_arrayof_jlong_arraycopy_id:
1324       size = sizeof(jlong);
1325       aligned = true;
1326       is_oop = false;
1327       dest_uninitialized = false;
1328       break;
1329     case StubId::stubgen_oop_arraycopy_id:
1330       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1331       aligned = !UseCompressedOops;
1332       is_oop = true;
1333       dest_uninitialized = false;
1334       break;
1335     case StubId::stubgen_arrayof_oop_arraycopy_id:
1336       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1337       aligned = !UseCompressedOops;
1338       is_oop = true;
1339       dest_uninitialized = false;
1340       break;
1341     case StubId::stubgen_oop_arraycopy_uninit_id:
1342       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1343       aligned = !UseCompressedOops;
1344       is_oop = true;
1345       dest_uninitialized = true;
1346       break;
1347     case StubId::stubgen_arrayof_oop_arraycopy_uninit_id:
1348       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1349       aligned = !UseCompressedOops;
1350       is_oop = true;
1351       dest_uninitialized = true;
1352       break;
1353     default:
1354       ShouldNotReachHere();
1355     }
1356 
1357     StubCodeMark mark(this, stub_id);
1358     address start = __ pc();
1359     __ enter();
1360 
1361     if (nopush_entry != nullptr) {
1362       *nopush_entry = __ pc();
1363       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1364       BLOCK_COMMENT("Entry:");
1365     }
1366 
1367     // use fwd copy when (d-s) above_equal (count*size)
1368     __ sub(t0, d, s);
1369     __ slli(t1, count, exact_log2(size));
1370     Label L_continue;
1371     __ bltu(t0, t1, L_continue);
1372     __ j(RuntimeAddress(nooverlap_target));
1373     __ bind(L_continue);
1374 
1375     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1376     if (dest_uninitialized) {
1377       decorators |= IS_DEST_UNINITIALIZED;
1378     }
1379     if (aligned) {
1380       decorators |= ARRAYCOPY_ALIGNED;
1381     }
1382 
1383     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1384     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
1385 
1386     if (is_oop) {
1387       // save regs before copy_memory
1388       __ push_reg(RegSet::of(d, count), sp);
1389     }
1390 
1391     {
1392       // UnsafeMemoryAccess page error: continue after unsafe access
1393       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1394       UnsafeMemoryAccessMark umam(this, add_entry, true);
1395       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
1396     }
1397 
1398     if (is_oop) {
1399       __ pop_reg(RegSet::of(d, count), sp);
1400       if (VerifyOops) {
1401         verify_oop_array(size, d, count, t2);
1402       }
1403     }
1404     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, t0);
1405     __ leave();
1406     __ mv(x10, zr); // return 0
1407     __ ret();
1408     return start;
1409   }
1410 
1411   // Helper for generating a dynamic type check.
1412   // Smashes t0, t1.
1413   void generate_type_check(Register sub_klass,
1414                            Register super_check_offset,
1415                            Register super_klass,
1416                            Register result,
1417                            Register tmp1,
1418                            Register tmp2,
1419                            Label& L_success) {
1420     assert_different_registers(sub_klass, super_check_offset, super_klass);
1421 
1422     BLOCK_COMMENT("type_check:");
1423 
1424     Label L_miss;
1425 
1426     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, nullptr, super_check_offset);
1427     __ check_klass_subtype_slow_path(sub_klass, super_klass, tmp1, tmp2, &L_success, nullptr);
1428 
1429     // Fall through on failure!
1430     __ BIND(L_miss);
1431   }
1432 
1433   //
1434   //  Generate checkcasting array copy stub
1435   //
1436   //  Input:
1437   //    c_rarg0   - source array address
1438   //    c_rarg1   - destination array address
1439   //    c_rarg2   - element count, treated as ssize_t, can be zero
1440   //    c_rarg3   - size_t ckoff (super_check_offset)
1441   //    c_rarg4   - oop ckval (super_klass)
1442   //
1443   //  Output:
1444   //    x10 ==  0  -  success
1445   //    x10 == -1^K - failure, where K is partial transfer count
1446   //
1447   address generate_checkcast_copy(StubId stub_id, address* nopush_entry) {
1448     bool dest_uninitialized;
1449     switch (stub_id) {
1450     case StubId::stubgen_checkcast_arraycopy_id:
1451       dest_uninitialized = false;
1452       break;
1453     case StubId::stubgen_checkcast_arraycopy_uninit_id:
1454       dest_uninitialized = true;
1455       break;
1456     default:
1457       ShouldNotReachHere();
1458     }
1459 
1460     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1461 
1462     // Input registers (after setup_arg_regs)
1463     const Register from        = c_rarg0;   // source array address
1464     const Register to          = c_rarg1;   // destination array address
1465     const Register count       = c_rarg2;   // elementscount
1466     const Register ckoff       = c_rarg3;   // super_check_offset
1467     const Register ckval       = c_rarg4;   // super_klass
1468 
1469     RegSet wb_pre_saved_regs   = RegSet::range(c_rarg0, c_rarg4);
1470 
1471     // Registers used as temps (x7, x9, x18 are save-on-entry)
1472     const Register count_save  = x19;       // orig elementscount
1473     const Register start_to    = x18;       // destination array start address
1474     const Register copied_oop  = x7;        // actual oop copied
1475     const Register r9_klass    = x9;        // oop._klass
1476 
1477     // Registers used as gc temps (x15, x16, x17 are save-on-call)
1478     const Register gct1 = x15, gct2 = x16, gct3 = x17;
1479 
1480     //---------------------------------------------------------------
1481     // Assembler stub will be used for this call to arraycopy
1482     // if the two arrays are subtypes of Object[] but the
1483     // destination array type is not equal to or a supertype
1484     // of the source type.  Each element must be separately
1485     // checked.
1486 
1487     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1488                                copied_oop, r9_klass, count_save);
1489 
1490     __ align(CodeEntryAlignment);
1491     StubCodeMark mark(this, stub_id);
1492     address start = __ pc();
1493 
1494     __ enter(); // required for proper stackwalking of RuntimeStub frame
1495 
1496     // Caller of this entry point must set up the argument registers.
1497     if (nopush_entry != nullptr) {
1498       *nopush_entry = __ pc();
1499       BLOCK_COMMENT("Entry:");
1500     }
1501 
1502     // Empty array:  Nothing to do
1503     __ beqz(count, L_done);
1504 
1505     __ push_reg(RegSet::of(x7, x9, x18, x19), sp);
1506 
1507 #ifdef ASSERT
1508     BLOCK_COMMENT("assert consistent ckoff/ckval");
1509     // The ckoff and ckval must be mutually consistent,
1510     // even though caller generates both.
1511     { Label L;
1512       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1513       __ lwu(start_to, Address(ckval, sco_offset));
1514       __ beq(ckoff, start_to, L);
1515       __ stop("super_check_offset inconsistent");
1516       __ bind(L);
1517     }
1518 #endif //ASSERT
1519 
1520     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1521     if (dest_uninitialized) {
1522       decorators |= IS_DEST_UNINITIALIZED;
1523     }
1524 
1525     bool is_oop = true;
1526     int element_size = UseCompressedOops ? 4 : 8;
1527 
1528     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1529     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
1530 
1531     // save the original count
1532     __ mv(count_save, count);
1533 
1534     // Copy from low to high addresses
1535     __ mv(start_to, to);              // Save destination array start address
1536     __ j(L_load_element);
1537 
1538     // ======== begin loop ========
1539     // (Loop is rotated; its entry is L_load_element.)
1540     // Loop control:
1541     //   for count to 0 do
1542     //     copied_oop = load_heap_oop(from++)
1543     //     ... generate_type_check ...
1544     //     store_heap_oop(to++, copied_oop)
1545     //   end
1546 
1547     __ align(OptoLoopAlignment);
1548 
1549     __ BIND(L_store_element);
1550     bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
1551                       Address(to, 0), copied_oop,
1552                       gct1, gct2, gct3);
1553     __ addi(to, to, UseCompressedOops ? 4 : 8);
1554     __ subi(count, count, 1);
1555     __ beqz(count, L_do_card_marks);
1556 
1557     // ======== loop entry is here ========
1558     __ BIND(L_load_element);
1559     bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
1560                      copied_oop, Address(from, 0),
1561                      gct1);
1562     __ addi(from, from, UseCompressedOops ? 4 : 8);
1563     __ beqz(copied_oop, L_store_element);
1564 
1565     __ load_klass(r9_klass, copied_oop);// query the object klass
1566 
1567     BLOCK_COMMENT("type_check:");
1568     generate_type_check(r9_klass, /*sub_klass*/
1569                         ckoff,    /*super_check_offset*/
1570                         ckval,    /*super_klass*/
1571                         x10,      /*result*/
1572                         gct1,     /*tmp1*/
1573                         gct2,     /*tmp2*/
1574                         L_store_element);
1575 
1576     // Fall through on failure!
1577 
1578     // ======== end loop ========
1579 
1580     // It was a real error; we must depend on the caller to finish the job.
1581     // Register count = remaining oops, count_orig = total oops.
1582     // Emit GC store barriers for the oops we have copied and report
1583     // their number to the caller.
1584 
1585     __ sub(count, count_save, count);     // K = partially copied oop count
1586     __ xori(count, count, -1);            // report (-1^K) to caller
1587     __ beqz(count, L_done_pop);
1588 
1589     __ BIND(L_do_card_marks);
1590     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, t0);
1591 
1592     __ bind(L_done_pop);
1593     __ pop_reg(RegSet::of(x7, x9, x18, x19), sp);
1594     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1595 
1596     __ bind(L_done);
1597     __ mv(x10, count);
1598     __ leave();
1599     __ ret();
1600 
1601     return start;
1602   }
1603 
1604   // Perform range checks on the proposed arraycopy.
1605   // Kills temp, but nothing else.
1606   // Also, clean the sign bits of src_pos and dst_pos.
1607   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1608                               Register src_pos, // source position (c_rarg1)
1609                               Register dst,     // destination array oo (c_rarg2)
1610                               Register dst_pos, // destination position (c_rarg3)
1611                               Register length,
1612                               Register temp,
1613                               Label& L_failed) {
1614     BLOCK_COMMENT("arraycopy_range_checks:");
1615 
1616     assert_different_registers(t0, temp);
1617 
1618     // if [src_pos + length > arrayOop(src)->length()] then FAIL
1619     __ lwu(t0, Address(src, arrayOopDesc::length_offset_in_bytes()));
1620     __ addw(temp, length, src_pos);
1621     __ bgtu(temp, t0, L_failed);
1622 
1623     // if [dst_pos + length > arrayOop(dst)->length()] then FAIL
1624     __ lwu(t0, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1625     __ addw(temp, length, dst_pos);
1626     __ bgtu(temp, t0, L_failed);
1627 
1628     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1629     __ zext(src_pos, src_pos, 32);
1630     __ zext(dst_pos, dst_pos, 32);
1631 
1632     BLOCK_COMMENT("arraycopy_range_checks done");
1633   }
1634 
1635   address generate_unsafecopy_common_error_exit() {
1636     address start = __ pc();
1637     __ mv(x10, 0);
1638     __ leave();
1639     __ ret();
1640     return start;
1641   }
1642 
  //
  //  Generate 'unsafe' set memory stub
  //  Though just as safe as the other stubs, it takes an unscaled
  //  size_t (# bytes) argument instead of an element count.
  //
  //  Input:
  //    c_rarg0   - destination array address
  //    c_rarg1   - byte count (size_t)
  //    c_rarg2   - byte value
  //
  //  The emitted code modifies all three argument registers and uses x28 and
  //  x29 as temporaries.
  //
  address generate_unsafe_setmemory() {
    __ align(CodeEntryAlignment);
    StubId stub_id = StubId::stubgen_unsafe_setmemory_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    // bump this on entry, not on exit:
    // inc_counter_np(SharedRuntime::_unsafe_set_memory_ctr);

    Label L_fill_elements;

    const Register dest = c_rarg0;
    const Register count = c_rarg1;
    const Register value = c_rarg2;
    const Register cnt_words = x28; // temp register
    const Register tmp_reg   = x29; // temp register

    // Mark remaining code as such which performs Unsafe accesses.
    UnsafeMemoryAccessMark umam(this, true, false);

    __ enter(); // required for proper stackwalking of RuntimeStub frame

    // if count < 8, jump to L_fill_elements
    // (that path only issues byte stores, so value needs no widening there)
    __ mv(tmp_reg, 8); // 8 bytes fill by element
    __ bltu(count, tmp_reg, L_fill_elements);

    // Propagate byte to 64-bit width
    // 8 bit -> 16 bit
    __ zext(value, value, 8);
    __ slli(tmp_reg, value, 8);
    __ orr(value, value, tmp_reg);
    // 16 bit -> 32 bit
    __ slli(tmp_reg, value, 16);
    __ orr(value, value, tmp_reg);
    // 32 bit -> 64 bit
    __ slli(tmp_reg, value, 32);
    __ orr(value, value, tmp_reg);

    // Align destination address at 8 bytes address boundary.
    // At most 1 + 2 + 4 = 7 bytes are stored here, and count >= 8 at this
    // point, so count cannot go negative.
    Label L_skip_align1, L_skip_align2, L_skip_align4;
    // One byte misalignment happens.
    __ test_bit(tmp_reg, dest, 0);
    __ beqz(tmp_reg, L_skip_align1);
    __ sb(value, Address(dest, 0));
    __ addi(dest, dest, 1);
    __ subi(count, count, 1);

    __ bind(L_skip_align1);
    // Two bytes misalignment happens.
    __ test_bit(tmp_reg, dest, 1);
    __ beqz(tmp_reg, L_skip_align2);
    __ sh(value, Address(dest, 0));
    __ addi(dest, dest, 2);
    __ subi(count, count, 2);

    __ bind(L_skip_align2);
    // Four bytes misalignment happens.
    __ test_bit(tmp_reg, dest, 2);
    __ beqz(tmp_reg, L_skip_align4);
    __ sw(value, Address(dest, 0));
    __ addi(dest, dest, 4);
    __ subi(count, count, 4);
    __ bind(L_skip_align4);

    //  Fill large chunks
    __ srli(cnt_words, count, 3); // number of words
    __ slli(tmp_reg, cnt_words, 3);
    __ sub(count, count, tmp_reg); // count = bytes left over after the word fill (0..7)
    {
      // NOTE(review): the tail code below stores relative to dest, so this
      // presumably relies on fill_words leaving dest just past the filled
      // words - confirm against MacroAssembler::fill_words.
      __ fill_words(dest, cnt_words, value);
    }

    // Handle fills of less than 8 bytes
    // Bits 2, 1 and 0 of count select a 4-, 2- and 1-byte tail; only byte
    // stores are used here.
    __ bind(L_fill_elements);
    Label L_fill_2, L_fill_1, L_exit;
    __ test_bit(tmp_reg, count, 2);
    __ beqz(tmp_reg, L_fill_2);
    __ sb(value, Address(dest, 0));
    __ sb(value, Address(dest, 1));
    __ sb(value, Address(dest, 2));
    __ sb(value, Address(dest, 3));
    __ addi(dest, dest, 4);

    __ bind(L_fill_2);
    __ test_bit(tmp_reg, count, 1);
    __ beqz(tmp_reg, L_fill_1);
    __ sb(value, Address(dest, 0));
    __ sb(value, Address(dest, 1));
    __ addi(dest, dest, 2);

    __ bind(L_fill_1);
    __ test_bit(tmp_reg, count, 0);
    __ beqz(tmp_reg, L_exit);
    __ sb(value, Address(dest, 0));

    __ bind(L_exit);
    __ leave();
    __ ret();

    return start;
  }
1754 
1755   //
1756   //  Generate 'unsafe' array copy stub
1757   //  Though just as safe as the other stubs, it takes an unscaled
1758   //  size_t argument instead of an element count.
1759   //
1760   //  Input:
1761   //    c_rarg0   - source array address
1762   //    c_rarg1   - destination array address
1763   //    c_rarg2   - byte count, treated as ssize_t, can be zero
1764   //
1765   // Examines the alignment of the operands and dispatches
1766   // to a long, int, short, or byte copy loop.
1767   //
1768   address generate_unsafe_copy(address byte_copy_entry,
1769                                address short_copy_entry,
1770                                address int_copy_entry,
1771                                address long_copy_entry) {
1772     assert_cond(byte_copy_entry != nullptr && short_copy_entry != nullptr &&
1773                 int_copy_entry != nullptr && long_copy_entry != nullptr);
1774     Label L_long_aligned, L_int_aligned, L_short_aligned;
1775     const Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1776 
1777     __ align(CodeEntryAlignment);
1778     StubId stub_id = StubId::stubgen_unsafe_arraycopy_id;
1779     StubCodeMark mark(this, stub_id);
1780     address start = __ pc();
1781     __ enter(); // required for proper stackwalking of RuntimeStub frame
1782 
1783     // bump this on entry, not on exit:
1784     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1785 
1786     __ orr(t0, s, d);
1787     __ orr(t0, t0, count);
1788 
1789     __ andi(t0, t0, BytesPerLong - 1);
1790     __ beqz(t0, L_long_aligned);
1791     __ andi(t0, t0, BytesPerInt - 1);
1792     __ beqz(t0, L_int_aligned);
1793     __ test_bit(t0, t0, 0);
1794     __ beqz(t0, L_short_aligned);
1795     __ j(RuntimeAddress(byte_copy_entry));
1796 
1797     __ BIND(L_short_aligned);
1798     __ srli(count, count, LogBytesPerShort);  // size => short_count
1799     __ j(RuntimeAddress(short_copy_entry));
1800     __ BIND(L_int_aligned);
1801     __ srli(count, count, LogBytesPerInt);    // size => int_count
1802     __ j(RuntimeAddress(int_copy_entry));
1803     __ BIND(L_long_aligned);
1804     __ srli(count, count, LogBytesPerLong);   // size => long_count
1805     __ j(RuntimeAddress(long_copy_entry));
1806 
1807     return start;
1808   }
1809 
1810   //
1811   //  Generate generic array copy stubs
1812   //
1813   //  Input:
1814   //    c_rarg0    -  src oop
1815   //    c_rarg1    -  src_pos (32-bits)
1816   //    c_rarg2    -  dst oop
1817   //    c_rarg3    -  dst_pos (32-bits)
1818   //    c_rarg4    -  element count (32-bits)
1819   //
1820   //  Output:
1821   //    x10 ==  0  -  success
1822   //    x10 == -1^K - failure, where K is partial transfer count
1823   //
1824   address generate_generic_copy(address byte_copy_entry, address short_copy_entry,
1825                                 address int_copy_entry, address oop_copy_entry,
1826                                 address long_copy_entry, address checkcast_copy_entry) {
1827     assert_cond(byte_copy_entry != nullptr && short_copy_entry != nullptr &&
1828                 int_copy_entry != nullptr && oop_copy_entry != nullptr &&
1829                 long_copy_entry != nullptr && checkcast_copy_entry != nullptr);
1830     Label L_failed, L_failed_0, L_objArray;
1831     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
1832 
1833     // Input registers
1834     const Register src        = c_rarg0;  // source array oop
1835     const Register src_pos    = c_rarg1;  // source position
1836     const Register dst        = c_rarg2;  // destination array oop
1837     const Register dst_pos    = c_rarg3;  // destination position
1838     const Register length     = c_rarg4;
1839 
1840     // Registers used as temps
1841     const Register dst_klass = c_rarg5;
1842 
1843     __ align(CodeEntryAlignment);
1844 
1845     StubId stub_id = StubId::stubgen_generic_arraycopy_id;
1846     StubCodeMark mark(this, stub_id);
1847 
1848     address start = __ pc();
1849 
1850     __ enter(); // required for proper stackwalking of RuntimeStub frame
1851 
1852     // bump this on entry, not on exit:
1853     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
1854 
1855     //-----------------------------------------------------------------------
1856     // Assembler stub will be used for this call to arraycopy
1857     // if the following conditions are met:
1858     //
1859     // (1) src and dst must not be null.
1860     // (2) src_pos must not be negative.
1861     // (3) dst_pos must not be negative.
1862     // (4) length  must not be negative.
1863     // (5) src klass and dst klass should be the same and not null.
1864     // (6) src and dst should be arrays.
1865     // (7) src_pos + length must not exceed length of src.
1866     // (8) dst_pos + length must not exceed length of dst.
1867     //
1868 
1869     // if src is null then return -1
1870     __ beqz(src, L_failed);
1871 
1872     // if [src_pos < 0] then return -1
1873     __ sext(t0, src_pos, 32);
1874     __ bltz(t0, L_failed);
1875 
1876     // if dst is null then return -1
1877     __ beqz(dst, L_failed);
1878 
1879     // if [dst_pos < 0] then return -1
1880     __ sext(t0, dst_pos, 32);
1881     __ bltz(t0, L_failed);
1882 
1883     // registers used as temp
1884     const Register scratch_length    = x28; // elements count to copy
1885     const Register scratch_src_klass = x29; // array klass
1886     const Register lh                = x30; // layout helper
1887 
1888     // if [length < 0] then return -1
1889     __ sext(scratch_length, length, 32); // length (elements count, 32-bits value)
1890     __ bltz(scratch_length, L_failed);
1891 
1892     __ load_klass(scratch_src_klass, src);
1893 #ifdef ASSERT
1894     {
1895       BLOCK_COMMENT("assert klasses not null {");
1896       Label L1, L2;
1897       __ bnez(scratch_src_klass, L2);   // it is broken if klass is null
1898       __ bind(L1);
1899       __ stop("broken null klass");
1900       __ bind(L2);
1901       __ load_klass(t0, dst, t1);
1902       __ beqz(t0, L1);     // this would be broken also
1903       BLOCK_COMMENT("} assert klasses not null done");
1904     }
1905 #endif
1906 
1907     // Load layout helper (32-bits)
1908     //
1909     //  |array_tag|     | header_size | element_type |     |log2_element_size|
1910     // 32        30    24            16              8     2                 0
1911     //
1912     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
1913     //
1914 
1915     const int lh_offset = in_bytes(Klass::layout_helper_offset());
1916 
1917     // Handle objArrays completely differently...
1918     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
1919     __ lw(lh, Address(scratch_src_klass, lh_offset));
1920     __ mv(t0, objArray_lh);
1921     __ beq(lh, t0, L_objArray);
1922 
1923     // if [src->klass() != dst->klass()] then return -1
1924     __ load_klass(t1, dst);
1925     __ bne(t1, scratch_src_klass, L_failed);
1926 
1927     // Check for flat inline type array -> return -1
1928     __ test_flat_array_oop(src, t1, L_failed);
1929 
1930     // Check for null-free (non-flat) inline type array -> handle as object array
1931     __ test_null_free_array_oop(src, t1, L_objArray);
1932 
1933     // if src->is_Array() isn't null then return -1
1934     // i.e. (lh >= 0)
1935     __ bgez(lh, L_failed);
1936 
1937     // At this point, it is known to be a typeArray (array_tag 0x3).
1938 #ifdef ASSERT
1939     {
1940       BLOCK_COMMENT("assert primitive array {");
1941       Label L;
1942       __ mv(t1, (int32_t)(Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift));
1943       __ bge(lh, t1, L);
1944       __ stop("must be a primitive array");
1945       __ bind(L);
1946       BLOCK_COMMENT("} assert primitive array done");
1947     }
1948 #endif
1949 
1950     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
1951                            t1, L_failed);
1952 
1953     // TypeArrayKlass
1954     //
1955     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize)
1956     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize)
1957     //
1958 
1959     const Register t0_offset = t0;    // array offset
1960     const Register x30_elsize = lh;   // element size
1961 
1962     // Get array_header_in_bytes()
1963     int lh_header_size_width = exact_log2(Klass::_lh_header_size_mask + 1);
1964     int lh_header_size_msb = Klass::_lh_header_size_shift + lh_header_size_width;
1965     __ slli(t0_offset, lh, XLEN - lh_header_size_msb);          // left shift to remove 24 ~ 32;
1966     __ srli(t0_offset, t0_offset, XLEN - lh_header_size_width); // array_offset
1967 
1968     __ add(src, src, t0_offset);           // src array offset
1969     __ add(dst, dst, t0_offset);           // dst array offset
1970     BLOCK_COMMENT("choose copy loop based on element size");
1971 
1972     // next registers should be set before the jump to corresponding stub
1973     const Register from     = c_rarg0;  // source array address
1974     const Register to       = c_rarg1;  // destination array address
1975     const Register count    = c_rarg2;  // elements count
1976 
1977     // 'from', 'to', 'count' registers should be set in such order
1978     // since they are the same as 'src', 'src_pos', 'dst'.
1979 
1980     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
1981 
1982     // The possible values of elsize are 0-3, i.e. exact_log2(element
1983     // size in bytes).  We do a simple bitwise binary search.
1984   __ BIND(L_copy_bytes);
1985     __ test_bit(t0, x30_elsize, 1);
1986     __ bnez(t0, L_copy_ints);
1987     __ test_bit(t0, x30_elsize, 0);
1988     __ bnez(t0, L_copy_shorts);
1989     __ add(from, src, src_pos); // src_addr
1990     __ add(to, dst, dst_pos); // dst_addr
1991     __ sext(count, scratch_length, 32); // length
1992     __ j(RuntimeAddress(byte_copy_entry));
1993 
1994   __ BIND(L_copy_shorts);
1995     __ shadd(from, src_pos, src, t0, 1); // src_addr
1996     __ shadd(to, dst_pos, dst, t0, 1); // dst_addr
1997     __ sext(count, scratch_length, 32); // length
1998     __ j(RuntimeAddress(short_copy_entry));
1999 
2000   __ BIND(L_copy_ints);
2001     __ test_bit(t0, x30_elsize, 0);
2002     __ bnez(t0, L_copy_longs);
2003     __ shadd(from, src_pos, src, t0, 2); // src_addr
2004     __ shadd(to, dst_pos, dst, t0, 2); // dst_addr
2005     __ sext(count, scratch_length, 32); // length
2006     __ j(RuntimeAddress(int_copy_entry));
2007 
2008   __ BIND(L_copy_longs);
2009 #ifdef ASSERT
2010     {
2011       BLOCK_COMMENT("assert long copy {");
2012       Label L;
2013       __ andi(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> x30_elsize
2014       __ sext(lh, lh, 32);
2015       __ mv(t0, LogBytesPerLong);
2016       __ beq(x30_elsize, t0, L);
2017       __ stop("must be long copy, but elsize is wrong");
2018       __ bind(L);
2019       BLOCK_COMMENT("} assert long copy done");
2020     }
2021 #endif
2022     __ shadd(from, src_pos, src, t0, 3); // src_addr
2023     __ shadd(to, dst_pos, dst, t0, 3); // dst_addr
2024     __ sext(count, scratch_length, 32); // length
2025     __ j(RuntimeAddress(long_copy_entry));
2026 
2027     // ObjArrayKlass
2028   __ BIND(L_objArray);
2029     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2030 
2031     Label L_plain_copy, L_checkcast_copy;
2032     // test array classes for subtyping
2033     __ load_klass(t2, dst);
2034     __ bne(scratch_src_klass, t2, L_checkcast_copy); // usual case is exact equality
2035 
2036     // Identically typed arrays can be copied without element-wise checks.
2037     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2038                            t1, L_failed);
2039 
2040     __ shadd(from, src_pos, src, t0, LogBytesPerHeapOop);
2041     __ addi(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2042     __ shadd(to, dst_pos, dst, t0, LogBytesPerHeapOop);
2043     __ addi(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2044     __ sext(count, scratch_length, 32); // length
2045   __ BIND(L_plain_copy);
2046     __ j(RuntimeAddress(oop_copy_entry));
2047 
2048   __ BIND(L_checkcast_copy);
2049     // live at this point:  scratch_src_klass, scratch_length, t2 (dst_klass)
2050     {
2051       // Before looking at dst.length, make sure dst is also an objArray.
2052       __ lwu(t0, Address(t2, lh_offset));
2053       __ mv(t1, objArray_lh);
2054       __ bne(t0, t1, L_failed);
2055 
2056       // It is safe to examine both src.length and dst.length.
2057       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2058                              t2, L_failed);
2059 
2060       __ load_klass(dst_klass, dst); // reload
2061 
2062       // Marshal the base address arguments now, freeing registers.
2063       __ shadd(from, src_pos, src, t0, LogBytesPerHeapOop);
2064       __ addi(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2065       __ shadd(to, dst_pos, dst, t0, LogBytesPerHeapOop);
2066       __ addi(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2067       __ sext(count, length, 32); // length (reloaded)
2068       const Register sco_temp = c_rarg3; // this register is free now
2069       assert_different_registers(from, to, count, sco_temp,
2070                                  dst_klass, scratch_src_klass);
2071 
2072       // Generate the type check.
2073       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2074       __ lwu(sco_temp, Address(dst_klass, sco_offset));
2075 
2076       // Smashes t0, t1
2077       generate_type_check(scratch_src_klass, sco_temp, dst_klass, noreg, noreg, noreg, L_plain_copy);
2078 
2079       // Fetch destination element klass from the ObjArrayKlass header.
2080       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2081       __ ld(dst_klass, Address(dst_klass, ek_offset));
2082       __ lwu(sco_temp, Address(dst_klass, sco_offset));
2083 
2084       // the checkcast_copy loop needs two extra arguments:
2085       assert(c_rarg3 == sco_temp, "#3 already in place");
2086       // Set up arguments for checkcast_copy_entry.
2087       __ mv(c_rarg4, dst_klass);  // dst.klass.element_klass
2088       __ j(RuntimeAddress(checkcast_copy_entry));
2089     }
2090 
2091   __ BIND(L_failed);
2092     __ mv(x10, -1);
2093     __ leave();   // required for proper stackwalking of RuntimeStub frame
2094     __ ret();
2095 
2096     return start;
2097   }
2098 
2099   //
2100   // Generate stub for array fill. If "aligned" is true, the
2101   // "to" address is assumed to be heapword aligned.
2102   //
2103   // Arguments for generated stub:
2104   //   to:    c_rarg0
2105   //   value: c_rarg1
2106   //   count: c_rarg2 treated as signed
2107   //
2108   address generate_fill(StubId stub_id) {
2109     BasicType t;
2110     bool aligned;
2111 
2112     switch (stub_id) {
2113     case StubId::stubgen_jbyte_fill_id:
2114       t = T_BYTE;
2115       aligned = false;
2116       break;
2117     case StubId::stubgen_jshort_fill_id:
2118       t = T_SHORT;
2119       aligned = false;
2120       break;
2121     case StubId::stubgen_jint_fill_id:
2122       t = T_INT;
2123       aligned = false;
2124       break;
2125     case StubId::stubgen_arrayof_jbyte_fill_id:
2126       t = T_BYTE;
2127       aligned = true;
2128       break;
2129     case StubId::stubgen_arrayof_jshort_fill_id:
2130       t = T_SHORT;
2131       aligned = true;
2132       break;
2133     case StubId::stubgen_arrayof_jint_fill_id:
2134       t = T_INT;
2135       aligned = true;
2136       break;
2137     default:
2138       ShouldNotReachHere();
2139     };
2140 
2141     __ align(CodeEntryAlignment);
2142     StubCodeMark mark(this, stub_id);
2143     address start = __ pc();
2144 
2145     BLOCK_COMMENT("Entry:");
2146 
2147     const Register to        = c_rarg0;  // source array address
2148     const Register value     = c_rarg1;  // value
2149     const Register count     = c_rarg2;  // elements count
2150 
2151     const Register bz_base   = x28;      // base for block_zero routine
2152     const Register cnt_words = x29;      // temp register
2153     const Register tmp_reg   = t1;
2154 
2155     __ enter();
2156 
2157     Label L_fill_elements;
2158 
2159     int shift = -1;
2160     switch (t) {
2161       case T_BYTE:
2162         shift = 0;
2163         // Short arrays (< 8 bytes) fill by element
2164         __ mv(tmp_reg, 8 >> shift);
2165         __ bltu(count, tmp_reg, L_fill_elements);
2166 
2167         // Zero extend value
2168         // 8 bit -> 16 bit
2169         __ zext(value, value, 8);
2170         __ slli(tmp_reg, value, 8);
2171         __ orr(value, value, tmp_reg);
2172 
2173         // 16 bit -> 32 bit
2174         __ slli(tmp_reg, value, 16);
2175         __ orr(value, value, tmp_reg);
2176         break;
2177       case T_SHORT:
2178         shift = 1;
2179         // Short arrays (< 8 bytes) fill by element
2180         __ mv(tmp_reg, 8 >> shift);
2181         __ bltu(count, tmp_reg, L_fill_elements);
2182 
2183         // Zero extend value
2184         // 16 bit -> 32 bit
2185         __ zext(value, value, 16);
2186         __ slli(tmp_reg, value, 16);
2187         __ orr(value, value, tmp_reg);
2188         break;
2189       case T_INT:
2190         shift = 2;
2191         // Short arrays (< 8 bytes) fill by element
2192         __ mv(tmp_reg, 8 >> shift);
2193         __ bltu(count, tmp_reg, L_fill_elements);
2194         break;
2195       default: ShouldNotReachHere();
2196     }
2197 
2198     // Align source address at 8 bytes address boundary.
2199     Label L_skip_align1, L_skip_align2, L_skip_align4;
2200     if (!aligned) {
2201       switch (t) {
2202         case T_BYTE:
2203           // One byte misalignment happens only for byte arrays.
2204           __ test_bit(tmp_reg, to, 0);
2205           __ beqz(tmp_reg, L_skip_align1);
2206           __ sb(value, Address(to, 0));
2207           __ addi(to, to, 1);
2208           __ subiw(count, count, 1);
2209           __ bind(L_skip_align1);
2210           // Fallthrough
2211         case T_SHORT:
2212           // Two bytes misalignment happens only for byte and short (char) arrays.
2213           __ test_bit(tmp_reg, to, 1);
2214           __ beqz(tmp_reg, L_skip_align2);
2215           __ sh(value, Address(to, 0));
2216           __ addi(to, to, 2);
2217           __ subiw(count, count, 2 >> shift);
2218           __ bind(L_skip_align2);
2219           // Fallthrough
2220         case T_INT:
2221           // Align to 8 bytes, we know we are 4 byte aligned to start.
2222           __ test_bit(tmp_reg, to, 2);
2223           __ beqz(tmp_reg, L_skip_align4);
2224           __ sw(value, Address(to, 0));
2225           __ addi(to, to, 4);
2226           __ subiw(count, count, 4 >> shift);
2227           __ bind(L_skip_align4);
2228           break;
2229         default: ShouldNotReachHere();
2230       }
2231     }
2232 
2233     //
2234     //  Fill large chunks
2235     //
2236     __ srliw(cnt_words, count, 3 - shift); // number of words
2237 
2238     // 32 bit -> 64 bit
2239     __ zext(value, value, 32);
2240     __ slli(tmp_reg, value, 32);
2241     __ orr(value, value, tmp_reg);
2242 
2243     __ slli(tmp_reg, cnt_words, 3 - shift);
2244     __ subw(count, count, tmp_reg);
2245     {
2246       __ fill_words(to, cnt_words, value);
2247     }
2248 
2249     // Handle copies less than 8 bytes.
2250     // Address may not be heapword aligned.
2251     Label L_fill_1, L_fill_2, L_exit;
2252     __ bind(L_fill_elements);
2253     switch (t) {
2254       case T_BYTE:
2255         __ test_bit(tmp_reg, count, 2);
2256         __ beqz(tmp_reg, L_fill_2);
2257         __ sb(value, Address(to, 0));
2258         __ sb(value, Address(to, 1));
2259         __ sb(value, Address(to, 2));
2260         __ sb(value, Address(to, 3));
2261         __ addi(to, to, 4);
2262 
2263         __ bind(L_fill_2);
2264         __ test_bit(tmp_reg, count, 1);
2265         __ beqz(tmp_reg, L_fill_1);
2266         __ sb(value, Address(to, 0));
2267         __ sb(value, Address(to, 1));
2268         __ addi(to, to, 2);
2269 
2270         __ bind(L_fill_1);
2271         __ test_bit(tmp_reg, count, 0);
2272         __ beqz(tmp_reg, L_exit);
2273         __ sb(value, Address(to, 0));
2274         break;
2275       case T_SHORT:
2276         __ test_bit(tmp_reg, count, 1);
2277         __ beqz(tmp_reg, L_fill_2);
2278         __ sh(value, Address(to, 0));
2279         __ sh(value, Address(to, 2));
2280         __ addi(to, to, 4);
2281 
2282         __ bind(L_fill_2);
2283         __ test_bit(tmp_reg, count, 0);
2284         __ beqz(tmp_reg, L_exit);
2285         __ sh(value, Address(to, 0));
2286         break;
2287       case T_INT:
2288         __ beqz(count, L_exit);
2289         __ sw(value, Address(to, 0));
2290         break;
2291       default: ShouldNotReachHere();
2292     }
2293     __ bind(L_exit);
2294     __ leave();
2295     __ ret();
2296 
2297     return start;
2298   }
2299 
2300   void generate_arraycopy_stubs() {
2301     // Some copy stubs publish a normal entry and then a 2nd 'fallback'
2302     // entry immediately following their stack push. This can be used
2303     // as a post-push branch target for compatible stubs when they
2304     // identify a special case that can be handled by the fallback
2305     // stub e.g a disjoint copy stub may be use as a special case
2306     // fallback for its compatible conjoint copy stub.
2307     //
2308     // A no push entry is always returned in the following local and
2309     // then published by assigning to the appropriate entry field in
2310     // class StubRoutines. The entry value is then passed to the
2311     // generator for the compatible stub. That means the entry must be
2312     // listed when saving to/restoring from the AOT cache, ensuring
2313     // that the inter-stub jumps are noted at AOT-cache save and
2314     // relocated at AOT cache load.
2315     address nopush_entry = nullptr;
2316 
2317     // generate the common exit first so later stubs can rely on it if
2318     // they want an UnsafeMemoryAccess exit non-local to the stub
2319     StubRoutines::_unsafecopy_common_exit = generate_unsafecopy_common_error_exit();
2320     // register the stub as the default exit with class UnsafeMemoryAccess
2321     UnsafeMemoryAccess::set_common_exit_stub_pc(StubRoutines::_unsafecopy_common_exit);
2322 
2323     // generate and publish riscv-specific bulk copy routines first
2324     // so we can call them from other copy stubs
2325     StubRoutines::riscv::_copy_byte_f = generate_copy_longs(StubId::stubgen_copy_byte_f_id, c_rarg0, c_rarg1, t1);
2326     StubRoutines::riscv::_copy_byte_b = generate_copy_longs(StubId::stubgen_copy_byte_b_id, c_rarg0, c_rarg1, t1);
2327 
2328     StubRoutines::riscv::_zero_blocks = generate_zero_blocks();
2329 
2330     //*** jbyte
2331     // Always need aligned and unaligned versions
2332     StubRoutines::_jbyte_disjoint_arraycopy          = generate_disjoint_copy(StubId::stubgen_jbyte_disjoint_arraycopy_id, &nopush_entry);
2333     // disjoint nopush entry is needed by conjoint copy
2334     StubRoutines::_jbyte_disjoint_arraycopy_nopush  = nopush_entry;
2335     StubRoutines::_jbyte_arraycopy                   = generate_conjoint_copy(StubId::stubgen_jbyte_arraycopy_id, StubRoutines::_jbyte_disjoint_arraycopy_nopush, &nopush_entry);
2336     // conjoint nopush entry is needed by generic/unsafe copy
2337     StubRoutines::_jbyte_arraycopy_nopush = nopush_entry;
2338     StubRoutines::_arrayof_jbyte_disjoint_arraycopy  = generate_disjoint_copy(StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id, &nopush_entry);
2339     // disjoint arrayof nopush entry is needed by conjoint copy
2340     StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush  = nopush_entry;
2341     StubRoutines::_arrayof_jbyte_arraycopy           = generate_conjoint_copy(StubId::stubgen_arrayof_jbyte_arraycopy_id, StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush, nullptr);
2342 
2343     //*** jshort
2344     // Always need aligned and unaligned versions
2345     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_copy(StubId::stubgen_jshort_disjoint_arraycopy_id, &nopush_entry);
2346     // disjoint nopush entry is needed by conjoint copy
2347     StubRoutines::_jshort_disjoint_arraycopy_nopush  = nopush_entry;
2348     StubRoutines::_jshort_arraycopy                  = generate_conjoint_copy(StubId::stubgen_jshort_arraycopy_id, StubRoutines::_jshort_disjoint_arraycopy_nopush, &nopush_entry);
2349     // conjoint nopush entry is used by generic/unsafe copy
2350     StubRoutines::_jshort_arraycopy_nopush = nopush_entry;
2351     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id, &nopush_entry);
2352     // disjoint arrayof nopush entry is needed by conjoint copy
2353     StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush = nopush_entry;
2354     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jshort_arraycopy_id, StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush, nullptr);
2355 
2356     //*** jint
2357     // Aligned versions
2358     StubRoutines::_arrayof_jint_disjoint_arraycopy   = generate_disjoint_copy(StubId::stubgen_arrayof_jint_disjoint_arraycopy_id, &nopush_entry);
2359     // disjoint arrayof nopush entry is needed by conjoint copy
2360     StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush = nopush_entry;
2361     StubRoutines::_arrayof_jint_arraycopy            = generate_conjoint_copy(StubId::stubgen_arrayof_jint_arraycopy_id, StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush, nullptr);
2362     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2363     // entry_jint_arraycopy always points to the unaligned version
2364     StubRoutines::_jint_disjoint_arraycopy           = generate_disjoint_copy(StubId::stubgen_jint_disjoint_arraycopy_id, &nopush_entry);
2365     // disjoint nopush entry is needed by conjoint copy
2366     StubRoutines::_jint_disjoint_arraycopy_nopush  = nopush_entry;
2367     StubRoutines::_jint_arraycopy                  = generate_conjoint_copy(StubId::stubgen_jint_arraycopy_id, StubRoutines::_jint_disjoint_arraycopy_nopush, &nopush_entry);
2368     // conjoint nopush entry is needed by generic/unsafe copy
2369     StubRoutines::_jint_arraycopy_nopush = nopush_entry;
2370 
2371     //*** jlong
2372     // It is always aligned
2373     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id, &nopush_entry);
2374     // disjoint arrayof nopush entry is needed by conjoint copy
2375     StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush = nopush_entry;
2376     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jlong_arraycopy_id, StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush, &nopush_entry);
2377     // conjoint nopush entry is needed by generic/unsafe copy
2378     StubRoutines::_jlong_arraycopy_nopush = nopush_entry;
2379     // disjoint normal/nopush and conjoint normal entries are not
2380     // generated since the arrayof versions are the same
2381     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2382     StubRoutines::_jlong_disjoint_arraycopy_nopush = StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush;
2383     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2384 
2385     //*** oops
2386     StubRoutines::_arrayof_oop_disjoint_arraycopy
2387       = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_id, &nopush_entry);
2388       // disjoint arrayof nopush entry is needed by conjoint copy
2389     StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush = nopush_entry;
2390     StubRoutines::_arrayof_oop_arraycopy
2391       = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush, &nopush_entry);
2392     // conjoint arrayof nopush entry is needed by generic/unsafe copy
2393     StubRoutines::_oop_arraycopy_nopush = nopush_entry;
2394     // Aligned versions without pre-barriers
2395     StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2396       = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id, &nopush_entry);
2397     // disjoint arrayof+uninit nopush entry is needed by conjoint copy
2398     StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush = nopush_entry;
2399 
2400     // note that we don't need a returned nopush entry because the
2401     // generic/unsafe copy does not cater for uninit arrays.
2402     StubRoutines::_arrayof_oop_arraycopy_uninit
2403       = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_uninit_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush, nullptr);
2404 
2405     // for oop copies reuse arrayof entries for non-arrayof cases
2406     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2407     StubRoutines::_oop_disjoint_arraycopy_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush;
2408     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2409     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2410     StubRoutines::_oop_disjoint_arraycopy_uninit_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush;
2411     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2412 
2413     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_id, &nopush_entry);
2414     // checkcast nopush entry is needed by generic copy
2415     StubRoutines::_checkcast_arraycopy_nopush = nopush_entry;
2416     // note that we don't need a returned nopush entry because the
2417     // generic copy does not cater for uninit arrays.
2418     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_uninit_id, nullptr);
2419 
2420 
2421     // unsafe arraycopy may fallback on conjoint stubs
2422     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy(StubRoutines::_jbyte_arraycopy_nopush,
2423                                                               StubRoutines::_jshort_arraycopy_nopush,
2424                                                               StubRoutines::_jint_arraycopy_nopush,
2425                                                               StubRoutines::_jlong_arraycopy_nopush);
2426 
2427     // generic arraycopy may fallback on conjoint stubs
2428     StubRoutines::_generic_arraycopy   = generate_generic_copy(StubRoutines::_jbyte_arraycopy_nopush,
2429                                                                StubRoutines::_jshort_arraycopy_nopush,
2430                                                                StubRoutines::_jint_arraycopy_nopush,
2431                                                                StubRoutines::_oop_arraycopy_nopush,
2432                                                                StubRoutines::_jlong_arraycopy_nopush,
2433                                                                StubRoutines::_checkcast_arraycopy_nopush);
2434 
2435     StubRoutines::_jbyte_fill = generate_fill(StubId::stubgen_jbyte_fill_id);
2436     StubRoutines::_jshort_fill = generate_fill(StubId::stubgen_jshort_fill_id);
2437     StubRoutines::_jint_fill = generate_fill(StubId::stubgen_jint_fill_id);
2438     StubRoutines::_arrayof_jbyte_fill = generate_fill(StubId::stubgen_arrayof_jbyte_fill_id);
2439     StubRoutines::_arrayof_jshort_fill = generate_fill(StubId::stubgen_arrayof_jshort_fill_id);
2440     StubRoutines::_arrayof_jint_fill = generate_fill(StubId::stubgen_arrayof_jint_fill_id);
2441 
2442     StubRoutines::_unsafe_setmemory    = generate_unsafe_setmemory();
2443   }
2444 
2445   void aes_load_keys(const Register &key, VectorRegister *working_vregs, int rounds) {
2446     const int step = 16;
2447     for (int i = 0; i < rounds; i++) {
2448       __ vle32_v(working_vregs[i], key);
2449       // The keys are stored in little-endian array, while we need
2450       // to operate in big-endian.
2451       // So performing an endian-swap here with vrev8.v instruction
2452       __ vrev8_v(working_vregs[i], working_vregs[i]);
2453       __ addi(key, key, step);
2454     }
2455   }
2456 
2457   void aes_encrypt(const VectorRegister &res, VectorRegister *working_vregs, int rounds) {
2458     assert(rounds <= 15, "rounds should be less than or equal to working_vregs size");
2459 
2460     __ vxor_vv(res, res, working_vregs[0]);
2461     for (int i = 1; i < rounds - 1; i++) {
2462       __ vaesem_vv(res, working_vregs[i]);
2463     }
2464     __ vaesef_vv(res, working_vregs[rounds - 1]);
2465   }
2466 
2467   // Arguments:
2468   //
2469   // Inputs:
2470   //   c_rarg0   - source byte array address
2471   //   c_rarg1   - destination byte array address
2472   //   c_rarg2   - sessionKe (key) in little endian int array
2473   //
2474   address generate_aescrypt_encryptBlock() {
2475     assert(UseAESIntrinsics, "need AES instructions (Zvkned extension) support");
2476 
2477     __ align(CodeEntryAlignment);
2478     StubId stub_id = StubId::stubgen_aescrypt_encryptBlock_id;
2479     StubCodeMark mark(this, stub_id);
2480 
2481     Label L_aes128, L_aes192;
2482 
2483     const Register from        = c_rarg0;  // source array address
2484     const Register to          = c_rarg1;  // destination array address
2485     const Register key         = c_rarg2;  // key array address
2486     const Register keylen      = c_rarg3;
2487 
2488     VectorRegister working_vregs[] = {
2489       v4, v5, v6, v7, v8, v9, v10, v11,
2490       v12, v13, v14, v15, v16, v17, v18
2491     };
2492     const VectorRegister res   = v19;
2493 
2494     address start = __ pc();
2495     __ enter();
2496 
2497     __ lwu(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2498 
2499     __ vsetivli(x0, 4, Assembler::e32, Assembler::m1);
2500     __ vle32_v(res, from);
2501 
2502     __ mv(t2, 52); // key length could be only {11, 13, 15} * 4 = {44, 52, 60}
2503     __ bltu(keylen, t2, L_aes128);
2504     __ beq(keylen, t2, L_aes192);
2505     // Else we fallthrough to the biggest case (256-bit key size)
2506 
2507     // Note: the following function performs key += 15*16
2508     aes_load_keys(key, working_vregs, 15);
2509     aes_encrypt(res, working_vregs, 15);
2510     __ vse32_v(res, to);
2511     __ mv(c_rarg0, 0);
2512     __ leave();
2513     __ ret();
2514 
2515   __ bind(L_aes192);
2516     // Note: the following function performs key += 13*16
2517     aes_load_keys(key, working_vregs, 13);
2518     aes_encrypt(res, working_vregs, 13);
2519     __ vse32_v(res, to);
2520     __ mv(c_rarg0, 0);
2521     __ leave();
2522     __ ret();
2523 
2524   __ bind(L_aes128);
2525     // Note: the following function performs key += 11*16
2526     aes_load_keys(key, working_vregs, 11);
2527     aes_encrypt(res, working_vregs, 11);
2528     __ vse32_v(res, to);
2529     __ mv(c_rarg0, 0);
2530     __ leave();
2531     __ ret();
2532 
2533     return start;
2534   }
2535 
2536   void aes_decrypt(const VectorRegister &res, VectorRegister *working_vregs, int rounds) {
2537     assert(rounds <= 15, "rounds should be less than or equal to working_vregs size");
2538 
2539     __ vxor_vv(res, res, working_vregs[rounds - 1]);
2540     for (int i = rounds - 2; i > 0; i--) {
2541       __ vaesdm_vv(res, working_vregs[i]);
2542     }
2543     __ vaesdf_vv(res, working_vregs[0]);
2544   }
2545 
2546   // Arguments:
2547   //
2548   // Inputs:
2549   //   c_rarg0   - source byte array address
2550   //   c_rarg1   - destination byte array address
2551   //   c_rarg2   - sessionKe (key) in little endian int array
2552   //
2553   address generate_aescrypt_decryptBlock() {
2554     assert(UseAESIntrinsics, "need AES instructions (Zvkned extension) support");
2555 
2556     __ align(CodeEntryAlignment);
2557     StubId stub_id = StubId::stubgen_aescrypt_decryptBlock_id;
2558     StubCodeMark mark(this, stub_id);
2559 
2560     Label L_aes128, L_aes192;
2561 
2562     const Register from        = c_rarg0;  // source array address
2563     const Register to          = c_rarg1;  // destination array address
2564     const Register key         = c_rarg2;  // key array address
2565     const Register keylen      = c_rarg3;
2566 
2567     VectorRegister working_vregs[] = {
2568       v4, v5, v6, v7, v8, v9, v10, v11,
2569       v12, v13, v14, v15, v16, v17, v18
2570     };
2571     const VectorRegister res   = v19;
2572 
2573     address start = __ pc();
2574     __ enter(); // required for proper stackwalking of RuntimeStub frame
2575 
2576     __ lwu(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2577 
2578     __ vsetivli(x0, 4, Assembler::e32, Assembler::m1);
2579     __ vle32_v(res, from);
2580 
2581     __ mv(t2, 52); // key length could be only {11, 13, 15} * 4 = {44, 52, 60}
2582     __ bltu(keylen, t2, L_aes128);
2583     __ beq(keylen, t2, L_aes192);
2584     // Else we fallthrough to the biggest case (256-bit key size)
2585 
2586     // Note: the following function performs key += 15*16
2587     aes_load_keys(key, working_vregs, 15);
2588     aes_decrypt(res, working_vregs, 15);
2589     __ vse32_v(res, to);
2590     __ mv(c_rarg0, 0);
2591     __ leave();
2592     __ ret();
2593 
2594   __ bind(L_aes192);
2595     // Note: the following function performs key += 13*16
2596     aes_load_keys(key, working_vregs, 13);
2597     aes_decrypt(res, working_vregs, 13);
2598     __ vse32_v(res, to);
2599     __ mv(c_rarg0, 0);
2600     __ leave();
2601     __ ret();
2602 
2603   __ bind(L_aes128);
2604     // Note: the following function performs key += 11*16
2605     aes_load_keys(key, working_vregs, 11);
2606     aes_decrypt(res, working_vregs, 11);
2607     __ vse32_v(res, to);
2608     __ mv(c_rarg0, 0);
2609     __ leave();
2610     __ ret();
2611 
2612     return start;
2613   }
2614 
2615   void cipherBlockChaining_encryptAESCrypt(int round, Register from, Register to, Register key,
2616                                            Register rvec, Register input_len) {
2617     const Register len = x29;
2618 
2619     VectorRegister working_vregs[] = {
2620       v1, v2, v3, v4, v5, v6, v7, v8,
2621       v9, v10, v11, v12, v13, v14, v15
2622     };
2623 
2624     const unsigned int BLOCK_SIZE = 16;
2625 
2626     __ mv(len, input_len);
2627     // load init rvec
2628     __ vsetivli(x0, 4, Assembler::e32, Assembler::m1);
2629     __ vle32_v(v16, rvec);
2630 
2631     aes_load_keys(key, working_vregs, round);
2632     Label L_enc_loop;
2633     __ bind(L_enc_loop);
2634     // Encrypt from source by block size
2635       __ vle32_v(v17, from);
2636       __ addi(from, from, BLOCK_SIZE);
2637       __ vxor_vv(v16, v16, v17);
2638       aes_encrypt(v16, working_vregs, round);
2639       __ vse32_v(v16, to);
2640       __ addi(to, to, BLOCK_SIZE);
2641       __ subi(len, len, BLOCK_SIZE);
2642       __ bnez(len, L_enc_loop);
2643 
2644     // save current rvec and return
2645     __ vse32_v(v16, rvec);
2646     __ mv(x10, input_len);
2647     __ leave();
2648     __ ret();
2649   }
2650 
2651   // Arguments:
2652   //
2653   // Inputs:
2654   //   c_rarg0   - source byte array address
2655   //   c_rarg1   - destination byte array address
2656   //   c_rarg2   - K (key) in little endian int array
2657   //   c_rarg3   - r vector byte array address
2658   //   c_rarg4   - input length
2659   //
2660   // Output:
2661   //   x10       - input length
2662   //
2663   address generate_cipherBlockChaining_encryptAESCrypt() {
2664     assert(UseAESIntrinsics, "need AES instructions (Zvkned extension) support");
2665     __ align(CodeEntryAlignment);
2666     StubId stub_id = StubId::stubgen_cipherBlockChaining_encryptAESCrypt_id;
2667     StubCodeMark mark(this, stub_id);
2668 
2669     const Register from       = c_rarg0;
2670     const Register to         = c_rarg1;
2671     const Register key        = c_rarg2;
2672     const Register rvec       = c_rarg3;
2673     const Register input_len  = c_rarg4;
2674 
2675     const Register keylen     = x28;
2676 
2677     address start = __ pc();
2678     __ enter();
2679 
2680     Label L_aes128, L_aes192;
2681     // Compute #rounds for AES based on the length of the key array
2682     __ lwu(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2683     __ mv(t0, 52);
2684     __ bltu(keylen, t0, L_aes128);
2685     __ beq(keylen, t0, L_aes192);
2686     // Else we fallthrough to the biggest case (256-bit key size)
2687 
2688     // Note: the following function performs key += 15*16
2689     cipherBlockChaining_encryptAESCrypt(15, from, to, key, rvec, input_len);
2690 
2691     // Note: the following function performs key += 11*16
2692     __ bind(L_aes128);
2693     cipherBlockChaining_encryptAESCrypt(11, from, to, key, rvec, input_len);
2694 
2695     // Note: the following function performs key += 13*16
2696     __ bind(L_aes192);
2697     cipherBlockChaining_encryptAESCrypt(13, from, to, key, rvec, input_len);
2698 
2699     return start;
2700   }
2701 
2702   void cipherBlockChaining_decryptAESCrypt(int round, Register from, Register to, Register key,
2703                                            Register rvec, Register input_len) {
2704     const Register len = x29;
2705 
2706     VectorRegister working_vregs[] = {
2707       v1, v2, v3, v4, v5, v6, v7, v8,
2708       v9, v10, v11, v12, v13, v14, v15
2709     };
2710 
2711     const unsigned int BLOCK_SIZE = 16;
2712 
2713     __ mv(len, input_len);
2714     // load init rvec
2715     __ vsetivli(x0, 4, Assembler::e32, Assembler::m1);
2716     __ vle32_v(v16, rvec);
2717 
2718     aes_load_keys(key, working_vregs, round);
2719     Label L_dec_loop;
2720     // Decrypt from source by block size
2721     __ bind(L_dec_loop);
2722       __ vle32_v(v17, from);
2723       __ addi(from, from, BLOCK_SIZE);
2724       __ vmv_v_v(v18, v17);
2725       aes_decrypt(v17, working_vregs, round);
2726       __ vxor_vv(v17, v17, v16);
2727       __ vse32_v(v17, to);
2728       __ vmv_v_v(v16, v18);
2729       __ addi(to, to, BLOCK_SIZE);
2730       __ subi(len, len, BLOCK_SIZE);
2731       __ bnez(len, L_dec_loop);
2732 
2733     // save current rvec and return
2734     __ vse32_v(v16, rvec);
2735     __ mv(x10, input_len);
2736     __ leave();
2737     __ ret();
2738   }
2739 
2740   // Arguments:
2741   //
2742   // Inputs:
2743   //   c_rarg0   - source byte array address
2744   //   c_rarg1   - destination byte array address
2745   //   c_rarg2   - K (key) in little endian int array
2746   //   c_rarg3   - r vector byte array address
2747   //   c_rarg4   - input length
2748   //
2749   // Output:
2750   //   x10       - input length
2751   //
2752   address generate_cipherBlockChaining_decryptAESCrypt() {
2753     assert(UseAESIntrinsics, "need AES instructions (Zvkned extension) support");
2754     __ align(CodeEntryAlignment);
2755     StubId stub_id = StubId::stubgen_cipherBlockChaining_decryptAESCrypt_id;
2756     StubCodeMark mark(this, stub_id);
2757 
2758     const Register from        = c_rarg0;
2759     const Register to          = c_rarg1;
2760     const Register key         = c_rarg2;
2761     const Register rvec        = c_rarg3;
2762     const Register input_len   = c_rarg4;
2763 
2764     const Register keylen      = x28;
2765 
2766     address start = __ pc();
2767     __ enter();
2768 
2769     Label L_aes128, L_aes192, L_aes128_loop, L_aes192_loop, L_aes256_loop;
2770     // Compute #rounds for AES based on the length of the key array
2771     __ lwu(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2772     __ mv(t0, 52);
2773     __ bltu(keylen, t0, L_aes128);
2774     __ beq(keylen, t0, L_aes192);
2775     // Else we fallthrough to the biggest case (256-bit key size)
2776 
2777     // Note: the following function performs key += 15*16
2778     cipherBlockChaining_decryptAESCrypt(15, from, to, key, rvec, input_len);
2779 
2780     // Note: the following function performs key += 11*16
2781     __ bind(L_aes128);
2782     cipherBlockChaining_decryptAESCrypt(11, from, to, key, rvec, input_len);
2783 
2784     // Note: the following function performs key += 13*16
2785     __ bind(L_aes192);
2786     cipherBlockChaining_decryptAESCrypt(13, from, to, key, rvec, input_len);
2787 
2788     return start;
2789   }
2790 
2791   // Load big-endian 128-bit from memory.
2792   void be_load_counter_128(Register counter_hi, Register counter_lo, Register counter) {
2793     __ ld(counter_lo, Address(counter, 8)); // Load 128-bits from counter
2794     __ ld(counter_hi, Address(counter));
2795     __ rev8(counter_lo, counter_lo);        // Convert big-endian to little-endian
2796     __ rev8(counter_hi, counter_hi);
2797   }
2798 
  // Increment a 128-bit counter, held as two little-endian 64-bit halves, by one (with carry).
2800   void add_counter_128(Register counter_hi, Register counter_lo) {
2801     assert_different_registers(counter_hi, counter_lo, t0);
2802     __ addi(counter_lo, counter_lo, 1);
2803     __ seqz(t0, counter_lo);                // Check for result overflow
2804     __ add(counter_hi, counter_hi, t0);     // Add 1 if overflow otherwise 0
2805   }
2806 
2807   // Store big-endian 128-bit to memory.
2808   void be_store_counter_128(Register counter_hi, Register counter_lo, Register counter) {
2809     assert_different_registers(counter_hi, counter_lo, t0, t1);
2810     __ rev8(t0, counter_lo);                // Convert little-endian to big-endian
2811     __ rev8(t1, counter_hi);
2812     __ sd(t0, Address(counter, 8));         // Store 128-bits to counter
2813     __ sd(t1, Address(counter));
2814   }
2815 
2816   void counterMode_AESCrypt(int round, Register in, Register out, Register key, Register counter,
2817                             Register input_len,  Register saved_encrypted_ctr, Register used_ptr) {
2818     // Algorithm:
2819     //
2820     //   aes_load_keys();
2821     //   load_counter_128(counter_hi, counter_lo, counter);
2822     //
2823     //   L_next:
2824     //     if (used >= BLOCK_SIZE) goto L_main_loop;
2825     //
2826     //   L_encrypt_next:
2827     //       *out = *in ^ saved_encrypted_ctr[used]);
2828     //       out++; in++; used++; len--;
2829     //       if (len == 0) goto L_exit;
2830     //       goto L_next;
2831     //
2832     //   L_main_loop:
2833     //     if (len == 0) goto L_exit;
2834     //     saved_encrypted_ctr = aes_encrypt(counter);
2835     //
2836     //     add_counter_128(counter_hi, counter_lo);
2837     //     be_store_counter_128(counter_hi, counter_lo, counter);
2838     //     used = 0;
2839     //
2840     //     if(len < BLOCK_SIZE) goto L_encrypt_next;
2841     //
2842     //     v_in = load_16Byte(in);
2843     //     v_out = load_16Byte(out);
2844     //     v_saved_encrypted_ctr = load_16Byte(saved_encrypted_ctr);
2845     //     v_out = v_in ^ v_saved_encrypted_ctr;
2846     //     out += BLOCK_SIZE;
2847     //     in += BLOCK_SIZE;
2848     //     len -= BLOCK_SIZE;
2849     //     used = BLOCK_SIZE;
2850     //     goto L_main_loop;
2851     //
2852     //
2853     //   L_exit:
2854     //     store(used);
2855     //     result = input_len
2856     //     return result;
2857 
2858     const Register used          = x28;
2859     const Register len           = x29;
2860     const Register counter_hi    = x30;
2861     const Register counter_lo    = x31;
2862     const Register block_size    = t2;
2863 
2864     const unsigned int BLOCK_SIZE = 16;
2865 
2866     VectorRegister working_vregs[] = {
2867       v1, v2, v3, v4, v5, v6, v7, v8,
2868       v9, v10, v11, v12, v13, v14, v15
2869     };
2870 
2871     __ vsetivli(x0, 4, Assembler::e32, Assembler::m1);
2872 
2873     __ lwu(used, Address(used_ptr));
2874     __ mv(len, input_len);
2875     __ mv(block_size, BLOCK_SIZE);
2876 
2877     // load keys to working_vregs according to round
2878     aes_load_keys(key, working_vregs, round);
2879 
2880     // 128-bit big-endian load
2881     be_load_counter_128(counter_hi, counter_lo, counter);
2882 
2883     Label L_next, L_encrypt_next, L_main_loop, L_exit;
2884     // Check the last saved_encrypted_ctr used value, we fall through
2885     // to L_encrypt_next when the used value lower than block_size
2886     __ bind(L_next);
2887     __ bgeu(used, block_size, L_main_loop);
2888 
2889     // There is still data left fewer than block_size after L_main_loop
2890     // or last used, we encrypt them one by one.
2891     __ bind(L_encrypt_next);
2892     __ add(t0, saved_encrypted_ctr, used);
2893     __ lbu(t1, Address(t0));
2894     __ lbu(t0, Address(in));
2895     __ xorr(t1, t1, t0);
2896     __ sb(t1, Address(out));
2897     __ addi(in, in, 1);
2898     __ addi(out, out, 1);
2899     __ addi(used, used, 1);
2900     __ subi(len, len, 1);
2901     __ beqz(len, L_exit);
2902     __ j(L_next);
2903 
2904     // We will calculate the next saved_encrypted_ctr and encrypt the blocks of data
2905     // one by one until there is less than a full block remaining if len not zero
2906     __ bind(L_main_loop);
2907     __ beqz(len, L_exit);
2908     __ vle32_v(v16, counter);
2909 
2910     // encrypt counter according to round
2911     aes_encrypt(v16, working_vregs, round);
2912 
2913     __ vse32_v(v16, saved_encrypted_ctr);
2914 
2915     // 128-bit little-endian increment
2916     add_counter_128(counter_hi, counter_lo);
2917     // 128-bit big-endian store
2918     be_store_counter_128(counter_hi, counter_lo, counter);
2919 
2920     __ mv(used, 0);
2921     // Check if we have a full block_size
2922     __ bltu(len, block_size, L_encrypt_next);
2923 
2924     // We have one full block to encrypt at least
2925     __ vle32_v(v17, in);
2926     __ vxor_vv(v16, v16, v17);
2927     __ vse32_v(v16, out);
2928     __ add(out, out, block_size);
2929     __ add(in, in, block_size);
2930     __ sub(len, len, block_size);
2931     __ mv(used, block_size);
2932     __ j(L_main_loop);
2933 
2934     __ bind(L_exit);
2935     __ sw(used, Address(used_ptr));
2936     __ mv(x10, input_len);
2937     __ leave();
2938     __ ret();
2939   };
2940 
2941   // CTR AES crypt.
2942   // Arguments:
2943   //
2944   // Inputs:
2945   //   c_rarg0   - source byte array address
2946   //   c_rarg1   - destination byte array address
2947   //   c_rarg2   - K (key) in little endian int array
2948   //   c_rarg3   - counter vector byte array address
2949   //   c_rarg4   - input length
2950   //   c_rarg5   - saved encryptedCounter start
2951   //   c_rarg6   - saved used length
2952   //
2953   // Output:
2954   //   x10       - input length
2955   //
2956   address generate_counterMode_AESCrypt() {
2957     assert(UseAESCTRIntrinsics, "need AES instructions (Zvkned extension) and Zbb extension support");
2958 
2959     __ align(CodeEntryAlignment);
2960     StubId stub_id = StubId::stubgen_counterMode_AESCrypt_id;
2961     StubCodeMark mark(this, stub_id);
2962 
2963     const Register in                  = c_rarg0;
2964     const Register out                 = c_rarg1;
2965     const Register key                 = c_rarg2;
2966     const Register counter             = c_rarg3;
2967     const Register input_len           = c_rarg4;
2968     const Register saved_encrypted_ctr = c_rarg5;
2969     const Register used_len_ptr        = c_rarg6;
2970 
2971     const Register keylen              = c_rarg7; // temporary register
2972 
2973     const address start = __ pc();
2974     __ enter();
2975 
2976     Label L_exit;
2977     __ beqz(input_len, L_exit);
2978 
2979     Label L_aes128, L_aes192;
2980     // Compute #rounds for AES based on the length of the key array
2981     __ lwu(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2982     __ mv(t0, 52); // key length could be only {11, 13, 15} * 4 = {44, 52, 60}
2983     __ bltu(keylen, t0, L_aes128);
2984     __ beq(keylen, t0, L_aes192);
2985     // Else we fallthrough to the biggest case (256-bit key size)
2986 
2987     // Note: the following function performs crypt with key += 15*16
2988     counterMode_AESCrypt(15, in, out, key, counter, input_len, saved_encrypted_ctr, used_len_ptr);
2989 
2990     // Note: the following function performs crypt with key += 13*16
2991     __ bind(L_aes192);
2992     counterMode_AESCrypt(13, in, out, key, counter, input_len, saved_encrypted_ctr, used_len_ptr);
2993 
2994     // Note: the following function performs crypt with key += 11*16
2995     __ bind(L_aes128);
2996     counterMode_AESCrypt(11, in, out, key, counter, input_len, saved_encrypted_ctr, used_len_ptr);
2997 
2998     __ bind(L_exit);
2999     __ mv(x10, input_len);
3000     __ leave();
3001     __ ret();
3002 
3003     return start;
3004   }
3005 
3006   void ghash_loop(Register state, Register subkeyH, Register data, Register blocks,
3007                   VectorRegister vtmp1, VectorRegister vtmp2, VectorRegister vtmp3) {
3008     VectorRegister partial_hash = vtmp1;
3009     VectorRegister hash_subkey  = vtmp2;
3010     VectorRegister cipher_text  = vtmp3;
3011 
3012     const unsigned int BLOCK_SIZE = 16;
3013 
3014     __ vsetivli(x0, 2, Assembler::e64, Assembler::m1);
3015     __ vle64_v(hash_subkey, subkeyH);
3016     __ vrev8_v(hash_subkey, hash_subkey);
3017     __ vle64_v(partial_hash, state);
3018     __ vrev8_v(partial_hash, partial_hash);
3019 
3020     __ vsetivli(x0, 4, Assembler::e32, Assembler::m1);
3021     Label L_ghash_loop;
3022     __ bind(L_ghash_loop);
3023       __ vle32_v(cipher_text, data);
3024       __ addi(data, data, BLOCK_SIZE);
3025       __ vghsh_vv(partial_hash, hash_subkey, cipher_text);
3026       __ subi(blocks, blocks, 1);
3027       __ bnez(blocks, L_ghash_loop);
3028 
3029     __ vsetivli(x0, 2, Assembler::e64, Assembler::m1);
3030     __ vrev8_v(partial_hash, partial_hash);
3031     __ vse64_v(partial_hash, state);
3032   }
3033 
3034   /**
3035    *  Arguments:
3036    *
3037    *  Input:
3038    *  c_rarg0   - current state address
3039    *  c_rarg1   - H key address
3040    *  c_rarg2   - data address
3041    *  c_rarg3   - number of blocks
3042    *
3043    *  Output:
3044    *  Updated state at c_rarg0
3045    */
3046   address generate_ghash_processBlocks() {
3047     assert(UseGHASHIntrinsics, "need GHASH instructions (Zvkg extension) and Zvbb support");
3048 
3049     __ align(CodeEntryAlignment);
3050     StubId stub_id = StubId::stubgen_ghash_processBlocks_id;
3051     StubCodeMark mark(this, stub_id);
3052 
3053     address start = __ pc();
3054     __ enter();
3055 
3056     Register state   = c_rarg0;
3057     Register subkeyH = c_rarg1;
3058     Register data    = c_rarg2;
3059     Register blocks  = c_rarg3;
3060 
3061     VectorRegister vtmp1 = v1;
3062     VectorRegister vtmp2 = v2;
3063     VectorRegister vtmp3 = v3;
3064 
3065     ghash_loop(state, subkeyH, data, blocks, vtmp1, vtmp2, vtmp3);
3066 
3067     __ leave();
3068     __ ret();
3069 
3070     return start;
3071   }
3072 
3073   // code for comparing 8 characters of strings with Latin1 and Utf16 encoding
3074   void compare_string_8_x_LU(Register tmpL, Register tmpU,
3075                              Register strL, Register strU, Label& DIFF) {
3076     const Register tmp = x30, tmpLval = x12;
3077 
3078     int base_offset = arrayOopDesc::base_offset_in_bytes(T_BYTE);
3079     assert((base_offset % (UseCompactObjectHeaders ? 4 : 8)) == 0, "Must be");
3080 
3081 #ifdef ASSERT
3082     if (AvoidUnalignedAccesses) {
3083       Label align_ok;
3084       __ andi(t0, strL, 0x7);
3085       __ beqz(t0, align_ok);
3086       __ stop("bad alignment");
3087       __ bind(align_ok);
3088     }
3089 #endif
3090     __ ld(tmpLval, Address(strL));
3091     __ addi(strL, strL, wordSize);
3092 
3093     // compare first 4 characters
3094     __ load_long_misaligned(tmpU, Address(strU), tmp, (base_offset % 8) != 0 ? 4 : 8);
3095     __ addi(strU, strU, wordSize);
3096     __ inflate_lo32(tmpL, tmpLval);
3097     __ xorr(tmp, tmpU, tmpL);
3098     __ bnez(tmp, DIFF);
3099 
3100     // compare second 4 characters
3101     __ load_long_misaligned(tmpU, Address(strU), tmp, (base_offset % 8) != 0 ? 4 : 8);
3102     __ addi(strU, strU, wordSize);
3103     __ inflate_hi32(tmpL, tmpLval);
3104     __ xorr(tmp, tmpU, tmpL);
3105     __ bnez(tmp, DIFF);
3106   }
3107 
3108   // x10  = result
3109   // x11  = str1
3110   // x12  = cnt1
3111   // x13  = str2
3112   // x14  = cnt2
3113   // x28  = tmp1
3114   // x29  = tmp2
3115   // x30  = tmp3
3116   address generate_compare_long_string_different_encoding(StubId stub_id) {
3117     bool isLU;
3118     switch (stub_id) {
3119     case StubId::stubgen_compare_long_string_LU_id:
3120       isLU = true;
3121       break;
3122     case StubId::stubgen_compare_long_string_UL_id:
3123       isLU = false;
3124       break;
3125     default:
3126       ShouldNotReachHere();
3127     };
3128     __ align(CodeEntryAlignment);
3129     StubCodeMark mark(this, stub_id);
3130     address entry = __ pc();
3131     Label SMALL_LOOP, TAIL, LOAD_LAST, DONE, CALCULATE_DIFFERENCE;
3132     const Register result = x10, str1 = x11, str2 = x13, cnt2 = x14,
3133                    tmp1 = x28, tmp2 = x29, tmp3 = x30, tmp4 = x12;
3134 
3135     int base_offset = arrayOopDesc::base_offset_in_bytes(T_BYTE);
3136     assert((base_offset % (UseCompactObjectHeaders ? 4 : 8)) == 0, "Must be");
3137 
3138     Register strU = isLU ? str2 : str1,
3139              strL = isLU ? str1 : str2,
3140              tmpU = isLU ? tmp2 : tmp1, // where to keep U for comparison
3141              tmpL = isLU ? tmp1 : tmp2; // where to keep L for comparison
3142 
3143     if (AvoidUnalignedAccesses && (base_offset % 8) != 0) {
3144       // Load 4 bytes from strL to make sure main loop is 8-byte aligned
3145       // cnt2 is >= 68 here, no need to check it for >= 0
3146       __ lwu(tmpL, Address(strL));
3147       __ addi(strL, strL, wordSize / 2);
3148       __ load_long_misaligned(tmpU, Address(strU), tmp4, (base_offset % 8) != 0 ? 4 : 8);
3149       __ addi(strU, strU, wordSize);
3150       __ inflate_lo32(tmp3, tmpL);
3151       __ mv(tmpL, tmp3);
3152       __ xorr(tmp3, tmpU, tmpL);
3153       __ bnez(tmp3, CALCULATE_DIFFERENCE);
3154       __ subi(cnt2, cnt2, wordSize / 2);
3155     }
3156 
3157     // we are now 8-bytes aligned on strL when AvoidUnalignedAccesses is true
3158     __ subi(cnt2, cnt2, wordSize * 2);
3159     __ bltz(cnt2, TAIL);
3160     __ bind(SMALL_LOOP); // smaller loop
3161       __ subi(cnt2, cnt2, wordSize * 2);
3162       compare_string_8_x_LU(tmpL, tmpU, strL, strU, CALCULATE_DIFFERENCE);
3163       compare_string_8_x_LU(tmpL, tmpU, strL, strU, CALCULATE_DIFFERENCE);
3164       __ bgez(cnt2, SMALL_LOOP);
3165       __ addi(t0, cnt2, wordSize * 2);
3166       __ beqz(t0, DONE);
3167     __ bind(TAIL);  // 1..15 characters left
3168       // Aligned access. Load bytes in portions - 4, 2, 1.
3169 
3170       __ addi(t0, cnt2, wordSize);
3171       __ addi(cnt2, cnt2, wordSize * 2); // amount of characters left to process
3172       __ bltz(t0, LOAD_LAST);
3173       // remaining characters are greater than or equals to 8, we can do one compare_string_8_x_LU
3174       compare_string_8_x_LU(tmpL, tmpU, strL, strU, CALCULATE_DIFFERENCE);
3175       __ subi(cnt2, cnt2, wordSize);
3176       __ beqz(cnt2, DONE);  // no character left
3177       __ bind(LOAD_LAST);   // cnt2 = 1..7 characters left
3178 
3179       __ subi(cnt2, cnt2, wordSize); // cnt2 is now an offset in strL which points to last 8 bytes
3180       __ slli(t0, cnt2, 1);     // t0 is now an offset in strU which points to last 16 bytes
3181       __ add(strL, strL, cnt2); // Address of last 8 bytes in Latin1 string
3182       __ add(strU, strU, t0);   // Address of last 16 bytes in UTF-16 string
3183       __ load_int_misaligned(tmpL, Address(strL), t0, false);
3184       __ load_long_misaligned(tmpU, Address(strU), t0, 2);
3185       __ inflate_lo32(tmp3, tmpL);
3186       __ mv(tmpL, tmp3);
3187       __ xorr(tmp3, tmpU, tmpL);
3188       __ bnez(tmp3, CALCULATE_DIFFERENCE);
3189 
3190       __ addi(strL, strL, wordSize / 2); // Address of last 4 bytes in Latin1 string
3191       __ addi(strU, strU, wordSize);   // Address of last 8 bytes in UTF-16 string
3192       __ load_int_misaligned(tmpL, Address(strL), t0, false);
3193       __ load_long_misaligned(tmpU, Address(strU), t0, 2);
3194       __ inflate_lo32(tmp3, tmpL);
3195       __ mv(tmpL, tmp3);
3196       __ xorr(tmp3, tmpU, tmpL);
3197       __ bnez(tmp3, CALCULATE_DIFFERENCE);
3198       __ j(DONE); // no character left
3199 
3200       // Find the first different characters in the longwords and
3201       // compute their difference.
3202     __ bind(CALCULATE_DIFFERENCE);
3203       // count bits of trailing zero chars
3204       __ ctzc_bits(tmp4, tmp3);
3205       __ srl(tmp1, tmp1, tmp4);
3206       __ srl(tmp2, tmp2, tmp4);
3207       __ zext(tmp1, tmp1, 16);
3208       __ zext(tmp2, tmp2, 16);
3209       __ sub(result, tmp1, tmp2);
3210     __ bind(DONE);
3211       __ ret();
3212     return entry;
3213   }
3214 
3215   address generate_method_entry_barrier() {
3216     __ align(CodeEntryAlignment);
3217     StubId stub_id = StubId::stubgen_method_entry_barrier_id;
3218     StubCodeMark mark(this, stub_id);
3219 
3220     Label deoptimize_label;
3221 
3222     address start = __ pc();
3223 
3224     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
3225 
3226     if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
3227       BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
3228       Address thread_epoch_addr(xthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
3229       __ la(t1, ExternalAddress(bs_asm->patching_epoch_addr()));
3230       __ lwu(t1, t1);
3231       __ sw(t1, thread_epoch_addr);
3232       // There are two ways this can work:
3233       // - The writer did system icache shootdown after the instruction stream update.
3234       //   Hence do nothing.
3235       // - The writer trust us to make sure our icache is in sync before entering.
3236       //   Hence use cmodx fence (fence.i, may change).
3237       if (UseCtxFencei) {
3238         __ cmodx_fence();
3239       }
3240       __ membar(__ LoadLoad);
3241     }
3242 
3243     __ set_last_Java_frame(sp, fp, ra);
3244 
3245     __ enter();
3246     __ addi(t1, sp, wordSize);
3247 
3248     __ subi(sp, sp, 4 * wordSize);
3249 
3250     __ push_call_clobbered_registers();
3251 
3252     __ mv(c_rarg0, t1);
3253     __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
3254 
3255     __ reset_last_Java_frame(true);
3256 
3257     __ mv(t0, x10);
3258 
3259     __ pop_call_clobbered_registers();
3260 
3261     __ bnez(t0, deoptimize_label);
3262 
3263     __ leave();
3264     __ ret();
3265 
3266     __ BIND(deoptimize_label);
3267 
3268     __ ld(t0, Address(sp, 0));
3269     __ ld(fp, Address(sp, wordSize));
3270     __ ld(ra, Address(sp, wordSize * 2));
3271     __ ld(t1, Address(sp, wordSize * 3));
3272 
3273     __ mv(sp, t0);
3274     __ jr(t1);
3275 
3276     return start;
3277   }
3278 
  // Generates the out-of-line stub that compares two long strings sharing one
  // encoding: Latin1/Latin1 for the LL stub id, UTF-16/UTF-16 for the UU id.
  // Returns the entry address of the generated code.
  //
  // Register usage:
  // x10  = result
  // x11  = str1
  // x12  = cnt1
  // x13  = str2
  // x14  = cnt2
  // x28  = tmp1
  // x29  = tmp2
  // x30  = tmp3
  // x7   = tmp4 (saved on entry, restored on exit)
  // x31  = tmp5 (saved on entry, restored on exit)
  //
  // NOTE(review): the code compares tmp1/tmp2 before loading them and never
  // writes result on the "all equal" path, so the caller presumably preloads
  // tmp1/tmp2 with the first 8 bytes of each string and result with the
  // length difference -- confirm against the call site.
  address generate_compare_long_string_same_encoding(StubId stub_id) {
    bool isLL;
    switch (stub_id) {
    case StubId::stubgen_compare_long_string_LL_id:
      isLL = true;
      break;
    case StubId::stubgen_compare_long_string_UU_id:
      isLL = false;
      break;
    default:
      ShouldNotReachHere();
    };
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, stub_id);
    address entry = __ pc();
    Label SMALL_LOOP, CHECK_LAST, DIFF2, TAIL,
          LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF;
    const Register result = x10, str1 = x11, cnt1 = x12, str2 = x13, cnt2 = x14,
                   tmp1 = x28, tmp2 = x29, tmp3 = x30, tmp4 = x7, tmp5 = x31;
    // tmp4 and tmp5 are pushed below and popped at LENGTH_DIFF.
    RegSet spilled_regs = RegSet::of(tmp4, tmp5);

    // cnt1/cnt2 contains amount of characters to compare. cnt1 can be re-used
    // update cnt2 counter with already loaded 8 bytes
    __ subi(cnt2, cnt2, wordSize / (isLL ? 1 : 2));
    // update pointers, because of previous read
    __ addi(str1, str1, wordSize);
    __ addi(str2, str2, wordSize);
    // less than 16 bytes left?
    // (cnt2 stays biased by -16 bytes' worth of characters while in the loop.)
    __ subi(cnt2, cnt2, isLL ? 16 : 8);
    __ push_reg(spilled_regs, sp);
    __ bltz(cnt2, TAIL);
    __ bind(SMALL_LOOP);
      // compare 16 bytes of strings with same encoding
      // Each half loads the next 8 bytes of both strings and, interleaved with
      // those loads, checks the pair that was loaded one step earlier.
      __ ld(tmp5, Address(str1));
      __ addi(str1, str1, 8);
      __ xorr(tmp4, tmp1, tmp2);
      __ ld(cnt1, Address(str2));
      __ addi(str2, str2, 8);
      __ bnez(tmp4, DIFF);
      __ ld(tmp1, Address(str1));
      __ addi(str1, str1, 8);
      __ xorr(tmp4, tmp5, cnt1);
      __ ld(tmp2, Address(str2));
      __ addi(str2, str2, 8);
      __ bnez(tmp4, DIFF2);

      __ subi(cnt2, cnt2, isLL ? 16 : 8);
      __ bgez(cnt2, SMALL_LOOP);
    __ bind(TAIL);
      // Undo the loop bias: cnt2 is again the count of characters not loaded yet.
      __ addi(cnt2, cnt2, isLL ? 16 : 8);
      // Nothing left to load: only the pending tmp1/tmp2 pair needs checking.
      __ beqz(cnt2, LAST_CHECK_AND_LENGTH_DIFF);
      __ subi(cnt2, cnt2, isLL ? 8 : 4);
      // At most one more 8-byte word remains.
      __ blez(cnt2, CHECK_LAST);
      // More than 8 bytes remain: check the pending pair, load one more full word.
      __ xorr(tmp4, tmp1, tmp2);
      __ bnez(tmp4, DIFF);
      __ ld(tmp1, Address(str1));
      __ addi(str1, str1, 8);
      __ ld(tmp2, Address(str2));
      __ addi(str2, str2, 8);
      __ subi(cnt2, cnt2, isLL ? 8 : 4);
    __ bind(CHECK_LAST);
      if (!isLL) {
        __ add(cnt2, cnt2, cnt2); // now in bytes
      }
      __ xorr(tmp4, tmp1, tmp2);
      __ bnez(tmp4, DIFF);
      // cnt2 is <= 0 on both paths into CHECK_LAST, so the pointers step back;
      // presumably this makes the final (possibly misaligned) 8-byte load end
      // exactly at the last character, overlapping bytes already compared.
      __ add(str1, str1, cnt2);
      __ load_long_misaligned(tmp5, Address(str1), tmp3, isLL ? 1 : 2);
      __ add(str2, str2, cnt2);
      __ load_long_misaligned(cnt1, Address(str2), tmp3, isLL ? 1 : 2);
      __ xorr(tmp4, tmp5, cnt1);
      __ beqz(tmp4, LENGTH_DIFF);
      // Find the first different characters in the longwords and
      // compute their difference.
    __ bind(DIFF2);
      // Mismatch is in the tmp5/cnt1 pair; tmp4 holds their XOR.
      // count bits of trailing zero chars
      __ ctzc_bits(tmp3, tmp4, isLL);
      __ srl(tmp5, tmp5, tmp3);
      __ srl(cnt1, cnt1, tmp3);
      if (isLL) {
        __ zext(tmp5, tmp5, 8);
        __ zext(cnt1, cnt1, 8);
      } else {
        __ zext(tmp5, tmp5, 16);
        __ zext(cnt1, cnt1, 16);
      }
      __ sub(result, tmp5, cnt1);
      __ j(LENGTH_DIFF);
    __ bind(DIFF);
      // Mismatch is in the tmp1/tmp2 pair; tmp4 holds their XOR.
      // count bits of trailing zero chars
      __ ctzc_bits(tmp3, tmp4, isLL);
      __ srl(tmp1, tmp1, tmp3);
      __ srl(tmp2, tmp2, tmp3);
      if (isLL) {
        __ zext(tmp1, tmp1, 8);
        __ zext(tmp2, tmp2, 8);
      } else {
        __ zext(tmp1, tmp1, 16);
        __ zext(tmp2, tmp2, 16);
      }
      __ sub(result, tmp1, tmp2);
      __ j(LENGTH_DIFF);
    __ bind(LAST_CHECK_AND_LENGTH_DIFF);
      __ xorr(tmp4, tmp1, tmp2);
      __ bnez(tmp4, DIFF);
    __ bind(LENGTH_DIFF);
      // Common exit: result is left as it was unless DIFF/DIFF2 overwrote it.
      __ pop_reg(spilled_regs, sp);
      __ ret();
    return entry;
  }
3398 
3399   void generate_compare_long_strings() {
3400     StubRoutines::riscv::_compare_long_string_LL = generate_compare_long_string_same_encoding(StubId::stubgen_compare_long_string_LL_id);
3401     StubRoutines::riscv::_compare_long_string_UU = generate_compare_long_string_same_encoding(StubId::stubgen_compare_long_string_UU_id);
3402     StubRoutines::riscv::_compare_long_string_LU = generate_compare_long_string_different_encoding(StubId::stubgen_compare_long_string_LU_id);
3403     StubRoutines::riscv::_compare_long_string_UL = generate_compare_long_string_different_encoding(StubId::stubgen_compare_long_string_UL_id);
3404   }
3405 
  // Generates the out-of-line linear-scan indexOf stub for one encoding pair
  // and returns its entry address. The stub id selects the encodings:
  //   ll - needle Latin1, haystack Latin1
  //   ul - needle Latin1, haystack UTF-16
  //   uu - needle UTF-16, haystack UTF-16
  //
  // x10 result
  // x11 src
  // x12 src count
  // x13 pattern
  // x14 pattern count
  //
  // x20-x25, x28 and x29 are used as temporaries; they are pushed on entry and
  // popped again before returning. On the no-match path result is set to -1.
  //
  // NOTE(review): result is only adjusted incrementally below and never
  // initialized, so the caller presumably passes it in as zero -- confirm.
  address generate_string_indexof_linear(StubId stub_id)
  {
    bool needle_isL;
    bool haystack_isL;
    switch (stub_id) {
    case StubId::stubgen_string_indexof_linear_ll_id:
      needle_isL = true;
      haystack_isL = true;
      break;
    case StubId::stubgen_string_indexof_linear_ul_id:
      needle_isL = true;
      haystack_isL = false;
      break;
    case StubId::stubgen_string_indexof_linear_uu_id:
      needle_isL = false;
      haystack_isL = false;
      break;
    default:
      ShouldNotReachHere();
    };

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, stub_id);
    address entry = __ pc();

    // Character sizes in bytes and the matching log2 shifts.
    int needle_chr_size = needle_isL ? 1 : 2;
    int haystack_chr_size = haystack_isL ? 1 : 2;
    int needle_chr_shift = needle_isL ? 0 : 1;
    int haystack_chr_shift = haystack_isL ? 0 : 1;
    bool isL = needle_isL && haystack_isL;
    // parameters
    Register result = x10, haystack = x11, haystack_len = x12, needle = x13, needle_len = x14;
    // temporary registers
    Register mask1 = x20, match_mask = x21, first = x22, trailing_zeros = x23, mask2 = x24, tmp = x25;
    // redefinitions
    Register ch1 = x28, ch2 = x29;
    RegSet spilled_regs = RegSet::range(x20, x25) + RegSet::range(x28, x29);

    __ push_reg(spilled_regs, sp);

    Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
          L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
          L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
          L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
          L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
          L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;

    // Load the first 8 bytes of the needle and of the haystack.
    __ ld(ch1, Address(needle));
    __ ld(ch2, Address(haystack));
    // src.length - pattern.length
    __ sub(haystack_len, haystack_len, needle_len);

    // first is needle[0]
    __ zext(first, ch1, needle_isL ? 8 : 16);

    // Multiplying by 0x01..01 / 0x0001..0001 replicates needle[0] into every
    // haystack-sized character lane of "first".
    uint64_t mask0101 = UCONST64(0x0101010101010101);
    uint64_t mask0001 = UCONST64(0x0001000100010001);
    __ mv(mask1, haystack_isL ? mask0101 : mask0001);
    __ mul(first, first, mask1);
    uint64_t mask7f7f = UCONST64(0x7f7f7f7f7f7f7f7f);
    uint64_t mask7fff = UCONST64(0x7fff7fff7fff7fff);
    __ mv(mask2, haystack_isL ? mask7f7f : mask7fff);
    if (needle_isL != haystack_isL) {
      // Mixed encodings: keep the raw needle bytes so they can be inflated into ch1 below.
      __ mv(tmp, ch1);
    }
    // haystack_len now counts candidate start positions beyond the first
    // full 8-byte word; non-positive means a single word covers them all.
    __ subi(haystack_len, haystack_len, wordSize / haystack_chr_size - 1);
    __ blez(haystack_len, L_SMALL);

    if (needle_isL != haystack_isL) {
      __ inflate_lo32(ch1, tmp, match_mask, trailing_zeros);
    }
    // xorr, sub, orr, notr, andr
    // compare and set match_mask[i] with 0x80/0x8000 (Latin1/UTF16) if ch2[i] == first[i]
    // eg:
    // first:        aa aa aa aa aa aa aa aa
    // ch2:          aa aa li nx jd ka aa aa
    // match_mask:   80 80 00 00 00 00 80 80
    __ compute_match_mask(ch2, first, match_mask, mask1, mask2);

    // search first char of needle, if success, goto L_HAS_ZERO;
    __ bnez(match_mask, L_HAS_ZERO);
    __ subi(haystack_len, haystack_len, wordSize / haystack_chr_size);
    __ addi(result, result, wordSize / haystack_chr_size);
    __ addi(haystack, haystack, wordSize);
    __ bltz(haystack_len, L_POST_LOOP);

    // Main loop: scan the haystack one 8-byte word at a time for needle[0].
    __ bind(L_LOOP);
    __ ld(ch2, Address(haystack));
    __ compute_match_mask(ch2, first, match_mask, mask1, mask2);
    __ bnez(match_mask, L_HAS_ZERO);

    __ bind(L_LOOP_PROCEED);
    __ subi(haystack_len, haystack_len, wordSize / haystack_chr_size);
    __ addi(haystack, haystack, wordSize);
    __ addi(result, result, wordSize / haystack_chr_size);
    __ bgez(haystack_len, L_LOOP);

    // Fewer than a full word of start positions remains.
    __ bind(L_POST_LOOP);
    __ mv(ch2, -wordSize / haystack_chr_size);
    __ ble(haystack_len, ch2, NOMATCH); // no extra characters to check
    __ ld(ch2, Address(haystack));
    // Turn the (negative) character count into a positive bit count; it is the
    // shift used at L_SMALL_PROCEED to mask off lanes past the last valid start.
    __ slli(haystack_len, haystack_len, LogBitsPerByte + haystack_chr_shift);
    __ neg(haystack_len, haystack_len);
    // Same xorr/sub/orr prefix as the match-mask computation described above;
    // the notr/andr part is done at L_SMALL_PROCEED.
    __ xorr(ch2, first, ch2);
    __ sub(match_mask, ch2, mask1);
    __ orr(ch2, ch2, mask2);
    __ mv(trailing_zeros, -1); // all bits set
    __ j(L_SMALL_PROCEED);

    // All candidate start positions fit in the first haystack word.
    __ align(OptoLoopAlignment);
    __ bind(L_SMALL);
    __ slli(haystack_len, haystack_len, LogBitsPerByte + haystack_chr_shift);
    __ neg(haystack_len, haystack_len);
    if (needle_isL != haystack_isL) {
      __ inflate_lo32(ch1, tmp, match_mask, trailing_zeros);
    }
    __ xorr(ch2, first, ch2);
    __ sub(match_mask, ch2, mask1);
    __ orr(ch2, ch2, mask2);
    __ mv(trailing_zeros, -1); // all bits set

    __ bind(L_SMALL_PROCEED);
    __ srl(trailing_zeros, trailing_zeros, haystack_len); // mask. zeroes on useless bits.
    __ notr(ch2, ch2);
    __ andr(match_mask, match_mask, ch2);
    __ andr(match_mask, match_mask, trailing_zeros); // clear useless bits and check
    __ beqz(match_mask, NOMATCH);

    // Try each candidate recorded in match_mask (last-word variant).
    __ bind(L_SMALL_HAS_ZERO_LOOP);
    // count bits of trailing zero chars
    __ ctzc_bits(trailing_zeros, match_mask, haystack_isL, ch2, tmp);
    __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15);
    __ mv(ch2, wordSize / haystack_chr_size);
    // Needle fits in one word: a single word compare decides the candidate.
    __ ble(needle_len, ch2, L_SMALL_CMP_LOOP_LAST_CMP2);
    __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL);
    // From here trailing_zeros is reused as the index of the next character to compare.
    __ mv(trailing_zeros, wordSize / haystack_chr_size);
    __ bne(ch1, ch2, L_SMALL_CMP_LOOP_NOMATCH);

    // First word matched; compare the remaining characters one at a time.
    // "first" is reused here as the current needle character.
    __ bind(L_SMALL_CMP_LOOP);
    __ shadd(first, trailing_zeros, needle, first, needle_chr_shift);
    __ shadd(ch2, trailing_zeros, haystack, ch2, haystack_chr_shift);
    needle_isL ? __ lbu(first, Address(first)) : __ lhu(first, Address(first));
    haystack_isL ? __ lbu(ch2, Address(ch2)) : __ lhu(ch2, Address(ch2));
    __ addi(trailing_zeros, trailing_zeros, 1);
    __ bge(trailing_zeros, needle_len, L_SMALL_CMP_LOOP_LAST_CMP);
    __ beq(first, ch2, L_SMALL_CMP_LOOP);

    // Candidate rejected: move on to the next one in match_mask, if any.
    __ bind(L_SMALL_CMP_LOOP_NOMATCH);
    __ beqz(match_mask, NOMATCH);
    // count bits of trailing zero chars
    __ ctzc_bits(trailing_zeros, match_mask, haystack_isL, tmp, ch2);
    __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15);
    __ addi(result, result, 1);
    __ addi(haystack, haystack, haystack_chr_size);
    __ j(L_SMALL_HAS_ZERO_LOOP);

    __ align(OptoLoopAlignment);
    __ bind(L_SMALL_CMP_LOOP_LAST_CMP);
    __ bne(first, ch2, L_SMALL_CMP_LOOP_NOMATCH);
    __ j(DONE);

    __ align(OptoLoopAlignment);
    __ bind(L_SMALL_CMP_LOOP_LAST_CMP2);
    __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL);
    __ bne(ch1, ch2, L_SMALL_CMP_LOOP_NOMATCH);
    __ j(DONE);

    // needle[0] was found somewhere in the current word of the main loop.
    __ align(OptoLoopAlignment);
    __ bind(L_HAS_ZERO);
    // count bits of trailing zero chars
    __ ctzc_bits(trailing_zeros, match_mask, haystack_isL, tmp, ch2);
    __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15);
    // Pack needle_len into the upper 32 bits of haystack_len so that the
    // needle_len register can be reused as a scratch register below; it is
    // unpacked again with srli by 32.
    __ slli(needle_len, needle_len, BitsPerByte * wordSize / 2);
    __ orr(haystack_len, haystack_len, needle_len); // restore needle_len(32bits)
    __ subi(result, result, 1); // array index from 0, so result -= 1

    __ bind(L_HAS_ZERO_LOOP);
    __ mv(needle_len, wordSize / haystack_chr_size);
    __ srli(ch2, haystack_len, BitsPerByte * wordSize / 2);
    // Needle fits in one word: a single word compare decides the candidate.
    __ bge(needle_len, ch2, L_CMP_LOOP_LAST_CMP2);
    // load next 8 bytes from haystack, and increase result index
    __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL);
    __ addi(result, result, 1);
    __ mv(trailing_zeros, wordSize / haystack_chr_size);
    __ bne(ch1, ch2, L_CMP_LOOP_NOMATCH);

    // compare one char
    // (needle_len is scratch here and holds the current needle character; the
    // real needle length is re-read from the upper half of haystack_len.)
    __ bind(L_CMP_LOOP);
    __ shadd(needle_len, trailing_zeros, needle, needle_len, needle_chr_shift);
    needle_isL ? __ lbu(needle_len, Address(needle_len)) : __ lhu(needle_len, Address(needle_len));
    __ shadd(ch2, trailing_zeros, haystack, ch2, haystack_chr_shift);
    haystack_isL ? __ lbu(ch2, Address(ch2)) : __ lhu(ch2, Address(ch2));
    __ addi(trailing_zeros, trailing_zeros, 1); // next char index
    __ srli(tmp, haystack_len, BitsPerByte * wordSize / 2);
    __ bge(trailing_zeros, tmp, L_CMP_LOOP_LAST_CMP);
    __ beq(needle_len, ch2, L_CMP_LOOP);

    // Candidate rejected: move on to the next one in match_mask, if any.
    __ bind(L_CMP_LOOP_NOMATCH);
    __ beqz(match_mask, L_HAS_ZERO_LOOP_NOMATCH);
    // count bits of trailing zero chars
    __ ctzc_bits(trailing_zeros, match_mask, haystack_isL, needle_len, ch2);
    __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15);
    __ addi(haystack, haystack, haystack_chr_size);
    __ j(L_HAS_ZERO_LOOP);

    __ align(OptoLoopAlignment);
    __ bind(L_CMP_LOOP_LAST_CMP);
    __ bne(needle_len, ch2, L_CMP_LOOP_NOMATCH);
    __ j(DONE);

    __ align(OptoLoopAlignment);
    __ bind(L_CMP_LOOP_LAST_CMP2);
    __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL);
    __ addi(result, result, 1);
    __ bne(ch1, ch2, L_CMP_LOOP_NOMATCH);
    __ j(DONE);

    __ align(OptoLoopAlignment);
    __ bind(L_HAS_ZERO_LOOP_NOMATCH);
    // 1) Restore "result" index. Index was wordSize/haystack_chr_size * N until
    // L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP,
    // so, result was increased at max by wordSize/haystack_chr_size - 1, so,
    // respective high bit wasn't changed. L_LOOP_PROCEED will increase
    // result by analyzed characters value, so, we can just reset lower bits
    // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL
    // 2) restore needle_len and haystack_len values from "compressed" haystack_len
    // 3) advance haystack value to represent next haystack octet. result & 7/3 is
    // index of last analyzed substring inside current octet. So, haystack is at
    // respective start address. We need to advance it to next octet
    __ andi(match_mask, result, wordSize / haystack_chr_size - 1);
    __ srli(needle_len, haystack_len, BitsPerByte * wordSize / 2);
    __ andi(result, result, haystack_isL ? -8 : -4);
    __ slli(tmp, match_mask, haystack_chr_shift);
    __ sub(haystack, haystack, tmp);
    // Drop the packed needle_len again, keeping the low 32 bits sign-extended.
    __ sext(haystack_len, haystack_len, 32);
    __ j(L_LOOP_PROCEED);

    __ align(OptoLoopAlignment);
    __ bind(NOMATCH);
    __ mv(result, -1);

    __ bind(DONE);
    __ pop_reg(spilled_regs, sp);
    __ ret();
    return entry;
  }
3657 
3658   void generate_string_indexof_stubs()
3659   {
3660     StubRoutines::riscv::_string_indexof_linear_ll = generate_string_indexof_linear(StubId::stubgen_string_indexof_linear_ll_id);
3661     StubRoutines::riscv::_string_indexof_linear_uu = generate_string_indexof_linear(StubId::stubgen_string_indexof_linear_uu_id);
3662     StubRoutines::riscv::_string_indexof_linear_ul = generate_string_indexof_linear(StubId::stubgen_string_indexof_linear_ul_id);
3663   }
3664 
3665 #ifdef COMPILER2
3666   void generate_lookup_secondary_supers_table_stub() {
3667     StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_id;
3668     StubCodeMark mark(this, stub_id);
3669 
3670     const Register
3671       r_super_klass  = x10,
3672       r_array_base   = x11,
3673       r_array_length = x12,
3674       r_array_index  = x13,
3675       r_sub_klass    = x14,
3676       result         = x15,
3677       r_bitmap       = x16;
3678 
3679     for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
3680       StubRoutines::_lookup_secondary_supers_table_stubs[slot] = __ pc();
3681       Label L_success;
3682       __ enter();
3683       __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass, result,
3684                                              r_array_base, r_array_length, r_array_index,
3685                                              r_bitmap, slot, /*stub_is_near*/true);
3686       __ leave();
3687       __ ret();
3688     }
3689   }
3690 
3691   // Slow path implementation for UseSecondarySupersTable.
3692   address generate_lookup_secondary_supers_table_slow_path_stub() {
3693     StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_slow_path_id;
3694     StubCodeMark mark(this, stub_id);
3695 
3696     address start = __ pc();
3697     const Register
3698       r_super_klass  = x10,        // argument
3699       r_array_base   = x11,        // argument
3700       temp1          = x12,        // tmp
3701       r_array_index  = x13,        // argument
3702       result         = x15,        // argument
3703       r_bitmap       = x16;        // argument
3704 
3705 
3706     __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, result, temp1);
3707     __ ret();
3708 
3709     return start;
3710   }
3711 
3712   address generate_mulAdd()
3713   {
3714     __ align(CodeEntryAlignment);
3715     StubId stub_id = StubId::stubgen_mulAdd_id;
3716     StubCodeMark mark(this, stub_id);
3717 
3718     address entry = __ pc();
3719 
3720     const Register out     = x10;
3721     const Register in      = x11;
3722     const Register offset  = x12;
3723     const Register len     = x13;
3724     const Register k       = x14;
3725     const Register tmp     = x28;
3726 
3727     BLOCK_COMMENT("Entry:");
3728     __ enter();
3729     __ mul_add(out, in, offset, len, k, tmp);
3730     __ leave();
3731     __ ret();
3732 
3733     return entry;
3734   }
3735 
3736   /**
3737    *  Arguments:
3738    *
3739    *  Input:
3740    *    c_rarg0   - x address
3741    *    c_rarg1   - x length
3742    *    c_rarg2   - y address
3743    *    c_rarg3   - y length
3744    *    c_rarg4   - z address
3745    */
3746   address generate_multiplyToLen()
3747   {
3748     __ align(CodeEntryAlignment);
3749     StubId stub_id = StubId::stubgen_multiplyToLen_id;
3750     StubCodeMark mark(this, stub_id);
3751     address entry = __ pc();
3752 
3753     const Register x     = x10;
3754     const Register xlen  = x11;
3755     const Register y     = x12;
3756     const Register ylen  = x13;
3757     const Register z     = x14;
3758 
3759     const Register tmp0  = x15;
3760     const Register tmp1  = x16;
3761     const Register tmp2  = x17;
3762     const Register tmp3  = x7;
3763     const Register tmp4  = x28;
3764     const Register tmp5  = x29;
3765     const Register tmp6  = x30;
3766     const Register tmp7  = x31;
3767 
3768     BLOCK_COMMENT("Entry:");
3769     __ enter(); // required for proper stackwalking of RuntimeStub frame
3770     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3771     __ leave(); // required for proper stackwalking of RuntimeStub frame
3772     __ ret();
3773 
3774     return entry;
3775   }
3776 
3777   address generate_squareToLen()
3778   {
3779     __ align(CodeEntryAlignment);
3780     StubId stub_id = StubId::stubgen_squareToLen_id;
3781     StubCodeMark mark(this, stub_id);
3782     address entry = __ pc();
3783 
3784     const Register x     = x10;
3785     const Register xlen  = x11;
3786     const Register z     = x12;
3787     const Register y     = x14; // == x
3788     const Register ylen  = x15; // == xlen
3789 
3790     const Register tmp0  = x13; // zlen, unused
3791     const Register tmp1  = x16;
3792     const Register tmp2  = x17;
3793     const Register tmp3  = x7;
3794     const Register tmp4  = x28;
3795     const Register tmp5  = x29;
3796     const Register tmp6  = x30;
3797     const Register tmp7  = x31;
3798 
3799     BLOCK_COMMENT("Entry:");
3800     __ enter();
3801     __ mv(y, x);
3802     __ mv(ylen, xlen);
3803     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3804     __ leave();
3805     __ ret();
3806 
3807     return entry;
3808   }
3809 
3810   // Arguments:
3811   //
3812   // Input:
3813   //   c_rarg0   - newArr address
3814   //   c_rarg1   - oldArr address
3815   //   c_rarg2   - newIdx
3816   //   c_rarg3   - shiftCount
3817   //   c_rarg4   - numIter
3818   //
3819   address generate_bigIntegerLeftShift() {
3820     __ align(CodeEntryAlignment);
3821     StubId stub_id = StubId::stubgen_bigIntegerLeftShiftWorker_id;
3822     StubCodeMark mark(this, stub_id);
3823     address entry = __ pc();
3824 
3825     Label loop, exit;
3826 
3827     Register newArr        = c_rarg0;
3828     Register oldArr        = c_rarg1;
3829     Register newIdx        = c_rarg2;
3830     Register shiftCount    = c_rarg3;
3831     Register numIter       = c_rarg4;
3832 
3833     Register shiftRevCount = c_rarg5;
3834     Register oldArrNext    = t1;
3835 
3836     __ beqz(numIter, exit);
3837     __ shadd(newArr, newIdx, newArr, t0, 2);
3838 
3839     __ mv(shiftRevCount, 32);
3840     __ sub(shiftRevCount, shiftRevCount, shiftCount);
3841 
3842     __ bind(loop);
3843     __ addi(oldArrNext, oldArr, 4);
3844     __ vsetvli(t0, numIter, Assembler::e32, Assembler::m4);
3845     __ vle32_v(v0, oldArr);
3846     __ vle32_v(v4, oldArrNext);
3847     __ vsll_vx(v0, v0, shiftCount);
3848     __ vsrl_vx(v4, v4, shiftRevCount);
3849     __ vor_vv(v0, v0, v4);
3850     __ vse32_v(v0, newArr);
3851     __ sub(numIter, numIter, t0);
3852     __ shadd(oldArr, t0, oldArr, t1, 2);
3853     __ shadd(newArr, t0, newArr, t1, 2);
3854     __ bnez(numIter, loop);
3855 
3856     __ bind(exit);
3857     __ ret();
3858 
3859     return entry;
3860   }
3861 
3862   // Arguments:
3863   //
3864   // Input:
3865   //   c_rarg0   - newArr address
3866   //   c_rarg1   - oldArr address
3867   //   c_rarg2   - newIdx
3868   //   c_rarg3   - shiftCount
3869   //   c_rarg4   - numIter
3870   //
3871   address generate_bigIntegerRightShift() {
3872     __ align(CodeEntryAlignment);
3873     StubId stub_id = StubId::stubgen_bigIntegerRightShiftWorker_id;
3874     StubCodeMark mark(this, stub_id);
3875     address entry = __ pc();
3876 
3877     Label loop, exit;
3878 
3879     Register newArr        = c_rarg0;
3880     Register oldArr        = c_rarg1;
3881     Register newIdx        = c_rarg2;
3882     Register shiftCount    = c_rarg3;
3883     Register numIter       = c_rarg4;
3884     Register idx           = numIter;
3885 
3886     Register shiftRevCount = c_rarg5;
3887     Register oldArrNext    = c_rarg6;
3888     Register newArrCur     = t0;
3889     Register oldArrCur     = t1;
3890 
3891     __ beqz(idx, exit);
3892     __ shadd(newArr, newIdx, newArr, t0, 2);
3893 
3894     __ mv(shiftRevCount, 32);
3895     __ sub(shiftRevCount, shiftRevCount, shiftCount);
3896 
3897     __ bind(loop);
3898     __ vsetvli(t0, idx, Assembler::e32, Assembler::m4);
3899     __ sub(idx, idx, t0);
3900     __ shadd(oldArrNext, idx, oldArr, t1, 2);
3901     __ shadd(newArrCur, idx, newArr, t1, 2);
3902     __ addi(oldArrCur, oldArrNext, 4);
3903     __ vle32_v(v0, oldArrCur);
3904     __ vle32_v(v4, oldArrNext);
3905     __ vsrl_vx(v0, v0, shiftCount);
3906     __ vsll_vx(v4, v4, shiftRevCount);
3907     __ vor_vv(v0, v0, v4);
3908     __ vse32_v(v0, newArrCur);
3909     __ bnez(idx, loop);
3910 
3911     __ bind(exit);
3912     __ ret();
3913 
3914     return entry;
3915   }
3916 #endif
3917 
3918 #ifdef COMPILER2
3919   class MontgomeryMultiplyGenerator : public MacroAssembler {
3920 
3921     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
3922       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2, Ri, Rj;
3923 
3924     RegSet _toSave;
3925     bool _squaring;
3926 
3927   public:
3928     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
3929       : MacroAssembler(as->code()), _squaring(squaring) {
3930 
3931       // Register allocation
3932 
3933       RegSetIterator<Register> regs = RegSet::range(x10, x26).begin();
3934       Pa_base = *regs;       // Argument registers
3935       if (squaring) {
3936         Pb_base = Pa_base;
3937       } else {
3938         Pb_base = *++regs;
3939       }
3940       Pn_base = *++regs;
3941       Rlen= *++regs;
3942       inv = *++regs;
3943       Pm_base = *++regs;
3944 
3945                         // Working registers:
3946       Ra =  *++regs;    // The current digit of a, b, n, and m.
3947       Rb =  *++regs;
3948       Rm =  *++regs;
3949       Rn =  *++regs;
3950 
3951       Pa =  *++regs;      // Pointers to the current/next digit of a, b, n, and m.
3952       Pb =  *++regs;
3953       Pm =  *++regs;
3954       Pn =  *++regs;
3955 
3956       tmp0 =  *++regs;    // Three registers which form a
3957       tmp1 =  *++regs;    // triple-precision accumuator.
3958       tmp2 =  *++regs;
3959 
3960       Ri =  x6;         // Inner and outer loop indexes.
3961       Rj =  x7;
3962 
3963       Rhi_ab = x28;     // Product registers: low and high parts
3964       Rlo_ab = x29;     // of a*b and m*n.
3965       Rhi_mn = x30;
3966       Rlo_mn = x31;
3967 
3968       // x18 and up are callee-saved.
3969       _toSave = RegSet::range(x18, *regs) + Pm_base;
3970     }
3971 
3972   private:
3973     void save_regs() {
3974       push_reg(_toSave, sp);
3975     }
3976 
3977     void restore_regs() {
3978       pop_reg(_toSave, sp);
3979     }
3980 
3981     template <typename T>
3982     void unroll_2(Register count, T block) {
3983       Label loop, end, odd;
3984       beqz(count, end);
3985       test_bit(t0, count, 0);
3986       bnez(t0, odd);
3987       align(16);
3988       bind(loop);
3989       (this->*block)();
3990       bind(odd);
3991       (this->*block)();
3992       subi(count, count, 2);
3993       bgtz(count, loop);
3994       bind(end);
3995     }
3996 
3997     template <typename T>
3998     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
3999       Label loop, end, odd;
4000       beqz(count, end);
4001       test_bit(tmp, count, 0);
4002       bnez(tmp, odd);
4003       align(16);
4004       bind(loop);
4005       (this->*block)(d, s, tmp);
4006       bind(odd);
4007       (this->*block)(d, s, tmp);
4008       subi(count, count, 2);
4009       bgtz(count, loop);
4010       bind(end);
4011     }
4012 
4013     void pre1(RegisterOrConstant i) {
4014       block_comment("pre1");
4015       // Pa = Pa_base;
4016       // Pb = Pb_base + i;
4017       // Pm = Pm_base;
4018       // Pn = Pn_base + i;
4019       // Ra = *Pa;
4020       // Rb = *Pb;
4021       // Rm = *Pm;
4022       // Rn = *Pn;
4023       if (i.is_register()) {
4024         slli(t0, i.as_register(), LogBytesPerWord);
4025       } else {
4026         mv(t0, i.as_constant());
4027         slli(t0, t0, LogBytesPerWord);
4028       }
4029 
4030       mv(Pa, Pa_base);
4031       add(Pb, Pb_base, t0);
4032       mv(Pm, Pm_base);
4033       add(Pn, Pn_base, t0);
4034 
4035       ld(Ra, Address(Pa));
4036       ld(Rb, Address(Pb));
4037       ld(Rm, Address(Pm));
4038       ld(Rn, Address(Pn));
4039 
4040       // Zero the m*n result.
4041       mv(Rhi_mn, zr);
4042       mv(Rlo_mn, zr);
4043     }
4044 
    // The core multiply-accumulate step of a Montgomery
    // multiplication.  The idea is to schedule operations as a
    // pipeline so that instructions with long latencies (loads and
    // multiplies) have time to complete before their results are
    // used.  This most benefits in-order implementations of the
    // architecture but out-of-order ones also benefit.
    //
    // Each call multiplies the current Ra*Rb and Rm*Rn digits, advances Pa/Pm
    // by one word and moves Pb/Pn back by one word while loading the next
    // digits, and folds products into the tmp0/tmp1/tmp2 accumulator. The m*n
    // product is accumulated one call late: this call adds the product left in
    // Rhi_mn:Rlo_mn by the previous call and leaves a new one pending.
    void step() {
      block_comment("step");
      // MACC(Ra, Rb, tmp0, tmp1, tmp2);
      // Ra = *++Pa;
      // Rb = *--Pb;
      mulhu(Rhi_ab, Ra, Rb);
      mul(Rlo_ab, Ra, Rb);
      addi(Pa, Pa, wordSize);
      ld(Ra, Address(Pa));
      subi(Pb, Pb, wordSize);
      ld(Rb, Address(Pb));
      acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2); // The pending m*n from the
                                            // previous iteration.
      // MACC(Rm, Rn, tmp0, tmp1, tmp2);
      // Rm = *++Pm;
      // Rn = *--Pn;
      mulhu(Rhi_mn, Rm, Rn);
      mul(Rlo_mn, Rm, Rn);
      addi(Pm, Pm, wordSize);
      ld(Rm, Address(Pm));
      subi(Pn, Pn, wordSize);
      ld(Rn, Address(Pn));
      // Fold in this call's a*b product; the new m*n product stays pending.
      acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2);
    }
4075 
    // Finishes one column after the step() calls: accumulates the last a*b
    // product and the pending m*n product, computes Rm = tmp0 * inv and stores
    // it through Pm, then adds Rm*Rn and shifts the accumulator down one word
    // (tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0). Clobbers t0.
    void post1() {
      block_comment("post1");

      // MACC(Ra, Rb, tmp0, tmp1, tmp2);
      // Ra = *++Pa;
      // Rb = *--Pb;
      mulhu(Rhi_ab, Ra, Rb);
      mul(Rlo_ab, Ra, Rb);
      acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2);  // The pending m*n
      acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2);

      // *Pm = Rm = tmp0 * inv;
      mul(Rm, tmp0, inv);
      sd(Rm, Address(Pm));

      // MACC(Rm, Rn, tmp0, tmp1, tmp2);
      // tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0;
      mulhu(Rhi_mn, Rm, Rn);

#ifndef PRODUCT
      // Non-product builds only: verify the invariant the shortcut below relies on.
      // assert(m[i] * n[0] + tmp0 == 0, "broken Montgomery multiply");
      {
        mul(Rlo_mn, Rm, Rn);
        add(Rlo_mn, tmp0, Rlo_mn);
        Label ok;
        beqz(Rlo_mn, ok);
        stop("broken Montgomery multiply");
        bind(ok);
      }
#endif
      // We have very carefully set things up so that
      // m[i]*n[0] + tmp0 == 0 (mod b), so we don't have to calculate
      // the lower half of Rm * Rn because we know the result already:
      // it must be -tmp0.  tmp0 + (-tmp0) must generate a carry iff
      // tmp0 != 0.  So, rather than do a mul and a cad we just set
      // the carry flag iff tmp0 is nonzero.
      //
      // mul(Rlo_mn, Rm, Rn);
      // cad(zr, tmp0, Rlo_mn);
      // t0 = (tmp0 - 1 <u tmp0), which is 1 exactly when tmp0 != 0.
      subi(t0, tmp0, 1);
      sltu(t0, t0, tmp0); // Set carry iff tmp0 is nonzero
      // Add the high half of m*n with the carry while shifting the accumulator down.
      cadc(tmp0, tmp1, Rhi_mn, t0);
      adc(tmp1, tmp2, zr, t0);
      mv(tmp2, zr);
    }
4121 
    // Emits the set-up for one second-phase outer-loop iteration.
    //   i   - register holding the outer loop index
    //   len - register holding the operand length in 64-bit words
    // Points Pa/Pm at element (i - len) of their arrays and Pb/Pn at
    // element len of theirs, then pre-increments Pa/Pm and pre-decrements
    // Pb/Pn by one word and loads Ra, Rb, Rm and Rn from the new positions.
    // The pending m*n product (Rhi_mn:Rlo_mn) is cleared.
    // Clobbers Rj (left holding i - len).
    void pre2(Register i, Register len) {
      block_comment("pre2");
      // Pa = Pa_base + i-len;
      // Pb = Pb_base + len;
      // Pm = Pm_base + i-len;
      // Pn = Pn_base + len;

      sub(Rj, i, len);
      // Rj == i-len

      // Ra as temp register
      // First the byte offset of element (i - len), then of element len.
      slli(Ra, Rj, LogBytesPerWord);
      add(Pa, Pa_base, Ra);
      add(Pm, Pm_base, Ra);
      slli(Ra, len, LogBytesPerWord);
      add(Pb, Pb_base, Ra);
      add(Pn, Pn_base, Ra);

      // Ra = *++Pa;
      // Rb = *--Pb;
      // Rm = *++Pm;
      // Rn = *--Pn;
      addi(Pa, Pa, wordSize);
      ld(Ra, Address(Pa));
      subi(Pb, Pb, wordSize);
      ld(Rb, Address(Pb));
      addi(Pm, Pm, wordSize);
      ld(Rm, Address(Pm));
      subi(Pn, Pn, wordSize);
      ld(Rn, Address(Pn));

      // No m*n product is pending at the start of the inner loop.
      mv(Rhi_mn, zr);
      mv(Rlo_mn, zr);
    }
4156 
    // Emits the tail of one second-phase outer-loop iteration.
    //   i   - register holding the outer loop index
    //   len - register holding the operand length in 64-bit words
    // Adds the low half of the pending m*n product into tmp0, stores tmp0
    // to Pm_base[i - len], then adds the high half while moving the
    // accumulator down one word (tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0).
    // Clobbers Rj and t0.
    void post2(Register i, Register len) {
      block_comment("post2");
      sub(Rj, i, len);

      cad(tmp0, tmp0, Rlo_mn, t0); // The pending m*n, low part

      // As soon as we know the least significant digit of our result,
      // store it.
      // Pm_base[i-len] = tmp0;
      // Rj as temp register
      slli(Rj, Rj, LogBytesPerWord);
      add(Rj, Pm_base, Rj);
      sd(tmp0, Address(Rj));

      // tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0;
      // t0 still holds the carry from the low-part addition above; the
      // store sequence in between does not write t0.
      cadc(tmp0, tmp1, Rhi_mn, t0); // The pending m*n, high part
      adc(tmp1, tmp2, zr, t0);
      mv(tmp2, zr);
    }
4176 
    // A carry in tmp0 after Montgomery multiplication means that we
    // should subtract multiples of n from our result in m.  We'll
    // keep doing that until there is no carry.
    //
    //   len - register holding the operand length in 64-bit words
    //
    // The emitted code does nothing when tmp0 is zero on entry.  Otherwise
    // each pass of the outer loop ('again') subtracts the array at Pn_base
    // from the array at Pm_base in place, word by word, and then updates
    // tmp0 with the resulting borrow.  tmp1, tmp2, Rm, Rn and t0 are used
    // as scratch.
    void normalize(Register len) {
      block_comment("normalize");
      // while (tmp0)
      //   tmp0 = sub(Pm_base, Pn_base, tmp0, len);
      Label loop, post, again;
      Register cnt = tmp1, i = tmp2; // Re-use registers; we're done with them now
      beqz(tmp0, post); {
        bind(again); {
          mv(i, zr);
          mv(cnt, len);
          // Preload element 0 of both arrays (i is zero here).
          slli(Rn, i, LogBytesPerWord);
          add(Rm, Pm_base, Rn);
          ld(Rm, Address(Rm));
          add(Rn, Pn_base, Rn);
          ld(Rn, Address(Rn));
          mv(t0, 1); // set carry flag, i.e. no borrow
          align(16);
          bind(loop); {
            // Subtract with borrow written as an addition:
            //   Rm = Rm + carry + ~Rn
            // then derive the new carry in t0 by comparing the sum with ~Rn.
            notr(Rn, Rn);
            add(Rm, Rm, t0);
            add(Rm, Rm, Rn);
            sltu(t0, Rm, Rn);
            // Pm_base[i] = Rm
            slli(Rn, i, LogBytesPerWord); // Rn as temp register
            add(Rn, Pm_base, Rn);
            sd(Rm, Address(Rn));
            // Advance to the next element and preload it from both arrays.
            addi(i, i, 1);
            slli(Rn, i, LogBytesPerWord);
            add(Rm, Pm_base, Rn);
            ld(Rm, Address(Rm));
            add(Rn, Pn_base, Rn);
            ld(Rn, Address(Rn));
            subi(cnt, cnt, 1);
          } bnez(cnt, loop);
          // tmp0 = tmp0 - 1 + carry, i.e. tmp0 drops by one when the
          // subtraction above ended with a borrow (t0 == 0).
          subi(tmp0, tmp0, 1);
          add(tmp0, tmp0, t0);
        } bnez(tmp0, again);
      } bind(post);
    }
4218 
4219     // Move memory at s to d, reversing words.
4220     //    Increments d to end of copied memory
4221     //    Destroys tmp1, tmp2
4222     //    Preserves len
4223     //    Leaves s pointing to the address which was in d at start
4224     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
4225       assert(tmp1->encoding() < x28->encoding(), "register corruption");
4226       assert(tmp2->encoding() < x28->encoding(), "register corruption");
4227 
4228       shadd(s, len, s, tmp1, LogBytesPerWord);
4229       mv(tmp1, len);
4230       unroll_2(tmp1,  &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
4231       slli(tmp1, len, LogBytesPerWord);
4232       sub(s, d, tmp1);
4233     }
    // [63...0] -> [31...0][63...32]
    // Copies one 64-bit word for reverse(): pre-decrements s by a word,
    // loads the word from s, rotates it by 32 bits (swapping its two 32-bit
    // halves), stores it at d and post-increments d by a word.
    //    tmp - scratch register that receives the word; t0 is passed to
    //          ror as its temporary.
    void reverse1(Register d, Register s, Register tmp) {
      subi(s, s, wordSize);
      ld(tmp, Address(s));
      ror(tmp, tmp, 32, t0);
      sd(tmp, Address(d));
      addi(d, d, wordSize);
    }
4242 
    // Squaring variant of step(): emits a normal step and then accumulates
    // the a*b product (still held in Rhi_ab:Rlo_ab) a second time, so that
    // product is counted twice.
    void step_squaring() {
      // An extra ACC
      step();
      acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2);
    }
4248 
4249     void last_squaring(Register i) {
4250       Label dont;
4251       // if ((i & 1) == 0) {
4252       test_bit(t0, i, 0);
4253       bnez(t0, dont); {
4254         // MACC(Ra, Rb, tmp0, tmp1, tmp2);
4255         // Ra = *++Pa;
4256         // Rb = *--Pb;
4257         mulhu(Rhi_ab, Ra, Rb);
4258         mul(Rlo_ab, Ra, Rb);
4259         acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2);
4260       } bind(dont);
4261     }
4262 
    // The m*n half of a step, used by the squaring stub: accumulates the
    // pending m*n product, starts the next Rm * Rn product (left pending
    // in Rhi_mn:Rlo_mn), then bumps Pm up and Pn down by one word and
    // reloads Rm and Rn.  No a*b work is emitted here.
    void extra_step_squaring() {
      acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2);  // The pending m*n

      // MACC(Rm, Rn, tmp0, tmp1, tmp2);
      // Rm = *++Pm;
      // Rn = *--Pn;
      mulhu(Rhi_mn, Rm, Rn);
      mul(Rlo_mn, Rm, Rn);
      addi(Pm, Pm, wordSize);
      ld(Rm, Address(Pm));
      subi(Pn, Pn, wordSize);
      ld(Rn, Address(Pn));
    }
4276 
    // Squaring counterpart of post1(): the a*b products have already been
    // accumulated by the caller's loops, so only the pending m*n product is
    // folded in here.  Then Rm = tmp0 * inv is computed and stored at *Pm,
    // and Rm * Rn is added while the accumulator moves down one word
    // (tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0).
    void post1_squaring() {
      acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2);  // The pending m*n

      // *Pm = Rm = tmp0 * inv;
      mul(Rm, tmp0, inv);
      sd(Rm, Address(Pm));

      // MACC(Rm, Rn, tmp0, tmp1, tmp2);
      // tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0;
      mulhu(Rhi_mn, Rm, Rn);

#ifndef PRODUCT
      // assert(m[i] * n[0] + tmp0 == 0, "broken Montgomery multiply");
      // Debug-only check: the low half of Rm * Rn must cancel tmp0.
      {
        mul(Rlo_mn, Rm, Rn);
        add(Rlo_mn, tmp0, Rlo_mn);
        Label ok;
        beqz(Rlo_mn, ok); {
          stop("broken Montgomery multiply");
        } bind(ok);
      }
#endif
      // We have very carefully set things up so that
      // m[i]*n[0] + tmp0 == 0 (mod b), so we don't have to calculate
      // the lower half of Rm * Rn because we know the result already:
      // it must be -tmp0.  tmp0 + (-tmp0) must generate a carry iff
      // tmp0 != 0.  So, rather than do a mul and a cad we just set
      // the carry flag iff tmp0 is nonzero.
      //
      // mul(Rlo_mn, Rm, Rn);
      // cad(zr, tmp0, Rlo_mn);
      subi(t0, tmp0, 1);
      sltu(t0, t0, tmp0); // Set carry iff tmp0 is nonzero
      // Add the high half of Rm * Rn plus that carry while moving the
      // accumulator down one word; t0 carries between the two additions.
      cadc(tmp0, tmp1, Rhi_mn, t0);
      adc(tmp1, tmp2, zr, t0);
      mv(tmp2, zr);
    }
4314 
    // Accumulate the two-word value Rhi:Rlo into the three-word
    // accumulator tmp2:tmp1:tmp0.
    // use t0 as carry
    //   Rlo is added into tmp0, Rhi (plus carry) into tmp1, and the final
    //   carry into tmp2.  t0 is clobbered.
    void acc(Register Rhi, Register Rlo,
             Register tmp0, Register tmp1, Register tmp2) {
      cad(tmp0, tmp0, Rlo, t0);
      cadc(tmp1, tmp1, Rhi, t0);
      adc(tmp2, tmp2, zr, t0);
    }
4322 
4323   public:
    /**
     * Fast Montgomery multiplication.  The derivation of the
     * algorithm is in A Cryptographic Library for the Motorola
     * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
     *
     * Arguments:
     *
     * Inputs for multiplication:
     *   c_rarg0   - int array elements a
     *   c_rarg1   - int array elements b
     *   c_rarg2   - int array elements n (the modulus)
     *   c_rarg3   - int length
     *   c_rarg4   - int inv
     *   c_rarg5   - int array elements m (the result)
     *
     * Inputs for squaring:
     *   c_rarg0   - int array elements a
     *   c_rarg1   - int array elements n (the modulus)
     *   c_rarg2   - int length
     *   c_rarg3   - int inv
     *   c_rarg4   - int array elements m (the result)
     *
     * Shape of the generated stub:
     *   1. Return immediately if the length is zero; stop if it exceeds
     *      512 ints.
     *   2. Reserve scratch space on the stack and copy the inputs into it
     *      with reverse(), working in 64-bit words from then on.
     *   3. Run the two outer loops (i in [0, len) and i in [len, 2*len)),
     *      each with an inner loop of step() calls.
     *   4. normalize() the result and copy it back into the caller's m
     *      array with reverse().
     *
     * Returns the entry address of the stub (the trap code for an
     * over-long input is emitted before it).
     */
    address generate_multiply() {
      Label argh, nothing;
      // Out-of-line failure path, placed ahead of the stub entry.
      bind(argh);
      stop("MontgomeryMultiply total_allocation must be <= 8192");

      align(CodeEntryAlignment);
      address entry = pc();

      // Nothing to do for a zero length.
      beqz(Rlen, nothing);

      enter();

      // Make room.
      // Reject lengths above 512 ints, then reserve Rlen * 16 bytes below
      // sp and keep sp aligned to two words.
      mv(Ra, 512);
      bgt(Rlen, Ra, argh);
      slli(Ra, Rlen, exact_log2(4 * sizeof(jint)));
      sub(Ra, sp, Ra);
      andi(sp, Ra, -2 * wordSize);

      srliw(Rlen, Rlen, 1);  // length in longwords = len/2

      {
        // Copy input args, reversing as we go.  We use Ra as a
        // temporary variable.
        // Each reverse() leaves its source register pointing at the copy
        // and advances Ra past it.
        reverse(Ra, Pa_base, Rlen, Ri, Rj);
        if (!_squaring)
          reverse(Ra, Pb_base, Rlen, Ri, Rj);
        reverse(Ra, Pn_base, Rlen, Ri, Rj);
      }

      // Push all call-saved registers and also Pm_base which we'll need
      // at the end.
      save_regs();

#ifndef PRODUCT
      // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
      {
        ld(Rn, Address(Pn_base));
        mul(Rlo_mn, Rn, inv);
        mv(t0, -1);
        Label ok;
        beq(Rlo_mn, t0, ok);
        stop("broken inverse in Montgomery multiply");
        bind(ok);
      }
#endif

      // The working copy of m lives in the scratch area right after the
      // copied inputs.
      mv(Pm_base, Ra);

      // Clear the three-word accumulator.
      mv(tmp0, zr);
      mv(tmp1, zr);
      mv(tmp2, zr);

      block_comment("for (int i = 0; i < len; i++) {");
      mv(Ri, zr); {
        Label loop, end;
        bge(Ri, Rlen, end);

        bind(loop);
        pre1(Ri);

        block_comment("  for (j = i; j; j--) {"); {
          mv(Rj, Ri);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
        } block_comment("  } // j");

        post1();
        addiw(Ri, Ri, 1);
        blt(Ri, Rlen, loop);
        bind(end);
        block_comment("} // i");
      }

      block_comment("for (int i = len; i < 2*len; i++) {");
      mv(Ri, Rlen); {
        Label loop, end;
        // t0 = 2 * len, the loop bound.
        slli(t0, Rlen, 1);
        bge(Ri, t0, end);

        bind(loop);
        pre2(Ri, Rlen);

        block_comment("  for (j = len*2-i-1; j; j--) {"); {
          slliw(Rj, Rlen, 1);
          subw(Rj, Rj, Ri);
          subiw(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
        } block_comment("  } // j");

        post2(Ri, Rlen);
        addiw(Ri, Ri, 1);
        // Recompute the bound: t0 is used as the carry register by the
        // code emitted above.
        slli(t0, Rlen, 1);
        blt(Ri, t0, loop);
        bind(end);
      }
      block_comment("} // i");

      normalize(Rlen);

      mv(Ra, Pm_base);  // Save Pm_base in Ra
      restore_regs();  // Restore caller's Pm_base

      // Copy our result into caller's Pm_base
      reverse(Pm_base, Ra, Rlen, Ri, Rj);

      leave();
      bind(nothing);
      ret();

      return entry;
    }
4458 
    /**
     * Montgomery squaring.  Same overall structure as generate_multiply,
     * but each outer-loop iteration runs step_squaring() for half as many
     * a*b products (each counted twice), last_squaring() for the one
     * extra product when i is even, and extra_step_squaring() for the
     * remaining m*n products.
     *
     * Arguments:
     *
     * Inputs:
     *   c_rarg0   - int array elements a
     *   c_rarg1   - int array elements n (the modulus)
     *   c_rarg2   - int length
     *   c_rarg3   - int inv
     *   c_rarg4   - int array elements m (the result)
     *
     * Returns the entry address of the stub (the trap code for an
     * over-long input is emitted before it).
     *
     * NOTE(review): unlike generate_multiply, there is no early exit for
     * a zero length here.
     */
    address generate_square() {
      Label argh;
      // Out-of-line failure path, placed ahead of the stub entry.
      bind(argh);
      stop("MontgomeryMultiply total_allocation must be <= 8192");

      align(CodeEntryAlignment);
      address entry = pc();

      enter();

      // Make room.
      // Reject lengths above 512 ints, then reserve Rlen * 16 bytes below
      // sp and keep sp aligned to two words.
      mv(Ra, 512);
      bgt(Rlen, Ra, argh);
      slli(Ra, Rlen, exact_log2(4 * sizeof(jint)));
      sub(Ra, sp, Ra);
      andi(sp, Ra, -2 * wordSize);

      srliw(Rlen, Rlen, 1);  // length in longwords = len/2

      {
        // Copy input args, reversing as we go.  We use Ra as a
        // temporary variable.
        reverse(Ra, Pa_base, Rlen, Ri, Rj);
        reverse(Ra, Pn_base, Rlen, Ri, Rj);
      }

      // Push all call-saved registers and also Pm_base which we'll need
      // at the end.
      save_regs();

      // The working copy of m lives in the scratch area right after the
      // copied inputs.
      mv(Pm_base, Ra);

      // Clear the three-word accumulator.
      mv(tmp0, zr);
      mv(tmp1, zr);
      mv(tmp2, zr);

      block_comment("for (int i = 0; i < len; i++) {");
      mv(Ri, zr); {
        Label loop, end;
        bind(loop);
        bge(Ri, Rlen, end);

        pre1(Ri);

        block_comment("for (j = (i+1)/2; j; j--) {"); {
          addi(Rj, Ri, 1);
          srliw(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
        } block_comment("  } // j");

        last_squaring(Ri);

        block_comment("  for (j = i/2; j; j--) {"); {
          srliw(Rj, Ri, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
        } block_comment("  } // j");

        post1_squaring();
        addi(Ri, Ri, 1);
        blt(Ri, Rlen, loop);

        bind(end);
        block_comment("} // i");
      }

      block_comment("for (int i = len; i < 2*len; i++) {");
      mv(Ri, Rlen); {
        Label loop, end;
        bind(loop);
        // t0 = 2 * len, the loop bound.
        slli(t0, Rlen, 1);
        bge(Ri, t0, end);

        pre2(Ri, Rlen);

        block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
          slli(Rj, Rlen, 1);
          sub(Rj, Rj, Ri);
          subi(Rj, Rj, 1);
          srliw(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
        } block_comment("  } // j");

        last_squaring(Ri);

        block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
          slli(Rj, Rlen, 1);
          sub(Rj, Rj, Ri);
          srliw(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
        } block_comment("  } // j");

        post2(Ri, Rlen);
        addi(Ri, Ri, 1);
        // Recompute the bound: t0 is used as the carry register by the
        // code emitted above.
        slli(t0, Rlen, 1);
        blt(Ri, t0, loop);

        bind(end);
        block_comment("} // i");
      }

      normalize(Rlen);

      mv(Ra, Pm_base);  // Save Pm_base in Ra
      restore_regs();  // Restore caller's Pm_base

      // Copy our result into caller's Pm_base
      reverse(Pm_base, Ra, Rlen, Ri, Rj);

      leave();
      ret();

      return entry;
    }
4584   };
4585 
4586 #endif // COMPILER2
4587 
  // Emits the shared body of the continuation thaw stubs.
  //   kind - selects the variant.  The return-barrier variants first reload
  //          sp from the thread's cont_entry and preserve x10/f10 across
  //          the two runtime calls; the return-barrier-exception variant
  //          ends by jumping to the exception handler looked up for the
  //          thawed frame's return address instead of returning.
  // Returns the address of the first emitted instruction.
  address generate_cont_thaw(Continuation::thaw_kind kind) {
    bool return_barrier = Continuation::is_thaw_return_barrier(kind);
    bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);

    address start = __ pc();

    if (return_barrier) {
      __ ld(sp, Address(xthread, JavaThread::cont_entry_offset()));
    }

#ifndef PRODUCT
    // Debug-only check that sp equals the thread's cont_entry.
    {
      Label OK;
      __ ld(t0, Address(xthread, JavaThread::cont_entry_offset()));
      __ beq(sp, t0, OK);
      __ stop("incorrect sp");
      __ bind(OK);
    }
#endif

    if (return_barrier) {
      // preserve possible return value from a method returning to the return barrier
      __ subi(sp, sp, 2 * wordSize);
      __ fsd(f10, Address(sp, 0 * wordSize));
      __ sd(x10, Address(sp, 1 * wordSize));
    }

    __ mv(c_rarg1, (return_barrier ? 1 : 0));
    __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), xthread, c_rarg1);
    __ mv(t1, x10); // x10 contains the size of the frames to thaw, 0 if overflow or no more frames

    if (return_barrier) {
      // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
      __ ld(x10, Address(sp, 1 * wordSize));
      __ fld(f10, Address(sp, 0 * wordSize));
      __ addi(sp, sp, 2 * wordSize);
    }

#ifndef PRODUCT
    // Debug-only check that sp is back at the thread's cont_entry.
    {
      Label OK;
      __ ld(t0, Address(xthread, JavaThread::cont_entry_offset()));
      __ beq(sp, t0, OK);
      __ stop("incorrect sp");
      __ bind(OK);
    }
#endif

    Label thaw_success;
    // t1 contains the size of the frames to thaw, 0 if overflow or no more frames
    __ bnez(t1, thaw_success);
    __ j(RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry()));
    __ bind(thaw_success);

    // make room for the thawed frames
    __ sub(t0, sp, t1);
    __ andi(sp, t0, -16); // align

    if (return_barrier) {
      // save original return value -- again
      __ subi(sp, sp, 2 * wordSize);
      __ fsd(f10, Address(sp, 0 * wordSize));
      __ sd(x10, Address(sp, 1 * wordSize));
    }

    // If we want, we can templatize thaw by kind, and have three different entries
    __ mv(c_rarg1, kind);

    __ call_VM_leaf(Continuation::thaw_entry(), xthread, c_rarg1);
    __ mv(t1, x10); // x10 is the sp of the yielding frame

    if (return_barrier) {
      // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
      __ ld(x10, Address(sp, 1 * wordSize));
      __ fld(f10, Address(sp, 0 * wordSize));
      __ addi(sp, sp, 2 * wordSize);
    } else {
      __ mv(x10, zr); // return 0 (success) from doYield
    }

    // we're now on the yield frame (which is in an address above us b/c sp has been pushed down)
    __ mv(fp, t1);
    __ subi(sp, t1, 2 * wordSize); // now pointing to fp spill

    if (return_barrier_exception) {
      __ ld(c_rarg1, Address(fp, -1 * wordSize)); // return address
      __ verify_oop(x10);
      __ mv(x9, x10); // save return value containing the exception oop in callee-saved x9

      __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), xthread, c_rarg1);

      // see OptoRuntime::generate_exception_blob: x10 -- exception oop, x13 -- exception pc

      __ mv(x11, x10); // the exception handler
      __ mv(x10, x9); // restore return value containing the exception oop
      __ verify_oop(x10);

      __ leave();
      __ mv(x13, ra);
      __ jr(x11); // the exception handler
    } else {
      // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
      __ leave();
      __ ret();
    }

    return start;
  }
4696 
4697   address generate_cont_thaw() {
4698     if (!Continuations::enabled()) return nullptr;
4699 
4700     StubId stub_id = StubId::stubgen_cont_thaw_id;
4701     StubCodeMark mark(this, stub_id);
4702     address start = __ pc();
4703     generate_cont_thaw(Continuation::thaw_top);
4704     return start;
4705   }
4706 
4707   address generate_cont_returnBarrier() {
4708     if (!Continuations::enabled()) return nullptr;
4709 
4710     // TODO: will probably need multiple return barriers depending on return type
4711     StubId stub_id = StubId::stubgen_cont_returnBarrier_id;
4712     StubCodeMark mark(this, stub_id);
4713     address start = __ pc();
4714 
4715     generate_cont_thaw(Continuation::thaw_return_barrier);
4716 
4717     return start;
4718   }
4719 
4720   address generate_cont_returnBarrier_exception() {
4721     if (!Continuations::enabled()) return nullptr;
4722 
4723     StubId stub_id = StubId::stubgen_cont_returnBarrierExc_id;
4724     StubCodeMark mark(this, stub_id);
4725     address start = __ pc();
4726 
4727     generate_cont_thaw(Continuation::thaw_return_barrier_exception);
4728 
4729     return start;
4730   }
4731 
4732   address generate_cont_preempt_stub() {
4733     if (!Continuations::enabled()) return nullptr;
4734     StubId stub_id = StubId::stubgen_cont_preempt_id;
4735     StubCodeMark mark(this, stub_id);
4736     address start = __ pc();
4737 
4738     __ reset_last_Java_frame(true);
4739 
4740     // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap.
4741     __ ld(sp, Address(xthread, JavaThread::cont_entry_offset()));
4742 
4743     Label preemption_cancelled;
4744     __ lbu(t0, Address(xthread, JavaThread::preemption_cancelled_offset()));
4745     __ bnez(t0, preemption_cancelled);
4746 
4747     // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount.
4748     SharedRuntime::continuation_enter_cleanup(_masm);
4749     __ leave();
4750     __ ret();
4751 
4752     // We acquired the monitor after freezing the frames so call thaw to continue execution.
4753     __ bind(preemption_cancelled);
4754     __ sb(zr, Address(xthread, JavaThread::preemption_cancelled_offset()));
4755     __ la(fp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size() + 2 * wordSize)));
4756     __ la(t1, ExternalAddress(ContinuationEntry::thaw_call_pc_address()));
4757     __ ld(t1, Address(t1));
4758     __ jr(t1);
4759 
4760     return start;
4761   }
4762 
4763 #ifdef COMPILER2
4764 
4765 #undef __
4766 #define __ this->
4767 
4768   class Sha2Generator : public MacroAssembler {
4769     StubCodeGenerator* _cgen;
4770    public:
4771       Sha2Generator(MacroAssembler* masm, StubCodeGenerator* cgen) : MacroAssembler(masm->code()), _cgen(cgen) {}
4772       address generate_sha256_implCompress(StubId stub_id) {
4773         return generate_sha2_implCompress(Assembler::e32, stub_id);
4774       }
4775       address generate_sha512_implCompress(StubId stub_id) {
4776         return generate_sha2_implCompress(Assembler::e64, stub_id);
4777       }
4778    private:
4779 
4780     void vleXX_v(Assembler::SEW vset_sew, VectorRegister vr, Register sr) {
4781       if (vset_sew == Assembler::e32) __ vle32_v(vr, sr);
4782       else                            __ vle64_v(vr, sr);
4783     }
4784 
4785     void vseXX_v(Assembler::SEW vset_sew, VectorRegister vr, Register sr) {
4786       if (vset_sew == Assembler::e32) __ vse32_v(vr, sr);
4787       else                            __ vse64_v(vr, sr);
4788     }
4789 
4790     // Overview of the logic in each "quad round".
4791     //
    // The code below repeats 16/20 times the logic implementing four rounds
    // of the SHA-256/512 core loop as documented by NIST: 16/20 "quad rounds"
    // implement the 64/80 single rounds.
4795     //
4796     //    // Load four word (u32/64) constants (K[t+3], K[t+2], K[t+1], K[t+0])
4797     //    // Output:
4798     //    //   vTmp1 = {K[t+3], K[t+2], K[t+1], K[t+0]}
4799     //    vl1reXX.v vTmp1, ofs
4800     //
4801     //    // Increment word constant address by stride (16/32 bytes, 4*4B/8B, 128b/256b)
4802     //    addi ofs, ofs, 16/32
4803     //
4804     //    // Add constants to message schedule words:
4805     //    //  Input
4806     //    //    vTmp1 = {K[t+3], K[t+2], K[t+1], K[t+0]}
4807     //    //    vW0 = {W[t+3], W[t+2], W[t+1], W[t+0]}; // Vt0 = W[3:0];
4808     //    //  Output
4809     //    //    vTmp0 = {W[t+3]+K[t+3], W[t+2]+K[t+2], W[t+1]+K[t+1], W[t+0]+K[t+0]}
4810     //    vadd.vv vTmp0, vTmp1, vW0
4811     //
4812     //    //  2 rounds of working variables updates.
4813     //    //     vState1[t+4] <- vState1[t], vState0[t], vTmp0[t]
4814     //    //  Input:
4815     //    //    vState1 = {c[t],d[t],g[t],h[t]}   " = vState1[t] "
4816     //    //    vState0 = {a[t],b[t],e[t],f[t]}
4817     //    //    vTmp0 = {W[t+3]+K[t+3], W[t+2]+K[t+2], W[t+1]+K[t+1], W[t+0]+K[t+0]}
4818     //    //  Output:
4819     //    //    vState1 = {f[t+2],e[t+2],b[t+2],a[t+2]}  " = vState0[t+2] "
4820     //    //        = {h[t+4],g[t+4],d[t+4],c[t+4]}  " = vState1[t+4] "
4821     //    vsha2cl.vv vState1, vState0, vTmp0
4822     //
4823     //    //  2 rounds of working variables updates.
4824     //    //     vState0[t+4] <- vState0[t], vState0[t+2], vTmp0[t]
4825     //    //  Input
4826     //    //   vState0 = {a[t],b[t],e[t],f[t]}       " = vState0[t] "
4827     //    //       = {h[t+2],g[t+2],d[t+2],c[t+2]}   " = vState1[t+2] "
4828     //    //   vState1 = {f[t+2],e[t+2],b[t+2],a[t+2]}   " = vState0[t+2] "
4829     //    //   vTmp0 = {W[t+3]+K[t+3], W[t+2]+K[t+2], W[t+1]+K[t+1], W[t+0]+K[t+0]}
4830     //    //  Output:
4831     //    //   vState0 = {f[t+4],e[t+4],b[t+4],a[t+4]}   " = vState0[t+4] "
4832     //    vsha2ch.vv vState0, vState1, vTmp0
4833     //
4834     //    // Combine 2QW into 1QW
4835     //    //
4836     //    // To generate the next 4 words, "new_vW0"/"vTmp0" from vW0-vW3, vsha2ms needs
4837     //    //     vW0[0..3], vW1[0], vW2[1..3], vW3[0, 2..3]
4838     //    // and it can only take 3 vectors as inputs. Hence we need to combine
4839     //    // vW1[0] and vW2[1..3] in a single vector.
4840     //    //
4841     //    // vmerge Vt4, Vt1, Vt2, V0
4842     //    // Input
4843     //    //  V0 = mask // first word from vW2, 1..3 words from vW1
4844     //    //  vW2 = {Wt-8, Wt-7, Wt-6, Wt-5}
4845     //    //  vW1 = {Wt-12, Wt-11, Wt-10, Wt-9}
4846     //    // Output
4847     //    //  Vt4 = {Wt-12, Wt-7, Wt-6, Wt-5}
4848     //    vmerge.vvm vTmp0, vW2, vW1, v0
4849     //
4850     //    // Generate next Four Message Schedule Words (hence allowing for 4 more rounds)
4851     //    // Input
4852     //    //  vW0 = {W[t+ 3], W[t+ 2], W[t+ 1], W[t+ 0]}     W[ 3: 0]
4853     //    //  vW3 = {W[t+15], W[t+14], W[t+13], W[t+12]}     W[15:12]
4854     //    //  vTmp0 = {W[t+11], W[t+10], W[t+ 9], W[t+ 4]}     W[11: 9,4]
4855     //    // Output (next four message schedule words)
4856     //    //  vW0 = {W[t+19],  W[t+18],  W[t+17],  W[t+16]}  W[19:16]
4857     //    vsha2ms.vv vW0, vTmp0, vW3
4858     //
4859     // BEFORE
4860     //  vW0 - vW3 hold the message schedule words (initially the block words)
4861     //    vW0 = W[ 3: 0]   "oldest"
4862     //    vW1 = W[ 7: 4]
4863     //    vW2 = W[11: 8]
4864     //    vW3 = W[15:12]   "newest"
4865     //
    //  vState0 - vState1 hold the working state variables
4867     //    vState0 = {a[t],b[t],e[t],f[t]}   // initially {H5,H4,H1,H0}
4868     //    vState1 = {c[t],d[t],g[t],h[t]}   // initially {H7,H6,H3,H2}
4869     //
4870     // AFTER
4871     //  vW0 - vW3 hold the message schedule words (initially the block words)
4872     //    vW1 = W[ 7: 4]   "oldest"
4873     //    vW2 = W[11: 8]
4874     //    vW3 = W[15:12]
4875     //    vW0 = W[19:16]   "newest"
4876     //
4877     //  vState0 and vState1 hold the working state variables
4878     //    vState0 = {a[t+4],b[t+4],e[t+4],f[t+4]}
4879     //    vState1 = {c[t+4],d[t+4],g[t+4],h[t+4]}
4880     //
4881     //  The group of vectors vW0,vW1,vW2,vW3 is "rotated" by one in each quad-round,
4882     //  hence the uses of those vectors rotate in each round, and we get back to the
4883     //  initial configuration every 4 quad-rounds. We could avoid those changes at
4884     //  the cost of moving those vectors at the end of each quad-rounds.
4885     void sha2_quad_round(Assembler::SEW vset_sew, VectorRegister rot1, VectorRegister rot2, VectorRegister rot3, VectorRegister rot4,
4886                          Register scalarconst, VectorRegister vtemp, VectorRegister vtemp2, VectorRegister v_abef, VectorRegister v_cdgh,
4887                          bool gen_words = true, bool step_const = true) {
4888       __ vleXX_v(vset_sew, vtemp, scalarconst);
4889       if (step_const) {
4890         __ addi(scalarconst, scalarconst, vset_sew == Assembler::e32 ? 16 : 32);
4891       }
4892       __ vadd_vv(vtemp2, vtemp, rot1);
4893       __ vsha2cl_vv(v_cdgh, v_abef, vtemp2);
4894       __ vsha2ch_vv(v_abef, v_cdgh, vtemp2);
4895       if (gen_words) {
4896         __ vmerge_vvm(vtemp2, rot3, rot2);
4897         __ vsha2ms_vv(rot1, vtemp2, rot4);
4898       }
4899     }
4900 
4901     // Arguments:
4902     //
4903     // Inputs:
4904     //   c_rarg0   - byte[]  source+offset
4905     //   c_rarg1   - int[]   SHA.state
4906     //   c_rarg2   - int     offset
4907     //   c_rarg3   - int     limit
4908     //
4909     address generate_sha2_implCompress(Assembler::SEW vset_sew, StubId stub_id) {
4910       alignas(64) static const uint32_t round_consts_256[64] = {
4911         0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
4912         0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
4913         0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
4914         0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
4915         0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
4916         0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
4917         0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
4918         0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
4919         0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
4920         0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
4921         0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
4922         0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
4923         0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
4924         0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
4925         0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
4926         0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
4927       };
4928       alignas(64) static const uint64_t round_consts_512[80] = {
4929         0x428a2f98d728ae22l, 0x7137449123ef65cdl, 0xb5c0fbcfec4d3b2fl,
4930         0xe9b5dba58189dbbcl, 0x3956c25bf348b538l, 0x59f111f1b605d019l,
4931         0x923f82a4af194f9bl, 0xab1c5ed5da6d8118l, 0xd807aa98a3030242l,
4932         0x12835b0145706fbel, 0x243185be4ee4b28cl, 0x550c7dc3d5ffb4e2l,
4933         0x72be5d74f27b896fl, 0x80deb1fe3b1696b1l, 0x9bdc06a725c71235l,
4934         0xc19bf174cf692694l, 0xe49b69c19ef14ad2l, 0xefbe4786384f25e3l,
4935         0x0fc19dc68b8cd5b5l, 0x240ca1cc77ac9c65l, 0x2de92c6f592b0275l,
4936         0x4a7484aa6ea6e483l, 0x5cb0a9dcbd41fbd4l, 0x76f988da831153b5l,
4937         0x983e5152ee66dfabl, 0xa831c66d2db43210l, 0xb00327c898fb213fl,
4938         0xbf597fc7beef0ee4l, 0xc6e00bf33da88fc2l, 0xd5a79147930aa725l,
4939         0x06ca6351e003826fl, 0x142929670a0e6e70l, 0x27b70a8546d22ffcl,
4940         0x2e1b21385c26c926l, 0x4d2c6dfc5ac42aedl, 0x53380d139d95b3dfl,
4941         0x650a73548baf63del, 0x766a0abb3c77b2a8l, 0x81c2c92e47edaee6l,
4942         0x92722c851482353bl, 0xa2bfe8a14cf10364l, 0xa81a664bbc423001l,
4943         0xc24b8b70d0f89791l, 0xc76c51a30654be30l, 0xd192e819d6ef5218l,
4944         0xd69906245565a910l, 0xf40e35855771202al, 0x106aa07032bbd1b8l,
4945         0x19a4c116b8d2d0c8l, 0x1e376c085141ab53l, 0x2748774cdf8eeb99l,
4946         0x34b0bcb5e19b48a8l, 0x391c0cb3c5c95a63l, 0x4ed8aa4ae3418acbl,
4947         0x5b9cca4f7763e373l, 0x682e6ff3d6b2b8a3l, 0x748f82ee5defb2fcl,
4948         0x78a5636f43172f60l, 0x84c87814a1f0ab72l, 0x8cc702081a6439ecl,
4949         0x90befffa23631e28l, 0xa4506cebde82bde9l, 0xbef9a3f7b2c67915l,
4950         0xc67178f2e372532bl, 0xca273eceea26619cl, 0xd186b8c721c0c207l,
4951         0xeada7dd6cde0eb1el, 0xf57d4f7fee6ed178l, 0x06f067aa72176fbal,
4952         0x0a637dc5a2c898a6l, 0x113f9804bef90dael, 0x1b710b35131c471bl,
4953         0x28db77f523047d84l, 0x32caab7b40c72493l, 0x3c9ebe0a15c9bebcl,
4954         0x431d67c49c100d4cl, 0x4cc5d4becb3e42b6l, 0x597f299cfc657e2al,
4955         0x5fcb6fab3ad6faecl, 0x6c44198c4a475817l
4956       };
4957       const int const_add = vset_sew == Assembler::e32 ? 16 : 32;
4958 
4959       bool multi_block;
4960       switch (stub_id) {
4961       case StubId::stubgen_sha256_implCompress_id:
4962         assert (vset_sew == Assembler::e32, "wrong macroassembler for stub");
4963         multi_block = false;
4964         break;
4965       case StubId::stubgen_sha256_implCompressMB_id:
4966         assert (vset_sew == Assembler::e32, "wrong macroassembler for stub");
4967         multi_block = true;
4968         break;
4969       case StubId::stubgen_sha512_implCompress_id:
4970         assert (vset_sew == Assembler::e64, "wrong macroassembler for stub");
4971         multi_block = false;
4972         break;
4973       case StubId::stubgen_sha512_implCompressMB_id:
4974         assert (vset_sew == Assembler::e64, "wrong macroassembler for stub");
4975         multi_block = true;
4976         break;
4977       default:
4978         ShouldNotReachHere();
4979       };
4980       __ align(CodeEntryAlignment);
4981       StubCodeMark mark(_cgen, stub_id);
4982       address start = __ pc();
4983 
4984       Register buf   = c_rarg0;
4985       Register state = c_rarg1;
4986       Register ofs   = c_rarg2;
4987       Register limit = c_rarg3;
4988       Register consts =  t2; // caller saved
4989       Register state_c = x28; // caller saved
4990       VectorRegister vindex = v2;
4991       VectorRegister vW0 = v4;
4992       VectorRegister vW1 = v6;
4993       VectorRegister vW2 = v8;
4994       VectorRegister vW3 = v10;
4995       VectorRegister vState0 = v12;
4996       VectorRegister vState1 = v14;
4997       VectorRegister vHash0  = v16;
4998       VectorRegister vHash1  = v18;
4999       VectorRegister vTmp0   = v20;
5000       VectorRegister vTmp1   = v22;
5001 
5002       Label multi_block_loop;
5003 
5004       __ enter();
5005 
5006       address constant_table = vset_sew == Assembler::e32 ? (address)round_consts_256 : (address)round_consts_512;
5007       la(consts, ExternalAddress(constant_table));
5008 
5009       // Register use in this function:
5010       //
5011       // VECTORS
      //  vW0 - vW3 (512/1024-bits / 4*128/256 bits / 4*4*32/64 bits), hold the message
5013       //             schedule words (Wt). They start with the message block
5014       //             content (W0 to W15), then further words in the message
5015       //             schedule generated via vsha2ms from previous Wt.
5016       //   Initially:
5017       //     vW0 = W[  3:0] = { W3,  W2,  W1,  W0}
5018       //     vW1 = W[  7:4] = { W7,  W6,  W5,  W4}
5019       //     vW2 = W[ 11:8] = {W11, W10,  W9,  W8}
5020       //     vW3 = W[15:12] = {W15, W14, W13, W12}
5021       //
5022       //  vState0 - vState1 hold the working state variables (a, b, ..., h)
5023       //    vState0 = {f[t],e[t],b[t],a[t]}
5024       //    vState1 = {h[t],g[t],d[t],c[t]}
5025       //   Initially:
5026       //    vState0 = {H5i-1, H4i-1, H1i-1 , H0i-1}
      //    vState1 = {H7i-1, H6i-1, H3i-1 , H2i-1}
5028       //
5029       //  v0 = masks for vrgather/vmerge. Single value during the 16 rounds.
5030       //
5031       //  vTmp0 = temporary, Wt+Kt
5032       //  vTmp1 = temporary, Kt
5033       //
5034       //  vHash0/vHash1 = hold the initial values of the hash, byte-swapped.
5035       //
5036       // During most of the function the vector state is configured so that each
5037       // vector is interpreted as containing four 32/64 bits (e32/e64) elements (128/256 bits).
5038 
5039       // vsha2ch/vsha2cl uses EGW of 4*SEW.
5040       // SHA256 SEW = e32, EGW = 128-bits
5041       // SHA512 SEW = e64, EGW = 256-bits
5042       //
5043       // VLEN is required to be at least 128.
5044       // For the case of VLEN=128 and SHA512 we need LMUL=2 to work with 4*e64 (EGW = 256)
5045       //
5046       // m1: LMUL=1/2
5047       // ta: tail agnostic (don't care about those lanes)
5048       // ma: mask agnostic (don't care about those lanes)
      // x0 is not written, we know the number of vector elements.
5050 
5051       if (vset_sew == Assembler::e64 && MaxVectorSize == 16) { // SHA512 and VLEN = 128
5052         __ vsetivli(x0, 4, vset_sew, Assembler::m2, Assembler::ma, Assembler::ta);
5053       } else {
5054         __ vsetivli(x0, 4, vset_sew, Assembler::m1, Assembler::ma, Assembler::ta);
5055       }
5056 
5057       int64_t indexes = vset_sew == Assembler::e32 ? 0x00041014ul : 0x00082028ul;
5058       __ li(t0, indexes);
5059       __ vmv_v_x(vindex, t0);
5060 
5061       // Step-over a,b, so we are pointing to c.
5062       // const_add is equal to 4x state variable, div by 2 is thus 2, a,b
5063       __ addi(state_c, state, const_add/2);
5064 
5065       // Use index-load to get {f,e,b,a},{h,g,d,c}
5066       __ vluxei8_v(vState0, state, vindex);
5067       __ vluxei8_v(vState1, state_c, vindex);
5068 
5069       __ bind(multi_block_loop);
5070 
5071       // Capture the initial H values in vHash0 and vHash1 to allow for computing
5072       // the resulting H', since H' = H+{a',b',c',...,h'}.
5073       __ vmv_v_v(vHash0, vState0);
5074       __ vmv_v_v(vHash1, vState1);
5075 
5076       // Load the 512/1024-bits of the message block in vW0-vW3 and perform
5077       // an endian swap on each 4/8 bytes element.
5078       //
5079       // If Zvkb is not implemented one can use vrgather
5080       // with an index sequence to byte-swap.
5081       //  sequence = [3 2 1 0   7 6 5 4  11 10 9 8   15 14 13 12]
5082       //   <https://oeis.org/A004444> gives us "N ^ 3" as a nice formula to generate
5083       //  this sequence. 'vid' gives us the N.
5084       __ vleXX_v(vset_sew, vW0, buf);
5085       __ vrev8_v(vW0, vW0);
5086       __ addi(buf, buf, const_add);
5087       __ vleXX_v(vset_sew, vW1, buf);
5088       __ vrev8_v(vW1, vW1);
5089       __ addi(buf, buf, const_add);
5090       __ vleXX_v(vset_sew, vW2, buf);
5091       __ vrev8_v(vW2, vW2);
5092       __ addi(buf, buf, const_add);
5093       __ vleXX_v(vset_sew, vW3, buf);
5094       __ vrev8_v(vW3, vW3);
5095       __ addi(buf, buf, const_add);
5096 
5097       // Set v0 up for the vmerge that replaces the first word (idx==0)
5098       __ vid_v(v0);
5099       __ vmseq_vi(v0, v0, 0x0);  // v0.mask[i] = (i == 0 ? 1 : 0)
5100 
5101       VectorRegister rotation_regs[] = {vW0, vW1, vW2, vW3};
5102       int rot_pos = 0;
5103       // Quad-round #0 (+0, vW0->vW1->vW2->vW3) ... #11 (+3, vW3->vW0->vW1->vW2)
5104       const int qr_end = vset_sew == Assembler::e32 ? 12 : 16;
5105       for (int i = 0; i < qr_end; i++) {
5106         sha2_quad_round(vset_sew,
5107                    rotation_regs[(rot_pos + 0) & 0x3],
5108                    rotation_regs[(rot_pos + 1) & 0x3],
5109                    rotation_regs[(rot_pos + 2) & 0x3],
5110                    rotation_regs[(rot_pos + 3) & 0x3],
5111                    consts,
5112                    vTmp1, vTmp0, vState0, vState1);
5113         ++rot_pos;
5114       }
5115       // Quad-round #12 (+0, vW0->vW1->vW2->vW3) ... #15 (+3, vW3->vW0->vW1->vW2)
5116       // Note that we stop generating new message schedule words (Wt, vW0-13)
5117       // as we already generated all the words we end up consuming (i.e., W[63:60]).
5118       const int qr_c_end = qr_end + 4;
5119       for (int i = qr_end; i < qr_c_end; i++) {
5120         sha2_quad_round(vset_sew,
5121                    rotation_regs[(rot_pos + 0) & 0x3],
5122                    rotation_regs[(rot_pos + 1) & 0x3],
5123                    rotation_regs[(rot_pos + 2) & 0x3],
5124                    rotation_regs[(rot_pos + 3) & 0x3],
5125                    consts,
5126                    vTmp1, vTmp0, vState0, vState1, false, i < (qr_c_end-1));
5127         ++rot_pos;
5128       }
5129 
5130       //--------------------------------------------------------------------------------
5131       // Compute the updated hash value H'
5132       //   H' = H + {h',g',...,b',a'}
5133       //      = {h,g,...,b,a} + {h',g',...,b',a'}
5134       //      = {h+h',g+g',...,b+b',a+a'}
5135 
5136       // H' = H+{a',b',c',...,h'}
5137       __ vadd_vv(vState0, vHash0, vState0);
5138       __ vadd_vv(vState1, vHash1, vState1);
5139 
5140       if (multi_block) {
5141         int total_adds = vset_sew == Assembler::e32 ? 240 : 608;
5142         __ subi(consts, consts, total_adds);
5143         __ addi(ofs, ofs, vset_sew == Assembler::e32 ? 64 : 128);
5144         __ ble(ofs, limit, multi_block_loop);
5145         __ mv(c_rarg0, ofs); // return ofs
5146       }
5147 
5148       // Store H[0..8] = {a,b,c,d,e,f,g,h} from
5149       //  vState0 = {f,e,b,a}
5150       //  vState1 = {h,g,d,c}
5151       __ vsuxei8_v(vState0, state,   vindex);
5152       __ vsuxei8_v(vState1, state_c, vindex);
5153 
5154       __ leave();
5155       __ ret();
5156 
5157       return start;
5158     }
5159   };
5160 
5161 #undef __
5162 #define __ _masm->
5163 
5164   // Set of L registers that correspond to a contiguous memory area.
5165   // Each 64-bit register typically corresponds to 2 32-bit integers.
5166   template <uint L>
5167   class RegCache {
5168   private:
5169     MacroAssembler *_masm;
5170     Register _regs[L];
5171 
5172   public:
5173     RegCache(MacroAssembler *masm, RegSet rs): _masm(masm) {
5174       assert(rs.size() == L, "%u registers are used to cache %u 4-byte data", rs.size(), 2 * L);
5175       auto it = rs.begin();
5176       for (auto &r: _regs) {
5177         r = *it;
5178         ++it;
5179       }
5180     }
5181 
5182     // generate load for the i'th register
5183     void gen_load(uint i, Register base) {
5184       assert(i < L, "invalid i: %u", i);
5185       __ ld(_regs[i], Address(base, 8 * i));
5186     }
5187 
5188     // add i'th 32-bit integer to dest
5189     void add_u32(const Register dest, uint i, const Register rtmp = t0) {
5190       assert(i < 2 * L, "invalid i: %u", i);
5191 
5192       if (is_even(i)) {
5193         // Use the bottom 32 bits. No need to mask off the top 32 bits
5194         // as addw will do the right thing.
5195         __ addw(dest, dest, _regs[i / 2]);
5196       } else {
5197         // Use the top 32 bits by right-shifting them.
5198         __ srli(rtmp, _regs[i / 2], 32);
5199         __ addw(dest, dest, rtmp);
5200       }
5201     }
5202   };
5203 
5204   typedef RegCache<8> BufRegCache;
5205 
  // Common tail of the md5_FF/md5_GG/md5_HH/md5_II step helpers. Emits code for:
  //   a += value + x + ac;
  //   a = Integer.rotateLeft(a, s) + b;
  //
  //   reg_cache - register cache that supplies x, the k'th cached 32-bit word
  //   a         - register updated by this step
  //   b         - register added after the rotation
  //   c, d      - not referenced in this function
  //   k         - index of the cached 32-bit word to add (x)
  //   s         - left-rotate amount
  //   t         - additive constant (ac)
  //   value     - register holding the value computed by the caller
  void m5_FF_GG_HH_II_epilogue(BufRegCache& reg_cache,
                               Register a, Register b, Register c, Register d,
                               int k, int s, int t,
                               Register value) {
    // a += ac
    // t1 is handed to addw together with the immediate
    // (presumably as scratch for the constant - confirm in MacroAssembler).
    __ addw(a, a, t, t1);

    // a += x;
    // add_u32 is called with its default temporary (t0), which it writes for odd k.
    reg_cache.add_u32(a, k);
    // a += value;
    __ addw(a, a, value);

    // a = Integer.rotateLeft(a, s) + b;
    __ rolw(a, a, s);
    __ addw(a, a, b);
  }
5224 
  // Emits one MD5 round-1 step:
  //   a += ((b & c) | ((~b) & d)) + x + ac;
  //   a = Integer.rotateLeft(a, s) + b;
  //
  //   reg_cache    - register cache that supplies x, the k'th cached 32-bit word
  //   a, b, c, d   - state registers in the rotation used for this step
  //   k, s, t      - word index, left-rotate amount and additive constant (ac)
  //   rtmp1, rtmp2 - scratch registers, both overwritten
  void md5_FF(BufRegCache& reg_cache,
              Register a, Register b, Register c, Register d,
              int k, int s, int t,
              Register rtmp1, Register rtmp2) {
    // rtmp1 = b & c
    __ andr(rtmp1, b, c);

    // rtmp2 = (~b) & d
    __ andn(rtmp2, d, b);

    // rtmp1 = (b & c) | ((~b) & d)
    __ orr(rtmp1, rtmp1, rtmp2);

    // add the value in rtmp1, the message word and the constant, then rotate and add b
    m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1);
  }
5242 
  // Emits one MD5 round-2 step:
  //   a += ((b & d) | (c & (~d))) + x + ac;
  //   a = Integer.rotateLeft(a, s) + b;
  //
  //   reg_cache    - register cache that supplies x, the k'th cached 32-bit word
  //   a, b, c, d   - state registers in the rotation used for this step
  //   k, s, t      - word index, left-rotate amount and additive constant (ac)
  //   rtmp1, rtmp2 - scratch registers, both overwritten
  void md5_GG(BufRegCache& reg_cache,
              Register a, Register b, Register c, Register d,
              int k, int s, int t,
              Register rtmp1, Register rtmp2) {
    // rtmp1 = b & d
    __ andr(rtmp1, b, d);

    // rtmp2 = c & (~d)
    __ andn(rtmp2, c, d);

    // rtmp1 = (b & d) | (c & (~d))
    __ orr(rtmp1, rtmp1, rtmp2);

    // add the value in rtmp1, the message word and the constant, then rotate and add b
    m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1);
  }
5260 
  // Emits one MD5 round-3 step:
  //   a += ((b ^ c) ^ d) + x + ac;
  //   a = Integer.rotateLeft(a, s) + b;
  //
  //   reg_cache    - register cache that supplies x, the k'th cached 32-bit word
  //   a, b, c, d   - state registers in the rotation used for this step
  //   k, s, t      - word index, left-rotate amount and additive constant (ac)
  //   rtmp1, rtmp2 - scratch registers, both overwritten
  void md5_HH(BufRegCache& reg_cache,
              Register a, Register b, Register c, Register d,
              int k, int s, int t,
              Register rtmp1, Register rtmp2) {
    // rtmp1 = (b ^ c) ^ d
    // (rtmp2 receives the intermediate b ^ c)
    __ xorr(rtmp2, b, c);
    __ xorr(rtmp1, rtmp2, d);

    // add the value in rtmp1, the message word and the constant, then rotate and add b
    m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1);
  }
5273 
  // Emits one MD5 round-4 step:
  //   a += (c ^ (b | (~d))) + x + ac;
  //   a = Integer.rotateLeft(a, s) + b;
  //
  //   reg_cache    - register cache that supplies x, the k'th cached 32-bit word
  //   a, b, c, d   - state registers in the rotation used for this step
  //   k, s, t      - word index, left-rotate amount and additive constant (ac)
  //   rtmp1, rtmp2 - scratch registers, both overwritten
  void md5_II(BufRegCache& reg_cache,
              Register a, Register b, Register c, Register d,
              int k, int s, int t,
              Register rtmp1, Register rtmp2) {
    // rtmp1 = c ^ (b | (~d))
    // (rtmp2 receives the intermediate b | (~d))
    __ orn(rtmp2, b, d);
    __ xorr(rtmp1, c, rtmp2);

    // add the value in rtmp1, the message word and the constant, then rotate and add b
    m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1);
  }
5286 
  // Generates the MD5 compression stub selected by stub_id (single-block or
  // multi-block variant) and returns the address of its first instruction.
  //
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - byte[]  source+offset
  //   c_rarg1   - int[]   SHA.state
  //   c_rarg2   - int     offset  (multi_block == True)
  //   c_rarg3   - int     limit   (multi_block == True)
  //
  // Output:
  //   c_rarg0   - int     updated offset  (multi_block == True)
  //
  // Registers:
  //    x0   zero  (zero)
  //    x1     ra  (return address)
  //    x2     sp  (stack pointer)
  //    x3     gp  (global pointer)
  //    x4     tp  (thread pointer)
  //    x5     t0  (tmp register)
  //    x6     t1  (tmp register)
  //    x7     t2  state0
  //    x8  fp/s0  (frame pointer)
  //    x9     s1
  //   x10     a0  rtmp1 / c_rarg0
  //   x11     a1  rtmp2 / c_rarg1
  //   x12     a2  a     / c_rarg2
  //   x13     a3  b     / c_rarg3
  //   x14     a4  c
  //   x15     a5  d
  //   x16     a6  buf
  //   x17     a7  state
  //   x18     s2  ofs     [saved-reg]  (multi_block == True)
  //   x19     s3  limit   [saved-reg]  (multi_block == True)
  //   x20     s4  state1  [saved-reg]
  //   x21     s5  state2  [saved-reg]
  //   x22     s6  state3  [saved-reg]
  //   x23     s7
  //   x24     s8  buf0    [saved-reg]
  //   x25     s9  buf1    [saved-reg]
  //   x26    s10  buf2    [saved-reg]
  //   x27    s11  buf3    [saved-reg]
  //   x28     t3  buf4
  //   x29     t4  buf5
  //   x30     t5  buf6
  //   x31     t6  buf7
  address generate_md5_implCompress(StubId stub_id) {
    __ align(CodeEntryAlignment);
    // The stub id selects between the single-block and the multi-block variant.
    bool multi_block;
    switch (stub_id) {
    case StubId::stubgen_md5_implCompress_id:
      multi_block = false;
      break;
    case StubId::stubgen_md5_implCompressMB_id:
      multi_block = true;
      break;
    default:
      ShouldNotReachHere();
    };
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    // rotation constants
    // Sij is the left-rotate amount used by the j'th step (cycling every
    // four steps) of round i.
    const int S11 = 7;
    const int S12 = 12;
    const int S13 = 17;
    const int S14 = 22;
    const int S21 = 5;
    const int S22 = 9;
    const int S23 = 14;
    const int S24 = 20;
    const int S31 = 4;
    const int S32 = 11;
    const int S33 = 16;
    const int S34 = 23;
    const int S41 = 6;
    const int S42 = 10;
    const int S43 = 15;
    const int S44 = 21;

    // used at write-back to keep only the low 32 bits of state0/state2
    const int64_t mask32 = 0xffffffff;

    Register buf_arg   = c_rarg0; // a0
    Register state_arg = c_rarg1; // a1
    Register ofs_arg   = c_rarg2; // a2
    Register limit_arg = c_rarg3; // a3

    // we'll copy the args to these registers to free up a0-a3
    // to use for other values manipulated by instructions
    // that can be compressed
    Register buf       = x16; // a6
    Register state     = x17; // a7
    Register ofs       = x18; // s2
    Register limit     = x19; // s3

    // using x12->15 to allow compressed instructions
    Register a         = x12; // a2
    Register b         = x13; // a3
    Register c         = x14; // a4
    Register d         = x15; // a5

    Register state0    =  x7; // t2
    Register state1    = x20; // s4
    Register state2    = x21; // s5
    Register state3    = x22; // s6

    // using x10->x11 to allow compressed instructions
    Register rtmp1     = x10; // a0
    Register rtmp2     = x11; // a1

    // The 64-byte input block is cached in 8 registers. Four of them
    // (s8-s11) are added to saved_regs below; t3-t6 are not.
    RegSet reg_cache_saved_regs = RegSet::of(x24, x25, x26, x27); // s8, s9, s10, s11
    RegSet reg_cache_regs;
    reg_cache_regs += reg_cache_saved_regs;
    reg_cache_regs += RegSet::of(t3, t4, t5, t6);
    BufRegCache reg_cache(_masm, reg_cache_regs);

    // Registers pushed on entry and popped before returning.
    // ofs/limit are only used (and hence only saved) in the multi-block variant.
    RegSet saved_regs;
    if (multi_block) {
      saved_regs += RegSet::of(ofs, limit);
    }
    saved_regs += RegSet::of(state1, state2, state3);
    saved_regs += reg_cache_saved_regs;

    __ push_reg(saved_regs, sp);

    // Move the incoming arguments out of a0-a3, which are reused below
    // as rtmp1, rtmp2, a and b.
    __ mv(buf, buf_arg);
    __ mv(state, state_arg);
    if (multi_block) {
      __ mv(ofs, ofs_arg);
      __ mv(limit, limit_arg);
    }

    // to minimize the number of memory operations:
    // read the 4 state 4-byte values in pairs, with a single ld,
    // and split them into 2 registers.
    //
    // And, as the core algorithm of md5 works on 32-bits words, so
    // in the following code, it does not care about the content of
    // higher 32-bits in state[x]. Based on this observation,
    // we can apply further optimization, which is to just ignore the
    // higher 32-bits in state0/state2, rather than set the higher
    // 32-bits of state0/state2 to zero explicitly with extra instructions.
    __ ld(state0, Address(state));
    __ srli(state1, state0, 32);
    __ ld(state2, Address(state, 8));
    __ srli(state3, state2, 32);

    // Start of the per-block code; the multi-block variant branches back here.
    Label md5_loop;
    __ BIND(md5_loop);

    // Work on copies so the incoming state can be added back at the end.
    __ mv(a, state0);
    __ mv(b, state1);
    __ mv(c, state2);
    __ mv(d, state3);

    // Round 1
    // Each gen_load fetches 8 bytes (two message words) from buf and is
    // emitted right before the first step that consumes one of those words.
    reg_cache.gen_load(0, buf);
    md5_FF(reg_cache, a, b, c, d,  0, S11, 0xd76aa478, rtmp1, rtmp2);
    md5_FF(reg_cache, d, a, b, c,  1, S12, 0xe8c7b756, rtmp1, rtmp2);
    reg_cache.gen_load(1, buf);
    md5_FF(reg_cache, c, d, a, b,  2, S13, 0x242070db, rtmp1, rtmp2);
    md5_FF(reg_cache, b, c, d, a,  3, S14, 0xc1bdceee, rtmp1, rtmp2);
    reg_cache.gen_load(2, buf);
    md5_FF(reg_cache, a, b, c, d,  4, S11, 0xf57c0faf, rtmp1, rtmp2);
    md5_FF(reg_cache, d, a, b, c,  5, S12, 0x4787c62a, rtmp1, rtmp2);
    reg_cache.gen_load(3, buf);
    md5_FF(reg_cache, c, d, a, b,  6, S13, 0xa8304613, rtmp1, rtmp2);
    md5_FF(reg_cache, b, c, d, a,  7, S14, 0xfd469501, rtmp1, rtmp2);
    reg_cache.gen_load(4, buf);
    md5_FF(reg_cache, a, b, c, d,  8, S11, 0x698098d8, rtmp1, rtmp2);
    md5_FF(reg_cache, d, a, b, c,  9, S12, 0x8b44f7af, rtmp1, rtmp2);
    reg_cache.gen_load(5, buf);
    md5_FF(reg_cache, c, d, a, b, 10, S13, 0xffff5bb1, rtmp1, rtmp2);
    md5_FF(reg_cache, b, c, d, a, 11, S14, 0x895cd7be, rtmp1, rtmp2);
    reg_cache.gen_load(6, buf);
    md5_FF(reg_cache, a, b, c, d, 12, S11, 0x6b901122, rtmp1, rtmp2);
    md5_FF(reg_cache, d, a, b, c, 13, S12, 0xfd987193, rtmp1, rtmp2);
    reg_cache.gen_load(7, buf);
    md5_FF(reg_cache, c, d, a, b, 14, S13, 0xa679438e, rtmp1, rtmp2);
    md5_FF(reg_cache, b, c, d, a, 15, S14, 0x49b40821, rtmp1, rtmp2);

    // Round 2
    // From here on all 16 message words come from the register cache;
    // no further loads from buf are emitted for this block.
    md5_GG(reg_cache, a, b, c, d,  1, S21, 0xf61e2562, rtmp1, rtmp2);
    md5_GG(reg_cache, d, a, b, c,  6, S22, 0xc040b340, rtmp1, rtmp2);
    md5_GG(reg_cache, c, d, a, b, 11, S23, 0x265e5a51, rtmp1, rtmp2);
    md5_GG(reg_cache, b, c, d, a,  0, S24, 0xe9b6c7aa, rtmp1, rtmp2);
    md5_GG(reg_cache, a, b, c, d,  5, S21, 0xd62f105d, rtmp1, rtmp2);
    md5_GG(reg_cache, d, a, b, c, 10, S22, 0x02441453, rtmp1, rtmp2);
    md5_GG(reg_cache, c, d, a, b, 15, S23, 0xd8a1e681, rtmp1, rtmp2);
    md5_GG(reg_cache, b, c, d, a,  4, S24, 0xe7d3fbc8, rtmp1, rtmp2);
    md5_GG(reg_cache, a, b, c, d,  9, S21, 0x21e1cde6, rtmp1, rtmp2);
    md5_GG(reg_cache, d, a, b, c, 14, S22, 0xc33707d6, rtmp1, rtmp2);
    md5_GG(reg_cache, c, d, a, b,  3, S23, 0xf4d50d87, rtmp1, rtmp2);
    md5_GG(reg_cache, b, c, d, a,  8, S24, 0x455a14ed, rtmp1, rtmp2);
    md5_GG(reg_cache, a, b, c, d, 13, S21, 0xa9e3e905, rtmp1, rtmp2);
    md5_GG(reg_cache, d, a, b, c,  2, S22, 0xfcefa3f8, rtmp1, rtmp2);
    md5_GG(reg_cache, c, d, a, b,  7, S23, 0x676f02d9, rtmp1, rtmp2);
    md5_GG(reg_cache, b, c, d, a, 12, S24, 0x8d2a4c8a, rtmp1, rtmp2);

    // Round 3
    md5_HH(reg_cache, a, b, c, d,  5, S31, 0xfffa3942, rtmp1, rtmp2);
    md5_HH(reg_cache, d, a, b, c,  8, S32, 0x8771f681, rtmp1, rtmp2);
    md5_HH(reg_cache, c, d, a, b, 11, S33, 0x6d9d6122, rtmp1, rtmp2);
    md5_HH(reg_cache, b, c, d, a, 14, S34, 0xfde5380c, rtmp1, rtmp2);
    md5_HH(reg_cache, a, b, c, d,  1, S31, 0xa4beea44, rtmp1, rtmp2);
    md5_HH(reg_cache, d, a, b, c,  4, S32, 0x4bdecfa9, rtmp1, rtmp2);
    md5_HH(reg_cache, c, d, a, b,  7, S33, 0xf6bb4b60, rtmp1, rtmp2);
    md5_HH(reg_cache, b, c, d, a, 10, S34, 0xbebfbc70, rtmp1, rtmp2);
    md5_HH(reg_cache, a, b, c, d, 13, S31, 0x289b7ec6, rtmp1, rtmp2);
    md5_HH(reg_cache, d, a, b, c,  0, S32, 0xeaa127fa, rtmp1, rtmp2);
    md5_HH(reg_cache, c, d, a, b,  3, S33, 0xd4ef3085, rtmp1, rtmp2);
    md5_HH(reg_cache, b, c, d, a,  6, S34, 0x04881d05, rtmp1, rtmp2);
    md5_HH(reg_cache, a, b, c, d,  9, S31, 0xd9d4d039, rtmp1, rtmp2);
    md5_HH(reg_cache, d, a, b, c, 12, S32, 0xe6db99e5, rtmp1, rtmp2);
    md5_HH(reg_cache, c, d, a, b, 15, S33, 0x1fa27cf8, rtmp1, rtmp2);
    md5_HH(reg_cache, b, c, d, a,  2, S34, 0xc4ac5665, rtmp1, rtmp2);

    // Round 4
    md5_II(reg_cache, a, b, c, d,  0, S41, 0xf4292244, rtmp1, rtmp2);
    md5_II(reg_cache, d, a, b, c,  7, S42, 0x432aff97, rtmp1, rtmp2);
    md5_II(reg_cache, c, d, a, b, 14, S43, 0xab9423a7, rtmp1, rtmp2);
    md5_II(reg_cache, b, c, d, a,  5, S44, 0xfc93a039, rtmp1, rtmp2);
    md5_II(reg_cache, a, b, c, d, 12, S41, 0x655b59c3, rtmp1, rtmp2);
    md5_II(reg_cache, d, a, b, c,  3, S42, 0x8f0ccc92, rtmp1, rtmp2);
    md5_II(reg_cache, c, d, a, b, 10, S43, 0xffeff47d, rtmp1, rtmp2);
    md5_II(reg_cache, b, c, d, a,  1, S44, 0x85845dd1, rtmp1, rtmp2);
    md5_II(reg_cache, a, b, c, d,  8, S41, 0x6fa87e4f, rtmp1, rtmp2);
    md5_II(reg_cache, d, a, b, c, 15, S42, 0xfe2ce6e0, rtmp1, rtmp2);
    md5_II(reg_cache, c, d, a, b,  6, S43, 0xa3014314, rtmp1, rtmp2);
    md5_II(reg_cache, b, c, d, a, 13, S44, 0x4e0811a1, rtmp1, rtmp2);
    md5_II(reg_cache, a, b, c, d,  4, S41, 0xf7537e82, rtmp1, rtmp2);
    md5_II(reg_cache, d, a, b, c, 11, S42, 0xbd3af235, rtmp1, rtmp2);
    md5_II(reg_cache, c, d, a, b,  2, S43, 0x2ad7d2bb, rtmp1, rtmp2);
    md5_II(reg_cache, b, c, d, a,  9, S44, 0xeb86d391, rtmp1, rtmp2);

    // Add this block's result to the state held in registers.
    __ addw(state0, state0, a);
    __ addw(state1, state1, b);
    __ addw(state2, state2, c);
    __ addw(state3, state3, d);

    if (multi_block) {
      // Advance to the next 64-byte block. The state stays in registers
      // across iterations and is only written back after the loop.
      __ addi(buf, buf, 64);
      __ addi(ofs, ofs, 64);
      // if (ofs <= limit) goto md5_loop
      __ bge(limit, ofs, md5_loop);
      __ mv(c_rarg0, ofs); // return ofs
    }

    // to minimize the number of memory operations:
    // write back the 4 state 4-byte values in pairs, with a single sd
    // (low word masked to 32 bits, high word shifted into the upper half)
    __ mv(t0, mask32);
    __ andr(state0, state0, t0);
    __ slli(state1, state1, 32);
    __ orr(state0, state0, state1);
    __ sd(state0, Address(state));
    __ andr(state2, state2, t0);
    __ slli(state3, state3, 32);
    __ orr(state2, state2, state3);
    __ sd(state2, Address(state, 8));

    __ pop_reg(saved_regs, sp);
    __ ret();

    return (address) start;
  }
5547 
5548   /**
5549    * Perform the quarter round calculations on values contained within four vector registers.
5550    *
5551    * @param aVec the SIMD register containing only the "a" values
5552    * @param bVec the SIMD register containing only the "b" values
5553    * @param cVec the SIMD register containing only the "c" values
5554    * @param dVec the SIMD register containing only the "d" values
5555    * @param tmp_vr temporary vector register holds intermedia values.
5556    */
5557   void chacha20_quarter_round(VectorRegister aVec, VectorRegister bVec,
5558                           VectorRegister cVec, VectorRegister dVec, VectorRegister tmp_vr) {
5559     // a += b, d ^= a, d <<<= 16
5560     __ vadd_vv(aVec, aVec, bVec);
5561     __ vxor_vv(dVec, dVec, aVec);
5562     __ vrole32_vi(dVec, 16, tmp_vr);
5563 
5564     // c += d, b ^= c, b <<<= 12
5565     __ vadd_vv(cVec, cVec, dVec);
5566     __ vxor_vv(bVec, bVec, cVec);
5567     __ vrole32_vi(bVec, 12, tmp_vr);
5568 
5569     // a += b, d ^= a, d <<<= 8
5570     __ vadd_vv(aVec, aVec, bVec);
5571     __ vxor_vv(dVec, dVec, aVec);
5572     __ vrole32_vi(dVec, 8, tmp_vr);
5573 
5574     // c += d, b ^= c, b <<<= 7
5575     __ vadd_vv(cVec, cVec, dVec);
5576     __ vxor_vv(bVec, bVec, cVec);
5577     __ vrole32_vi(bVec, 7, tmp_vr);
5578   }
5579 
5580   /**
5581    * int com.sun.crypto.provider.ChaCha20Cipher.implChaCha20Block(int[] initState, byte[] result)
5582    *
5583    *  Input arguments:
5584    *  c_rarg0   - state, the starting state
5585    *  c_rarg1   - key_stream, the array that will hold the result of the ChaCha20 block function
5586    *
5587    *  Implementation Note:
5588    *   Parallelization is achieved by loading individual state elements into vectors for N blocks.
5589    *   N depends on single vector register length.
5590    */
5591   address generate_chacha20Block() {
5592     Label L_Rounds;
5593 
5594     __ align(CodeEntryAlignment);
5595     StubId stub_id = StubId::stubgen_chacha20Block_id;
5596     StubCodeMark mark(this, stub_id);
5597     address start = __ pc();
5598     __ enter();
5599 
5600     const int states_len = 16;
5601     const int step = 4;
5602     const Register state = c_rarg0;
5603     const Register key_stream = c_rarg1;
5604     const Register tmp_addr = t0;
5605     const Register length = t1;
5606 
5607     // Organize vector registers in an array that facilitates
5608     // putting repetitive opcodes into loop structures below.
5609     const VectorRegister work_vrs[16] = {
5610       v0, v1, v2,  v3,  v4,  v5,  v6,  v7,
5611       v8, v9, v10, v11, v12, v13, v14, v15
5612     };
5613     const VectorRegister tmp_vr = v16;
5614     const VectorRegister counter_vr = v17;
5615 
5616     {
5617       // Put 16 here, as com.sun.crypto.providerChaCha20Cipher.KS_MAX_LEN is 1024
5618       // in java level.
5619       __ vsetivli(length, 16, Assembler::e32, Assembler::m1);
5620     }
5621 
5622     // Load from source state.
5623     // Every element in source state is duplicated to all elements in the corresponding vector.
5624     __ mv(tmp_addr, state);
5625     for (int i = 0; i < states_len; i += 1) {
5626       __ vlse32_v(work_vrs[i], tmp_addr, zr);
5627       __ addi(tmp_addr, tmp_addr, step);
5628     }
5629     // Adjust counter for every individual block.
5630     __ vid_v(counter_vr);
5631     __ vadd_vv(work_vrs[12], work_vrs[12], counter_vr);
5632 
5633     // Perform 10 iterations of the 8 quarter round set
5634     {
5635       const Register loop = t2; // share t2 with other non-overlapping usages.
5636       __ mv(loop, 10);
5637       __ BIND(L_Rounds);
5638 
5639       chacha20_quarter_round(work_vrs[0], work_vrs[4], work_vrs[8],  work_vrs[12], tmp_vr);
5640       chacha20_quarter_round(work_vrs[1], work_vrs[5], work_vrs[9],  work_vrs[13], tmp_vr);
5641       chacha20_quarter_round(work_vrs[2], work_vrs[6], work_vrs[10], work_vrs[14], tmp_vr);
5642       chacha20_quarter_round(work_vrs[3], work_vrs[7], work_vrs[11], work_vrs[15], tmp_vr);
5643 
5644       chacha20_quarter_round(work_vrs[0], work_vrs[5], work_vrs[10], work_vrs[15], tmp_vr);
5645       chacha20_quarter_round(work_vrs[1], work_vrs[6], work_vrs[11], work_vrs[12], tmp_vr);
5646       chacha20_quarter_round(work_vrs[2], work_vrs[7], work_vrs[8],  work_vrs[13], tmp_vr);
5647       chacha20_quarter_round(work_vrs[3], work_vrs[4], work_vrs[9],  work_vrs[14], tmp_vr);
5648 
5649       __ subi(loop, loop, 1);
5650       __ bnez(loop, L_Rounds);
5651     }
5652 
5653     // Add the original state into the end working state.
5654     // We do this by first duplicating every element in source state array to the corresponding
5655     // vector, then adding it to the post-loop working state.
5656     __ mv(tmp_addr, state);
5657     for (int i = 0; i < states_len; i += 1) {
5658       __ vlse32_v(tmp_vr, tmp_addr, zr);
5659       __ addi(tmp_addr, tmp_addr, step);
5660       __ vadd_vv(work_vrs[i], work_vrs[i], tmp_vr);
5661     }
5662     // Add the counter overlay onto work_vrs[12] at the end.
5663     __ vadd_vv(work_vrs[12], work_vrs[12], counter_vr);
5664 
5665     // Store result to key stream.
5666     {
5667       const Register stride = t2; // share t2 with other non-overlapping usages.
5668       // Every block occupies 64 bytes, so we use 64 as stride of the vector store.
5669       __ mv(stride, 64);
5670       for (int i = 0; i < states_len; i += 1) {
5671         __ vsse32_v(work_vrs[i], key_stream, stride);
5672         __ addi(key_stream, key_stream, step);
5673       }
5674     }
5675 
5676     // Return length of output key_stream
5677     __ slli(c_rarg0, length, 6);
5678 
5679     __ leave();
5680     __ ret();
5681 
5682     return (address) start;
5683   }
5684 
5685 
5686   // ------------------------ SHA-1 intrinsic ------------------------
5687 
5688   // K't =
5689   //    5a827999, 0  <= t <= 19
5690   //    6ed9eba1, 20 <= t <= 39
5691   //    8f1bbcdc, 40 <= t <= 59
5692   //    ca62c1d6, 60 <= t <= 79
5693   void sha1_prepare_k(Register cur_k, int round) {
5694     assert(round >= 0 && round < 80, "must be");
5695 
5696     static const int64_t ks[] = {0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6};
5697     if ((round % 20) == 0) {
5698       __ mv(cur_k, ks[round/20]);
5699     }
5700   }
5701 
5702   // W't =
5703   //    M't,                                      0 <=  t <= 15
5704   //    ROTL'1(W't-3 ^ W't-8 ^ W't-14 ^ W't-16),  16 <= t <= 79
5705   void sha1_prepare_w(Register cur_w, Register ws[], Register buf, int round) {
5706     assert(round >= 0 && round < 80, "must be");
5707 
5708     if (round < 16) {
5709       // in the first 16 rounds, in ws[], every register contains 2 W't, e.g.
5710       //   in ws[0], high part contains W't-0, low part contains W't-1,
5711       //   in ws[1], high part contains W't-2, low part contains W't-3,
5712       //   ...
5713       //   in ws[7], high part contains W't-14, low part contains W't-15.
5714 
5715       if ((round % 2) == 0) {
5716         __ ld(ws[round/2], Address(buf, (round/2) * 8));
5717         // reverse bytes, as SHA-1 is defined in big-endian.
5718         __ revb(ws[round/2], ws[round/2]);
5719         __ srli(cur_w, ws[round/2], 32);
5720       } else {
5721         __ mv(cur_w, ws[round/2]);
5722       }
5723 
5724       return;
5725     }
5726 
5727     if ((round % 2) == 0) {
5728       int idx = 16;
5729       // W't = ROTL'1(W't-3 ^ W't-8 ^ W't-14 ^ W't-16),  16 <= t <= 79
5730       __ srli(t1, ws[(idx-8)/2], 32);
5731       __ xorr(t0, ws[(idx-3)/2], t1);
5732 
5733       __ srli(t1, ws[(idx-14)/2], 32);
5734       __ srli(cur_w, ws[(idx-16)/2], 32);
5735       __ xorr(cur_w, cur_w, t1);
5736 
5737       __ xorr(cur_w, cur_w, t0);
5738       __ rolw(cur_w, cur_w, 1, t0);
5739 
5740       // copy the cur_w value to ws[8].
5741       // now, valid w't values are at:
5742       //  w0:       ws[0]'s lower 32 bits
5743       //  w1 ~ w14: ws[1] ~ ws[7]
5744       //  w15:      ws[8]'s higher 32 bits
5745       __ slli(ws[idx/2], cur_w, 32);
5746 
5747       return;
5748     }
5749 
5750     int idx = 17;
5751     // W't = ROTL'1(W't-3 ^ W't-8 ^ W't-14 ^ W't-16),  16 <= t <= 79
5752     __ srli(t1, ws[(idx-3)/2], 32);
5753     __ xorr(t0, t1, ws[(idx-8)/2]);
5754 
5755     __ xorr(cur_w, ws[(idx-16)/2], ws[(idx-14)/2]);
5756 
5757     __ xorr(cur_w, cur_w, t0);
5758     __ rolw(cur_w, cur_w, 1, t0);
5759 
5760     // copy the cur_w value to ws[8]
5761     __ zext(cur_w, cur_w, 32);
5762     __ orr(ws[idx/2], ws[idx/2], cur_w);
5763 
5764     // shift the w't registers, so they start from ws[0] again.
5765     // now, valid w't values are at:
5766     //  w0 ~ w15: ws[0] ~ ws[7]
5767     Register ws_0 = ws[0];
5768     for (int i = 0; i < 16/2; i++) {
5769       ws[i] = ws[i+1];
5770     }
5771     ws[8] = ws_0;
5772   }
5773 
5774   // f't(x, y, z) =
5775   //    Ch(x, y, z)     = (x & y) ^ (~x & z)            , 0  <= t <= 19
5776   //    Parity(x, y, z) = x ^ y ^ z                     , 20 <= t <= 39
5777   //    Maj(x, y, z)    = (x & y) ^ (x & z) ^ (y & z)   , 40 <= t <= 59
5778   //    Parity(x, y, z) = x ^ y ^ z                     , 60 <= t <= 79
5779   void sha1_f(Register dst, Register x, Register y, Register z, int round) {
5780     assert(round >= 0 && round < 80, "must be");
5781     assert_different_registers(dst, x, y, z, t0, t1);
5782 
5783     if (round < 20) {
5784       // (x & y) ^ (~x & z)
5785       __ andr(t0, x, y);
5786       __ andn(dst, z, x);
5787       __ xorr(dst, dst, t0);
5788     } else if (round >= 40 && round < 60) {
5789       // (x & y) ^ (x & z) ^ (y & z)
5790       __ andr(t0, x, y);
5791       __ andr(t1, x, z);
5792       __ andr(dst, y, z);
5793       __ xorr(dst, dst, t0);
5794       __ xorr(dst, dst, t1);
5795     } else {
5796       // x ^ y ^ z
5797       __ xorr(dst, x, y);
5798       __ xorr(dst, dst, z);
5799     }
5800   }
5801 
5802   // T = ROTL'5(a) + f't(b, c, d) + e + K't + W't
5803   // e = d
5804   // d = c
5805   // c = ROTL'30(b)
5806   // b = a
5807   // a = T
5808   void sha1_process_round(Register a, Register b, Register c, Register d, Register e,
5809                           Register cur_k, Register cur_w, Register tmp, int round) {
5810     assert(round >= 0 && round < 80, "must be");
5811     assert_different_registers(a, b, c, d, e, cur_w, cur_k, tmp, t0);
5812 
5813     // T = ROTL'5(a) + f't(b, c, d) + e + K't + W't
5814 
5815     // cur_w will be recalculated at the beginning of each round,
5816     // so, we can reuse it as a temp register here.
5817     Register tmp2 = cur_w;
5818 
5819     // reuse e as a temporary register, as we will mv new value into it later
5820     Register tmp3 = e;
5821     __ add(tmp2, cur_k, tmp2);
5822     __ add(tmp3, tmp3, tmp2);
5823     __ rolw(tmp2, a, 5, t0);
5824 
5825     sha1_f(tmp, b, c, d, round);
5826 
5827     __ add(tmp2, tmp2, tmp);
5828     __ add(tmp2, tmp2, tmp3);
5829 
5830     // e = d
5831     // d = c
5832     // c = ROTL'30(b)
5833     // b = a
5834     // a = T
5835     __ mv(e, d);
5836     __ mv(d, c);
5837 
5838     __ rolw(c, b, 30);
5839     __ mv(b, a);
5840     __ mv(a, tmp2);
5841   }
5842 
5843   // H(i)0 = a + H(i-1)0
5844   // H(i)1 = b + H(i-1)1
5845   // H(i)2 = c + H(i-1)2
5846   // H(i)3 = d + H(i-1)3
5847   // H(i)4 = e + H(i-1)4
5848   void sha1_calculate_im_hash(Register a, Register b, Register c, Register d, Register e,
5849                               Register prev_ab, Register prev_cd, Register prev_e) {
5850     assert_different_registers(a, b, c, d, e, prev_ab, prev_cd, prev_e);
5851 
5852     __ add(a, a, prev_ab);
5853     __ srli(prev_ab, prev_ab, 32);
5854     __ add(b, b, prev_ab);
5855 
5856     __ add(c, c, prev_cd);
5857     __ srli(prev_cd, prev_cd, 32);
5858     __ add(d, d, prev_cd);
5859 
5860     __ add(e, e, prev_e);
5861   }
5862 
5863   void sha1_preserve_prev_abcde(Register a, Register b, Register c, Register d, Register e,
5864                                 Register prev_ab, Register prev_cd, Register prev_e) {
5865     assert_different_registers(a, b, c, d, e, prev_ab, prev_cd, prev_e, t0);
5866 
5867     __ slli(t0, b, 32);
5868     __ zext(prev_ab, a, 32);
5869     __ orr(prev_ab, prev_ab, t0);
5870 
5871     __ slli(t0, d, 32);
5872     __ zext(prev_cd, c, 32);
5873     __ orr(prev_cd, prev_cd, t0);
5874 
5875     __ mv(prev_e, e);
5876   }
5877 
5878   // Intrinsic for:
5879   //   void sun.security.provider.SHA.implCompress0(byte[] buf, int ofs)
5880   //   void sun.security.provider.DigestBase.implCompressMultiBlock0(byte[] b, int ofs, int limit)
5881   //
5882   // Arguments:
5883   //
5884   // Inputs:
5885   //   c_rarg0: byte[]  src array + offset
5886   //   c_rarg1: int[]   SHA.state
5887   //   - - - - - - below are only for implCompressMultiBlock0 - - - - - -
5888   //   c_rarg2: int     offset
5889   //   c_rarg3: int     limit
5890   //
5891   // Outputs:
5892   //   - - - - - - below are only for implCompressMultiBlock0 - - - - - -
5893   //   c_rarg0: int offset, when (multi_block == true)
5894   //
  address generate_sha1_implCompress(StubId stub_id) {
      // The stub id selects between the single-block (implCompress0) and the
      // multi-block (implCompressMultiBlock0) flavour of the generated code.
      bool multi_block;
      switch (stub_id) {
      case StubId::stubgen_sha1_implCompress_id:
        multi_block = false;
        break;
      case StubId::stubgen_sha1_implCompressMB_id:
        multi_block = true;
        break;
      default:
        ShouldNotReachHere();
      };
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, stub_id);

    address start = __ pc();
    __ enter();

    // x18 - x27 are used below (ws[3] ~ ws[8], cur_k, cur_w, prev_ab, prev_cd),
    // so they are pushed here and popped again before returning.
    RegSet saved_regs = RegSet::range(x18, x27);
    if (multi_block) {
      // use x9 as src below.
      saved_regs += RegSet::of(x9);
    }
    __ push_reg(saved_regs, sp);

    // c_rarg0 - c_rarg3: x10 - x13
    Register buf    = c_rarg0;
    Register state  = c_rarg1;
    Register offset = c_rarg2;
    Register limit  = c_rarg3;
    // use src to contain the original start point of the array.
    Register src    = x9;

    if (multi_block) {
      // buf already points at (array + offset), so:
      //   limit = buf + (limit - offset)   -- limit turned into an address
      //   src   = buf - offset             -- start of the array
      // After this, offset (c_rarg2) is free and gets reused as prev_e below.
      __ sub(limit, limit, offset);
      __ add(limit, limit, buf);
      __ sub(src, buf, offset);
    }

    // [args-reg]:  x14 - x17
    // [temp-reg]:  x28 - x31
    // [saved-reg]: x18 - x27

    // h0/1/2/3/4
    const Register a = x14, b = x15, c = x16, d = x17, e = x28;
    // w0, w1, ... w15
    // put two adjacent w's in one register:
    //    one at high word part, another at low word part
    // at different round (even or odd), w't value reside in different items in ws[].
    // w0 ~ w15, either reside in
    //    ws[0] ~ ws[7], where
    //      w0 at higher 32 bits of ws[0],
    //      w1 at lower 32 bits of ws[0],
    //      ...
    //      w14 at higher 32 bits of ws[7],
    //      w15 at lower 32 bits of ws[7].
    // or, reside in
    //    w0:       ws[0]'s lower 32 bits
    //    w1 ~ w14: ws[1] ~ ws[7]
    //    w15:      ws[8]'s higher 32 bits
    Register ws[9] = {x29, x30, x31, x18,
                      x19, x20, x21, x22,
                      x23}; // auxiliary register for calculating w's value
    // current k't's value
    const Register cur_k = x24;
    // current w't's value
    const Register cur_w = x25;
    // values of a, b, c, d, e in the previous round
    const Register prev_ab = x26, prev_cd = x27;
    const Register prev_e = offset; // reuse offset/c_rarg2

    // load 5 words state into a, b, c, d, e.
    //
    // To minimize the number of memory operations, we apply following
    // optimization: read the states (a/b/c/d) of 4-byte values in pairs,
    // with a single ld, and split them into 2 registers.
    //
    // And, as the core algorithm of SHA-1 works on 32-bits words, so
    // in the following code, it does not care about the content of
    // higher 32-bits in a/b/c/d/e. Based on this observation,
    // we can apply further optimization, which is to just ignore the
    // higher 32-bits in a/c/e, rather than set the higher
    // 32-bits of a/c/e to zero explicitly with extra instructions.
    __ ld(a, Address(state, 0));
    __ srli(b, a, 32);
    __ ld(c, Address(state, 8));
    __ srli(d, c, 32);
    __ lw(e, Address(state, 16));

    // Only the multi-block flavour loops; the label is bound once per stub and
    // each iteration processes one 64-byte block.
    Label L_sha1_loop;
    if (multi_block) {
      __ BIND(L_sha1_loop);
    }

    // Remember the hash value going into this block; it is added back after
    // the 80 rounds.
    sha1_preserve_prev_abcde(a, b, c, d, e, prev_ab, prev_cd, prev_e);

    // The 80 rounds are fully unrolled at stub-generation time. Note that
    // sha1_prepare_w also permutes the ws[] array (generator-side state) as
    // it goes.
    for (int round = 0; round < 80; round++) {
      // prepare K't value
      sha1_prepare_k(cur_k, round);

      // prepare W't value
      sha1_prepare_w(cur_w, ws, buf, round);

      // one round process
      sha1_process_round(a, b, c, d, e, cur_k, cur_w, t2, round);
    }

    // compute the intermediate hash value
    sha1_calculate_im_hash(a, b, c, d, e, prev_ab, prev_cd, prev_e);

    if (multi_block) {
      // Advance to the next block (16 words of 4 bytes) and keep looping while
      // the advanced buf has not gone past limit.
      int64_t block_bytes = 16 * 4;
      __ addi(buf, buf, block_bytes);

      // NOTE(review): is_far is presumably needed because the unrolled loop
      // body is too large for a short conditional branch -- confirm.
      __ bge(limit, buf, L_sha1_loop, /* is_far */ true);
    }

    // store back the state.
    // a/b and c/d are re-packed into 64-bit values (low word zero-extended,
    // high word shifted up) so that each pair is written with a single sd.
    __ zext(a, a, 32);
    __ slli(b, b, 32);
    __ orr(a, a, b);
    __ sd(a, Address(state, 0));
    __ zext(c, c, 32);
    __ slli(d, d, 32);
    __ orr(c, c, d);
    __ sd(c, Address(state, 8));
    __ sw(e, Address(state, 16));

    // return offset
    if (multi_block) {
      // buf - src is the distance of the advanced buf from the array start.
      __ sub(c_rarg0, buf, src);
    }

    __ pop_reg(saved_regs, sp);

    __ leave();
    __ ret();

    return (address) start;
  }
6035 
6036   /**
6037    * vector registers:
   *   input VectorRegister's:  inputV1-V3, for m2 they could be v2, v4, v6, for m1 they could be v1, v2, v3
6039    *   index VectorRegister's:  idxV1-V4, for m2 they could be v8, v10, v12, v14, for m1 they could be v4, v5, v6, v7
6040    *   output VectorRegister's: outputV1-V4, for m2 they could be v16, v18, v20, v22, for m1 they could be v8, v9, v10, v11
6041    *
6042    * NOTE: each field will occupy a vector register group
6043    */
  void base64_vector_encode_round(Register src, Register dst, Register codec,
                    Register size, Register stepSrc, Register stepDst,
                    VectorRegister inputV1, VectorRegister inputV2, VectorRegister inputV3,
                    VectorRegister idxV1, VectorRegister idxV2, VectorRegister idxV3, VectorRegister idxV4,
                    VectorRegister outputV1, VectorRegister outputV2, VectorRegister outputV3, VectorRegister outputV4,
                    Assembler::LMUL lmul) {
    // Emits one vector round of Base64 encoding: loads `size` 3-byte groups
    // from src, turns each group into four 6-bit indexes, looks the indexes up
    // in the table at codec, and stores the resulting 4-byte groups to dst.
    // src is advanced by stepSrc and dst by stepDst; inputV1 and inputV2 are
    // clobbered along the way.
    //
    // NOTE(review): the segmented load/store below name only inputV1/outputV1;
    // this presumably relies on inputV1..V3 and outputV1..V4 being consecutive
    // register groups, as they are at the call sites -- confirm.

    // set vector register type/len
    __ vsetvli(x0, size, Assembler::e8, lmul);

    // segmented load src into v registers: mem(src) => vr(3)
    __ vlseg3e8_v(inputV1, src);

    // src = src + register_group_len_bytes * 3
    __ add(src, src, stepSrc);

    // encoding
    //   1. compute index into lookup table: vr(3) => vr(4)
    //   All shifts operate on 8-bit elements, so bits shifted out are dropped.

    //   idxV1 = byte0[7:2]
    __ vsrl_vi(idxV1, inputV1, 2);

    //   idxV2 = byte0[1:0] : byte1[7:4]
    //   (byte1 >> 2) leaves room for byte0's low 2 bits on top, then the
    //   combined value is shifted down by 2 again.
    __ vsrl_vi(idxV2, inputV2, 2);
    __ vsll_vi(inputV1, inputV1, 6);
    __ vor_vv(idxV2, idxV2, inputV1);
    __ vsrl_vi(idxV2, idxV2, 2);

    //   idxV3 = byte1[3:0] : byte2[7:6]
    __ vsrl_vi(idxV3, inputV3, 4);
    __ vsll_vi(inputV2, inputV2, 4);
    __ vor_vv(idxV3, inputV2, idxV3);
    __ vsrl_vi(idxV3, idxV3, 2);

    //   idxV4 = byte2[5:0] (shift left then right to clear the top 2 bits)
    __ vsll_vi(idxV4, inputV3, 2);
    __ vsrl_vi(idxV4, idxV4, 2);

    //   2. indexed load: vr(4) => vr(4)
    __ vluxei8_v(outputV1, codec, idxV1);
    __ vluxei8_v(outputV2, codec, idxV2);
    __ vluxei8_v(outputV3, codec, idxV3);
    __ vluxei8_v(outputV4, codec, idxV4);

    // segmented store encoded data in v registers back to dst: vr(4) => mem(dst)
    __ vsseg4e8_v(outputV1, dst);

    // dst = dst + register_group_len_bytes * 4
    __ add(dst, dst, stepDst);
  }
6088 
6089   /**
6090    *  void j.u.Base64.Encoder.encodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL)
6091    *
6092    *  Input arguments:
6093    *  c_rarg0   - src, source array
6094    *  c_rarg1   - sp, src start offset
6095    *  c_rarg2   - sl, src end offset
6096    *  c_rarg3   - dst, dest array
6097    *  c_rarg4   - dp, dst start offset
6098    *  c_rarg5   - isURL, Base64 or URL character set
6099    */
  address generate_base64_encodeBlock() {
    // Lookup table for the standard alphabet: 6-bit value => output character.
    alignas(64) static const char toBase64[64] = {
      'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
      'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
      'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
      'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
      '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
    };

    // Lookup table for the URL alphabet: same as above except for the last
    // two entries ('-' and '_' instead of '+' and '/').
    alignas(64) static const char toBase64URL[64] = {
      'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
      'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
      'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
      'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
      '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
    };

    __ align(CodeEntryAlignment);
    StubId stub_id = StubId::stubgen_base64_encodeBlock_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();
    __ enter();

    Register src    = c_rarg0;
    Register soff   = c_rarg1;
    Register send   = c_rarg2;
    Register dst    = c_rarg3;
    Register doff   = c_rarg4;
    Register isURL  = c_rarg5;

    Register codec  = c_rarg6;
    Register length = c_rarg7; // total length of src data in bytes

    Label ProcessData, Exit;

    // length should be multiple of 3
    __ sub(length, send, soff);
    // real src/dst to process data
    __ add(src, src, soff);
    __ add(dst, dst, doff);

    // load the codec base address
    // (the standard table by default, replaced by the URL table when isURL != 0)
    __ la(codec, ExternalAddress((address) toBase64));
    __ beqz(isURL, ProcessData);
    __ la(codec, ExternalAddress((address) toBase64URL));
    __ BIND(ProcessData);

    // From here on soff, send, doff and isURL have been consumed, so their
    // registers are reused as temporaries by both the vector and scalar paths.

    // vector version
    if (UseRVV) {
      Label ProcessM2, ProcessM1, ProcessScalar;

      Register size      = soff;
      Register stepSrcM1 = send;
      Register stepSrcM2 = doff;
      Register stepDst   = isURL;

      // Per-round quantities, initially set up for LMUL = m2:
      //   size      - number of 3-byte groups handled per round
      //   stepSrcM1 - source bytes consumed by one m1 round
      //   stepSrcM2 - source bytes consumed by one m2 round (2 * stepSrcM1)
      //   stepDst   - destination bytes produced by one m2 round
      __ mv(size, MaxVectorSize * 2);
      __ mv(stepSrcM1, MaxVectorSize * 3);
      __ slli(stepSrcM2, stepSrcM1, 1);
      __ mv(stepDst, MaxVectorSize * 2 * 4);

      // not enough data for even one m2 round
      __ blt(length, stepSrcM2, ProcessM1);

      // m2 loop: repeat while at least one full m2 round of data is left
      __ BIND(ProcessM2);
      base64_vector_encode_round(src, dst, codec,
                    size, stepSrcM2, stepDst,
                    v2, v4, v6,         // inputs
                    v8, v10, v12, v14,  // indexes
                    v16, v18, v20, v22, // outputs
                    Assembler::m2);

      __ sub(length, length, stepSrcM2);
      __ bge(length, stepSrcM2, ProcessM2);

      // at most one m1 round for what is left
      __ BIND(ProcessM1);
      __ blt(length, stepSrcM1, ProcessScalar);

      // halve the m2 quantities to get the m1 ones
      __ srli(size, size, 1);
      __ srli(stepDst, stepDst, 1);
      base64_vector_encode_round(src, dst, codec,
                    size, stepSrcM1, stepDst,
                    v1, v2, v3,         // inputs
                    v4, v5, v6, v7,     // indexes
                    v8, v9, v10, v11,   // outputs
                    Assembler::m1);
      __ sub(length, length, stepSrcM1);

      __ BIND(ProcessScalar);
    }

    // scalar version
    // Handles whatever the vector code left over (or everything when UseRVV is
    // off), 3 source bytes => 4 destination bytes per iteration.
    {
      Register byte1 = soff, byte0 = send, byte2 = doff;
      Register combined24Bits = isURL;

      // nothing (left) to encode
      __ beqz(length, Exit);

      Label ScalarLoop;
      __ BIND(ScalarLoop);
      {
        // plain:   [byte0[7:0] : byte1[7:0] : byte2[7:0]] =>
        // encoded: [byte0[7:2] : byte0[1:0]+byte1[7:4] : byte1[3:0]+byte2[7:6] : byte2[5:0]]

        // load 3 bytes src data
        __ lbu(byte0, Address(src, 0));
        __ lbu(byte1, Address(src, 1));
        __ lbu(byte2, Address(src, 2));
        __ addi(src, src, 3);

        // construct 24 bits from 3 bytes
        __ slliw(byte0, byte0, 16);
        __ slliw(byte1, byte1, 8);
        __ orr(combined24Bits, byte0, byte1);
        __ orr(combined24Bits, combined24Bits, byte2);

        // get codec index and encode(ie. load from codec by index)
        // Each 6-bit field is isolated with a 32-bit shift-left/shift-right
        // pair and then replaced by the table entry it selects.

        // bits [23:18]
        __ slliw(byte0, combined24Bits, 8);
        __ srliw(byte0, byte0, 26);
        __ add(byte0, codec, byte0);
        __ lbu(byte0, byte0);

        // bits [17:12]
        __ slliw(byte1, combined24Bits, 14);
        __ srliw(byte1, byte1, 26);
        __ add(byte1, codec, byte1);
        __ lbu(byte1, byte1);

        // bits [11:6]
        __ slliw(byte2, combined24Bits, 20);
        __ srliw(byte2, byte2, 26);
        __ add(byte2, codec, byte2);
        __ lbu(byte2, byte2);

        // bits [5:0]
        __ andi(combined24Bits, combined24Bits, 0x3f);
        __ add(combined24Bits, codec, combined24Bits);
        __ lbu(combined24Bits, combined24Bits);

        // store 4 bytes encoded data
        __ sb(byte0, Address(dst, 0));
        __ sb(byte1, Address(dst, 1));
        __ sb(byte2, Address(dst, 2));
        __ sb(combined24Bits, Address(dst, 3));

        __ subi(length, length, 3);
        __ addi(dst, dst, 4);
        // loop back
        __ bnez(length, ScalarLoop);
      }
    }

    __ BIND(Exit);

    __ leave();
    __ ret();

    return (address) start;
  }
6255 
6256   /**
6257    * vector registers:
   * input VectorRegister's:  inputV1-V4, for m2 they could be v2, v4, v6, v8, for m1 they could be v1, v2, v3, v4
   * index VectorRegister's:  idxV1-V4, for m2 they could be v10, v12, v14, v16, for m1 they could be v5, v6, v7, v8
   * output VectorRegister's: outputV1-V3, for m2 they could be v18, v20, v22, for m1 they could be v9, v10, v11
6261    *
6262    * NOTE: each field will occupy a single vector register group
6263    */
  void base64_vector_decode_round(Register src, Register dst, Register codec,
                    Register size, Register stepSrc, Register stepDst, Register failedIdx,
                    VectorRegister inputV1, VectorRegister inputV2, VectorRegister inputV3, VectorRegister inputV4,
                    VectorRegister idxV1, VectorRegister idxV2, VectorRegister idxV3, VectorRegister idxV4,
                    VectorRegister outputV1, VectorRegister outputV2, VectorRegister outputV3,
                    Assembler::LMUL lmul) {
    // Emits one vector round of Base64 decoding: loads `size` 4-byte groups
    // from src, maps every byte through the table at codec, checks the mapped
    // values for the failure marker (-1), and stores the decoded 3-byte groups
    // to dst.
    //
    // On return failedIdx is negative when the whole round decoded cleanly,
    // otherwise it holds the index of the first failing group. In the failure
    // case only the groups before that index are stored, stepDst is
    // overwritten with failedIdx * 3, and dst is advanced by that amount (not
    // at all when failedIdx is 0). src is always advanced by stepSrc.
    // v0 is used as the mask register; idxV1 ~ idxV3 are clobbered.
    //
    // NOTE(review): the segmented load/store below name only inputV1/outputV1;
    // this presumably relies on inputV1..V4 and outputV1..V3 being consecutive
    // register groups, as they are at the call sites -- confirm.

    // set vector register type/len
    __ vsetvli(x0, size, Assembler::e8, lmul, Assembler::ma, Assembler::ta);

    // segmented load src into v registers: mem(src) => vr(4)
    __ vlseg4e8_v(inputV1, src);

    // src = src + register_group_len_bytes * 4
    __ add(src, src, stepSrc);

    // decoding
    //   1. indexed load: vr(4) => vr(4)
    __ vluxei8_v(idxV1, codec, inputV1);
    __ vluxei8_v(idxV2, codec, inputV2);
    __ vluxei8_v(idxV3, codec, inputV3);
    __ vluxei8_v(idxV4, codec, inputV4);

    //   2. check wrong data
    //   OR the four mapped values of every group together and look for the
    //   first group whose OR equals -1 (all bits set).
    __ vor_vv(outputV1, idxV1, idxV2);
    __ vor_vv(outputV2, idxV3, idxV4);
    __ vor_vv(outputV1, outputV1, outputV2);
    __ vmseq_vi(v0, outputV1, -1);
    __ vfirst_m(failedIdx, v0);
    Label NoFailure, FailureAtIdx0;
    // valid value can only be -1 when < 0
    __ bltz(failedIdx, NoFailure);
    // when the first data (at index 0) fails, no need to process data anymore
    __ beqz(failedIdx, FailureAtIdx0);
    // failure at index > 0: shrink the vector length to the groups before the
    // failing one, and make stepDst match: stepDst = failedIdx * 3
    __ vsetvli(x0, failedIdx, Assembler::e8, lmul, Assembler::mu, Assembler::tu);
    __ slli(stepDst, failedIdx, 1);
    __ add(stepDst, failedIdx, stepDst);
    __ BIND(NoFailure);

    //   3. compute the decoded data: vr(4) => vr(3)
    //   All shifts operate on 8-bit elements, so bits shifted out are dropped.

    //   out0 = (v0 << 2) | (v1 >> 4)
    __ vsll_vi(idxV1, idxV1, 2);
    __ vsrl_vi(outputV1, idxV2, 4);
    __ vor_vv(outputV1, outputV1, idxV1);

    //   out1 = (v1 << 4) | (v2 >> 2)
    __ vsll_vi(idxV2, idxV2, 4);
    __ vsrl_vi(outputV2, idxV3, 2);
    __ vor_vv(outputV2, outputV2, idxV2);

    //   out2 = (v2 << 6) | v3
    __ vsll_vi(idxV3, idxV3, 6);
    __ vor_vv(outputV3, idxV4, idxV3);

    // segmented store decoded data in v registers back to dst: vr(3) => mem(dst)
    __ vsseg3e8_v(outputV1, dst);

    // dst = dst + register_group_len_bytes * 3
    // (or + failedIdx * 3 when a failure was found above)
    __ add(dst, dst, stepDst);
    __ BIND(FailureAtIdx0);
  }
6321 
6322   /**
6323    * int j.u.Base64.Decoder.decodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL, boolean isMIME)
6324    *
6325    *  Input arguments:
6326    *  c_rarg0   - src, source array
6327    *  c_rarg1   - sp, src start offset
6328    *  c_rarg2   - sl, src end offset
6329    *  c_rarg3   - dst, dest array
6330    *  c_rarg4   - dp, dst start offset
6331    *  c_rarg5   - isURL, Base64 or URL character set
6332    *  c_rarg6   - isMIME, Decoding MIME block
6333    */
  address generate_base64_decodeBlock() {

    // Lookup table for the standard alphabet: input byte => 6-bit value.
    // 255u marks a byte that is not in the alphabet; the scalar loop below
    // reads entries with lb, so the marker shows up there as a negative value,
    // and the vector round compares against -1.
    static const uint8_t fromBase64[256] = {
        255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
        255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
        255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
        255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u, 255u,
        255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
        255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
        255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
        255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
        255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
        255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
        255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
        255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
        255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
    };

    // Lookup table for the URL alphabet: same layout as above, but 62 and 63
    // are produced by '-' and '_' instead of '+' and '/'.
    static const uint8_t fromBase64URL[256] = {
        255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
        255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
        255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
        255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,  63u,
        255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
        255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
        255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
        255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
        255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
        255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
        255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
        255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
        255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
    };

    __ align(CodeEntryAlignment);
    StubId stub_id = StubId::stubgen_base64_decodeBlock_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();
    __ enter();

    Register src    = c_rarg0;
    Register soff   = c_rarg1;
    Register send   = c_rarg2;
    Register dst    = c_rarg3;
    Register doff   = c_rarg4;
    Register isURL  = c_rarg5;
    Register isMIME = c_rarg6;

    Register codec     = c_rarg7;
    Register dstBackup = t6;
    Register length    = t3;     // total length of src data in bytes

    Label ProcessData, Exit;
    Label ProcessScalar, ScalarLoop;

    // passed in length (send - soff) is guaranteed to be > 4,
    // and in this intrinsic we only process data of length in multiple of 4,
    // it's not guaranteed to be multiple of 4 by java level, so do it explicitly
    __ sub(length, send, soff);
    __ andi(length, length, -4);
    // real src/dst to process data
    __ add(src, src, soff);
    __ add(dst, dst, doff);
    // backup of dst, used to calculate the return value at exit
    __ mv(dstBackup, dst);

    // load the codec base address
    // (the standard table by default, replaced by the URL table when isURL != 0)
    __ la(codec, ExternalAddress((address) fromBase64));
    __ beqz(isURL, ProcessData);
    __ la(codec, ExternalAddress((address) fromBase64URL));
    __ BIND(ProcessData);

    // From here on soff, send, doff and isURL have been consumed, so their
    // registers are reused as temporaries by both the vector and scalar paths.

    // vector version
    if (UseRVV) {
      // for MIME case, it has a default length limit of 76 which could be
      // different(smaller) from (send - soff), so in MIME case, we go through
      // the scalar code path directly.
      __ bnez(isMIME, ScalarLoop);

      Label ProcessM1, ProcessM2;

      Register failedIdx = soff;
      Register stepSrcM1 = send;
      Register stepSrcM2 = doff;
      Register stepDst   = isURL;
      Register size      = t4;

      // Per-round quantities, initially set up for LMUL = m2:
      //   size      - number of 4-byte groups handled per round
      //   stepSrcM1 - source bytes consumed by one m1 round
      //   stepSrcM2 - source bytes consumed by one m2 round (2 * stepSrcM1)
      //   stepDst   - destination bytes produced by one m2 round
      __ mv(size, MaxVectorSize * 2);
      __ mv(stepSrcM1, MaxVectorSize * 4);
      __ slli(stepSrcM2, stepSrcM1, 1);
      __ mv(stepDst, MaxVectorSize * 2 * 3);

      // not enough data for even one m2 round
      __ blt(length, stepSrcM2, ProcessM1);


      // Assembler::m2
      // m2 loop: repeat while at least one full m2 round of data is left
      __ BIND(ProcessM2);
      base64_vector_decode_round(src, dst, codec,
                    size, stepSrcM2, stepDst, failedIdx,
                    v2, v4, v6, v8,      // inputs
                    v10, v12, v14, v16,  // indexes
                    v18, v20, v22,       // outputs
                    Assembler::m2);
      __ sub(length, length, stepSrcM2);

      // error check
      // valid value of failedIdx can only be -1 when < 0
      // (on failure the round has already advanced dst past the good data only,
      //  so Exit returns the number of bytes decoded up to the failure)
      __ bgez(failedIdx, Exit);

      __ bge(length, stepSrcM2, ProcessM2);


      // Assembler::m1
      // at most one m1 round for what is left
      __ BIND(ProcessM1);
      __ blt(length, stepSrcM1, ProcessScalar);

      // halve the m2 quantities to get the m1 ones
      __ srli(size, size, 1);
      __ srli(stepDst, stepDst, 1);
      base64_vector_decode_round(src, dst, codec,
                    size, stepSrcM1, stepDst, failedIdx,
                    v1, v2, v3, v4,      // inputs
                    v5, v6, v7, v8,      // indexes
                    v9, v10, v11,        // outputs
                    Assembler::m1);
      __ sub(length, length, stepSrcM1);

      // error check
      // valid value of failedIdx can only be -1 when < 0
      __ bgez(failedIdx, Exit);

      __ BIND(ProcessScalar);
      // the vector code may have consumed everything
      __ beqz(length, Exit);
    }

    // scalar version
    // Handles whatever the vector code left over (or everything for MIME or
    // when UseRVV is off), 4 source bytes => 3 destination bytes per iteration.
    {
      Register byte0 = soff, byte1 = send, byte2 = doff, byte3 = isURL;
      Register combined32Bits = t4;

      // encoded:   [byte0[5:0] : byte1[5:0] : byte2[5:0] : byte3[5:0]] =>
      // plain:     [byte0[5:0]+byte1[5:4] : byte1[3:0]+byte2[5:2] : byte2[1:0]+byte3[5:0]]
      __ BIND(ScalarLoop);

      // load 4 bytes encoded src data
      __ lbu(byte0, Address(src, 0));
      __ lbu(byte1, Address(src, 1));
      __ lbu(byte2, Address(src, 2));
      __ lbu(byte3, Address(src, 3));
      __ addi(src, src, 4);

      // get codec index and decode (ie. load from codec by index)
      // The table entries are read with lb, so the 255u failure marker becomes
      // a negative value that survives the shifts and ORs below.
      __ add(byte0, codec, byte0);
      __ add(byte1, codec, byte1);
      __ lb(byte0, Address(byte0, 0));
      __ lb(byte1, Address(byte1, 0));
      __ add(byte2, codec, byte2);
      __ add(byte3, codec, byte3);
      __ lb(byte2, Address(byte2, 0));
      __ lb(byte3, Address(byte3, 0));
      // combine: (v0 << 18) | (v1 << 12) | (v2 << 6) | v3
      __ slliw(byte0, byte0, 18);
      __ slliw(byte1, byte1, 12);
      __ orr(byte0, byte0, byte1);
      __ orr(byte0, byte0, byte3);
      __ slliw(byte2, byte2, 6);
      // For performance consideration, `combined32Bits` is constructed for 2 purposes at the same time,
      //  1. error check below
      //  2. decode below
      __ orr(combined32Bits, byte0, byte2);

      // error check
      // (negative means at least one of the four bytes hit the failure marker)
      __ bltz(combined32Bits, Exit);

      // store 3 bytes decoded data
      // (sb writes only the low 8 bits of each register)
      __ sraiw(byte0, combined32Bits, 16);
      __ sraiw(byte1, combined32Bits, 8);
      __ sb(byte0, Address(dst, 0));
      __ sb(byte1, Address(dst, 1));
      __ sb(combined32Bits, Address(dst, 2));

      __ subi(length, length, 4);
      __ addi(dst, dst, 3);
      // loop back
      __ bnez(length, ScalarLoop);
    }

    __ BIND(Exit);
    // return value: number of bytes written to dst
    __ sub(c_rarg0, dst, dstBackup);

    __ leave();
    __ ret();

    return (address) start;
  }
6533 
  // Emits code that folds `step` consecutive bytes at `buff` into the running
  // Adler-32 sums s1 and s2 using RVV, then advances `buff` by `step`.
  //
  //   buff            - in/out: address of the next bytes to consume
  //   s1, s2          - in/out: running sums (no modulo reduction is emitted here)
  //   vtable          - in: coefficient vector for this step; the derivation below
  //                     expects { step, step - 1, ..., 2, 1 }
  //   vzero           - in: second source operand of both reduction sums
  //   vbytes          - clobbered: receives the loaded bytes
  //   vs1acc, vs2acc  - clobbered: byte sum and widened byte * coefficient products
  //   vtemp1          - clobbered: reduction result used for s2
  //   temp0, temp1    - clobbered
  //   temp3           - clobbered: still holds `step` on exit
  //   temp2, vtemp2   - not referenced by this helper
  //   step, lmul      - must be one of the pairs (64, m4), (32, m2), (16, m1)
  //
  // On exit the vector configuration is the e32/m1 setting selected by the
  // last vsetvli emitted below.
  void adler32_process_bytes(Register buff, Register s1, Register s2, VectorRegister vtable,
    VectorRegister vzero, VectorRegister vbytes, VectorRegister vs1acc, VectorRegister vs2acc,
    Register temp0, Register temp1, Register temp2,  Register temp3,
    VectorRegister vtemp1, VectorRegister vtemp2, int step, Assembler::LMUL lmul) {

    assert((lmul == Assembler::m4 && step == 64) ||
           (lmul == Assembler::m2 && step == 32) ||
           (lmul == Assembler::m1 && step == 16),
           "LMUL should be aligned with step: m4 and 64, m2 and 32 or m1 and 16");
    // Below is function for calculating Adler32 checksum with 64-, 32- or 16-byte step. LMUL=m4, m2 or m1 is used.
    // The results are in v12, v13, ..., v22, v23. Example below is for 64-byte step case.
    // We use b1, b2, ..., b64 to denote the 64 bytes loaded in each iteration.
    // In non-vectorized code, we update s1 and s2 as:
    //   s1 <- s1 + b1
    //   s2 <- s2 + s1
    //   s1 <- s1 + b2
    //   s2 <- s2 + s1
    //   ...
    //   s1 <- s1 + b64
    //   s2 <- s2 + s1
    // Putting above assignments together, we have:
    //   s1_new = s1 + b1 + b2 + ... + b64
    //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b64) =
    //          = s2 + s1 * 64 + (b1 * 64 + b2 * 63 + ... + b64 * 1) =
    //          = s2 + s1 * 64 + (b1, b2, ... b64) dot (64, 63, ... 1)

    // temp3 keeps the requested vector length (step) for every vsetvli below.
    __ mv(temp3, step);
    // Load data
    __ vsetvli(temp0, temp3, Assembler::e8, lmul);
    __ vle8_v(vbytes, buff);
    __ addi(buff, buff, step);

    // Upper bound reduction sum for s1_new:
    // 0xFF * 64 = 0x3FC0, so:
    // 1. Need to do vector-widening reduction sum
    // 2. It is safe to perform sign-extension during vmv.x.s with 16-bits elements
    __ vwredsumu_vs(vs1acc, vbytes, vzero);
    // Multiplication for s2_new
    __ vwmulu_vv(vs2acc, vtable, vbytes);

    // s2 = s2 + s1 * step (step is a power of two, so shift left by log2(step)).
    // This must use the old s1, i.e. it has to precede the s1 update below.
    __ slli(temp1, s1, exact_log2(step));
    __ add(s2, s2, temp1);

    // Summing up calculated results for s2_new
    if (MaxVectorSize > 16) {
      __ vsetvli(temp0, temp3, Assembler::e16, lmul);
    } else {
      // Half of vector-widening multiplication result is in successor of vs2acc
      // group for vlen == 16, in which case we need to double vector register
      // group width in order to reduction sum all of them
      Assembler::LMUL lmulx2 = (lmul == Assembler::m1) ? Assembler::m2 :
                               (lmul == Assembler::m2) ? Assembler::m4 : Assembler::m8;
      __ vsetvli(temp0, temp3, Assembler::e16, lmulx2);
    }
    // Upper bound for reduction sum:
    // 0xFF * (64 + 63 + ... + 2 + 1) = 0x817E0 max for whole register group, so:
    // 1. Need to do vector-widening reduction sum
    // 2. It is safe to perform sign-extension during vmv.x.s with 32-bits elements
    __ vwredsumu_vs(vtemp1, vs2acc, vzero);

    // Extracting results for:
    // s1_new
    __ vmv_x_s(temp0, vs1acc);
    __ add(s1, s1, temp0);
    // s2_new
    __ vsetvli(temp0, temp3, Assembler::e32, Assembler::m1);
    __ vmv_x_s(temp1, vtemp1);
    __ add(s2, s2, temp1);
  }
6604 
6605   /***
6606    *  int java.util.zip.Adler32.updateBytes(int adler, byte[] b, int off, int len)
6607    *
6608    *  Arguments:
6609    *
6610    *  Inputs:
6611    *   c_rarg0   - int   adler
6612    *   c_rarg1   - byte* buff (b + off)
6613    *   c_rarg2   - int   len
6614    *
6615    *  Output:
6616    *   c_rarg0   - int adler result
6617    */
  // Generates the Adler32.updateBytes stub. The input is consumed in chunks of
  // NMAX bytes (64-byte vector steps plus one 32-byte and one 16-byte step),
  // with s1 and s2 reduced modulo BASE after each chunk; the tail is handled
  // with 64-byte and 16-byte vector steps followed by a byte-at-a-time loop.
  address generate_updateBytesAdler32() {
    __ align(CodeEntryAlignment);
    StubId stub_id = StubId::stubgen_updateBytesAdler32_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    Label L_nmax, L_nmax_loop, L_nmax_loop_entry, L_by16, L_by16_loop,
      L_by16_loop_unroll, L_by1_loop, L_do_mod, L_combine, L_by1;

    // Aliases
    // Note: adler and s1 share c_rarg0.
    Register adler  = c_rarg0;
    Register s1     = c_rarg0;
    Register s2     = c_rarg3;
    Register buff   = c_rarg1;
    Register len    = c_rarg2;
    Register nmax  = c_rarg4;
    Register base  = c_rarg5;
    Register count = c_rarg6;
    Register temp0 = t3;
    Register temp1 = t4;
    Register temp2 = t5;
    Register temp3 = t6;

    VectorRegister vzero = v31;
    VectorRegister vbytes = v8; // group: v8, v9, v10, v11
    VectorRegister vs1acc = v12; // group: v12, v13, v14, v15
    VectorRegister vs2acc = v16; // group: v16, v17, v18, v19, v20, v21, v22, v23
    VectorRegister vtable_64 = v24; // group: v24, v25, v26, v27
    VectorRegister vtable_32 = v4; // group: v4, v5
    VectorRegister vtable_16 = v30;
    VectorRegister vtemp1 = v28;
    VectorRegister vtemp2 = v29;

    // Max number of bytes we can process before having to take the mod
    // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
    const uint64_t BASE = 0xfff1;
    const uint64_t NMAX = 0x15B0;

    // Loops steps
    int step_64 = 64;
    int step_32 = 32;
    int step_16 = 16;
    int step_1  = 1;

    __ enter(); // Required for proper stackwalking of RuntimeStub frame
    __ mv(temp1, 64);
    __ vsetvli(temp0, temp1, Assembler::e8, Assembler::m4);

    // Generating accumulation coefficients for further calculations
    // vtable_64:
    __ vid_v(vtemp1);
    __ vrsub_vx(vtable_64, vtemp1, temp1);
    // vtable_64 group now contains { 0x40, 0x3f, 0x3e, ..., 0x3, 0x2, 0x1 }

    // vtable_32:
    __ mv(temp1, 32);
    __ vsetvli(temp0, temp1, Assembler::e8, Assembler::m2);
    __ vid_v(vtemp1);
    __ vrsub_vx(vtable_32, vtemp1, temp1);
    // vtable_32 group now contains { 0x20, 0x1f, 0x1e, ..., 0x3, 0x2, 0x1 }

    __ vsetivli(temp0, 16, Assembler::e8, Assembler::m1);
    // vtable_16:
    __ mv(temp1, 16);
    __ vid_v(vtemp1);
    __ vrsub_vx(vtable_16, vtemp1, temp1);
    // vtable_16 now contains { 0x10, 0xf, 0xe, ..., 0x3, 0x2, 0x1 }

    __ vmv_v_i(vzero, 0);

    __ mv(base, BASE);
    __ mv(nmax, NMAX);

    // s1 is initialized to the lower 16 bits of adler
    // s2 is initialized to the upper 16 bits of adler
    // s2 must be extracted first: writing s1 overwrites adler (same register).
    __ srliw(s2, adler, 16); // s2 = ((adler >> 16) & 0xffff)
    __ zext(s1, adler, 16); // s1 = (adler & 0xffff)

    // The pipelined loop needs at least 16 elements for 1 iteration
    // It does check this, but it is more effective to skip to the cleanup loop
    __ mv(temp0, step_16);
    __ bgeu(len, temp0, L_nmax);
    __ beqz(len, L_combine);

    // Jumping to L_by1_loop
    // L_by1_loop expects len to hold (remaining bytes - 1).
    __ subi(len, len, step_1);
    __ j(L_by1_loop);

  __ bind(L_nmax);
    // len = remaining - NMAX; negative means less than one full NMAX chunk is left.
    __ sub(len, len, nmax);
    __ subi(count, nmax, 16);
    __ bltz(len, L_by16);

  // Align L_nmax loop by 64
  // count = NMAX - 16 - 32 = 5504 = 86 * 64, so the 64-byte loop below runs a
  // whole number of iterations for each NMAX chunk.
  __ bind(L_nmax_loop_entry);
    __ subi(count, count, 32);

  __ bind(L_nmax_loop);
    adler32_process_bytes(buff, s1, s2, vtable_64, vzero,
      vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
      vtemp1, vtemp2, step_64, Assembler::m4);
    __ subi(count, count, step_64);
    __ bgtz(count, L_nmax_loop);

    // There are three iterations left to do
    // (48 bytes of the NMAX chunk: one 32-byte step and one 16-byte step)
    adler32_process_bytes(buff, s1, s2, vtable_32, vzero,
      vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
      vtemp1, vtemp2, step_32, Assembler::m2);
    adler32_process_bytes(buff, s1, s2, vtable_16, vzero,
      vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
      vtemp1, vtemp2, step_16, Assembler::m1);

    // s1 = s1 % BASE
    __ remuw(s1, s1, base);
    // s2 = s2 % BASE
    __ remuw(s2, s2, base);

    // Try the next NMAX chunk; count is reset for both the loop and L_by16.
    __ sub(len, len, nmax);
    __ subi(count, nmax, 16);
    __ bgez(len, L_nmax_loop_entry);

  __ bind(L_by16);
    // len = (remaining - NMAX) + (NMAX - 16) = remaining - 16
    __ add(len, len, count);
    __ bltz(len, L_by1);
    // Trying to unroll
    __ mv(temp3, step_64);
    __ blt(len, temp3, L_by16_loop);

  __ bind(L_by16_loop_unroll);
    adler32_process_bytes(buff, s1, s2, vtable_64, vzero,
      vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
      vtemp1, vtemp2, step_64, Assembler::m4);
    __ subi(len, len, step_64);
    // By now the temp3 should still be 64
    __ bge(len, temp3, L_by16_loop_unroll);

  __ bind(L_by16_loop);
    adler32_process_bytes(buff, s1, s2, vtable_16, vzero,
      vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
      vtemp1, vtemp2, step_16, Assembler::m1);
    __ subi(len, len, step_16);
    __ bgez(len, L_by16_loop);

  __ bind(L_by1);
    // len = (remaining - 16) + 15 = remaining - 1; negative means nothing is left.
    __ addi(len, len, 15);
    __ bltz(len, L_do_mod);

  __ bind(L_by1_loop);
    __ lbu(temp0, Address(buff, 0));
    __ addi(buff, buff, step_1);
    __ add(s1, temp0, s1);
    __ add(s2, s2, s1);
    __ subi(len, len, step_1);
    __ bgez(len, L_by1_loop);

  __ bind(L_do_mod);
    // s1 = s1 % BASE
    __ remuw(s1, s1, base);
    // s2 = s2 % BASE
    __ remuw(s2, s2, base);

    // Combine lower bits and higher bits
    // adler = s1 | (s2 << 16)
  __ bind(L_combine);
    __ slli(s2, s2, 16);
    __ orr(s1, s1, s2);

    __ leave(); // Required for proper stackwalking of RuntimeStub frame
    __ ret();

    return start;
  }
6790 
6791 #endif // COMPILER2
6792 
  // x10 = input (float16)
  // f10 = result (float)
  // t0, t1 = temporary registers
6796   address generate_float16ToFloat() {
6797     __ align(CodeEntryAlignment);
6798     StubId stub_id = StubId::stubgen_hf2f_id;
6799     StubCodeMark mark(this, stub_id);
6800     address entry = __ pc();
6801     BLOCK_COMMENT("float16ToFloat:");
6802 
6803     FloatRegister dst = f10;
6804     Register src = x10;
6805     Label NaN_SLOW;
6806 
6807     assert(VM_Version::supports_float16_float_conversion(), "must");
6808 
6809     // On riscv, NaN needs a special process as fcvt does not work in that case.
6810     // On riscv, Inf does not need a special process as fcvt can handle it correctly.
6811     // but we consider to get the slow path to process NaN and Inf at the same time,
6812     // as both of them are rare cases, and if we try to get the slow path to handle
6813     // only NaN case it would sacrifise the performance for normal cases,
6814     // i.e. non-NaN and non-Inf cases.
6815 
6816     // check whether it's a NaN or +/- Inf.
6817     __ mv(t0, 0x7c00);
6818     __ andr(t1, src, t0);
6819     // jump to stub processing NaN and Inf cases.
6820     __ beq(t0, t1, NaN_SLOW);
6821 
6822     // non-NaN or non-Inf cases, just use built-in instructions.
6823     __ fmv_h_x(dst, src);
6824     __ fcvt_s_h(dst, dst);
6825     __ ret();
6826 
6827     __ bind(NaN_SLOW);
6828     // following instructions mainly focus on NaN, as riscv does not handle
6829     // NaN well with fcvt, but the code also works for Inf at the same time.
6830 
6831     // construct a NaN in 32 bits from the NaN in 16 bits,
6832     // we need the payloads of non-canonical NaNs to be preserved.
6833     __ mv(t1, 0x7f800000);
6834     // sign-bit was already set via sign-extension if necessary.
6835     __ slli(t0, src, 13);
6836     __ orr(t1, t0, t1);
6837     __ fmv_w_x(dst, t1);
6838 
6839     __ ret();
6840     return entry;
6841   }
6842 
  // f10 = input (float)
  // x10 = result (float16)
  // f11 = temporary float register
  // t0, t1 = temporary registers
6847   address generate_floatToFloat16() {
6848     __ align(CodeEntryAlignment);
6849     StubId stub_id = StubId::stubgen_f2hf_id;
6850     StubCodeMark mark(this, stub_id);
6851     address entry = __ pc();
6852     BLOCK_COMMENT("floatToFloat16:");
6853 
6854     Register dst = x10;
6855     FloatRegister src = f10, ftmp = f11;
6856     Label NaN_SLOW;
6857 
6858     assert(VM_Version::supports_float16_float_conversion(), "must");
6859 
6860     // On riscv, NaN needs a special process as fcvt does not work in that case.
6861 
6862     // check whether it's a NaN.
6863     // replace fclass with feq as performance optimization.
6864     __ feq_s(t0, src, src);
6865     // jump to stub processing NaN cases.
6866     __ beqz(t0, NaN_SLOW);
6867 
6868     // non-NaN cases, just use built-in instructions.
6869     __ fcvt_h_s(ftmp, src);
6870     __ fmv_x_h(dst, ftmp);
6871     __ ret();
6872 
6873     __ bind(NaN_SLOW);
6874 
6875     __ float_to_float16_NaN(dst, src, t0, t1);
6876 
6877     __ ret();
6878     return entry;
6879   }
6880 
6881 #ifdef COMPILER2
6882 
// Bit masks for the Poly1305 stubs below: right_2_bits is applied to U_2 in
// poly1305_reduce and to R_0 in generate_poly1305_processBlocks; right_3_bits
// is applied to U_2 when the accumulator is unpacked into limbs.
// NOTE(review): right_n_bits is defined outside this chunk; presumably it
// yields a mask of the n low-order bits (0x3 and 0x7 here) -- confirm.
static const int64_t right_2_bits = right_n_bits(2);
static const int64_t right_3_bits = right_n_bits(3);
6885 
6886   // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
6887   // are represented as long[5], with BITS_PER_LIMB = 26.
6888   // Pack five 26-bit limbs into three 64-bit registers.
  // Emits code that reads five jlong limbs at src[0..4] and packs them into
  // dest2:dest1:dest0 (limb i contributes at bit position 26 * i).
  //   dest0, dest1 - out: low and middle 64 bits
  //   dest2        - out: remaining high bits; may be noreg, in which case
  //                  debug builds emit a check that those bits are zero
  //   src          - in: base address of the limb array (not modified)
  //   tmp1, tmp2   - clobbered
  void poly1305_pack_26(Register dest0, Register dest1, Register dest2, Register src, Register tmp1, Register tmp2) {
    assert_different_registers(dest0, dest1, dest2, src, tmp1, tmp2);

    // The goal is to have 128-bit value in dest2:dest1:dest0
    __ ld(dest0, Address(src, 0));    // 26 bits in dest0

    __ ld(tmp1, Address(src, sizeof(jlong)));
    __ slli(tmp1, tmp1, 26);
    __ add(dest0, dest0, tmp1);       // 52 bits in dest0

    // Limb 2 straddles dest0 and dest1: its low 12 bits complete dest0 ...
    __ ld(tmp2, Address(src, 2 * sizeof(jlong)));
    __ slli(tmp1, tmp2, 52);
    __ add(dest0, dest0, tmp1);       // dest0 is full

    // ... and its upper bits start dest1 (tmp2 still holds the unshifted limb).
    __ srli(dest1, tmp2, 12);         // 14-bit in dest1

    __ ld(tmp1, Address(src, 3 * sizeof(jlong)));
    __ slli(tmp1, tmp1, 14);
    __ add(dest1, dest1, tmp1);       // 40-bit in dest1

    // Limb 4 straddles dest1 and dest2; tmp1 keeps the unshifted limb for below.
    __ ld(tmp1, Address(src, 4 * sizeof(jlong)));
    __ slli(tmp2, tmp1, 40);
    __ add(dest1, dest1, tmp2);       // dest1 is full

    if (dest2->is_valid()) {
      __ srli(tmp1, tmp1, 24);
      __ mv(dest2, tmp1);               // 2 bits in dest2
    } else {
      // No third destination: only verify (debug builds) that nothing is lost.
#ifdef ASSERT
      Label OK;
      __ srli(tmp1, tmp1, 24);
      __ beq(zr, tmp1, OK);           // 2 bits
      __ stop("high bits of Poly1305 integer should be zero");
      __ should_not_reach_here();
      __ bind(OK);
#endif
    }
  }
6927 
6928   // As above, but return only a 128-bit integer, packed into two
6929   // 64-bit registers.
  void poly1305_pack_26(Register dest0, Register dest1, Register src, Register tmp1, Register tmp2) {
    // Passing noreg as dest2 selects the branch of the three-destination
    // overload that does not store the high bits.
    poly1305_pack_26(dest0, dest1, noreg, src, tmp1, tmp2);
  }
6933 
6934   // U_2:U_1:U_0: += (U_2 >> 2) * 5
  // Emits the update U_2:U_1:U_0 += (U_2 >> 2) * 5, while clearing all but the
  // lowest two bits of U_2 before the carries are added back in. The
  // multiplication by 5 is done as two additions: x and then x << 2.
  //   U_2, U_1, U_0 - in/out: high, middle and low word of the value
  //   tmp1, tmp2    - clobbered (tmp1 holds the addend, tmp2 the carry)
  void poly1305_reduce(Register U_2, Register U_1, Register U_0, Register tmp1, Register tmp2) {
    assert_different_registers(U_2, U_1, U_0, tmp1, tmp2);

    // First, U_2:U_1:U_0 += (U_2 >> 2)
    __ srli(tmp1, U_2, 2);
    __ cad(U_0, U_0, tmp1, tmp2); // Add tmp1 to U_0 with carry output to tmp2
    __ andi(U_2, U_2, right_2_bits); // Clear U_2 except for the lowest two bits
    __ cad(U_1, U_1, tmp2, tmp2); // Add carry to U_1 with carry output to tmp2
    __ add(U_2, U_2, tmp2);

    // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2
    // tmp1 still holds the original (U_2 >> 2) computed above.
    __ slli(tmp1, tmp1, 2);
    __ cad(U_0, U_0, tmp1, tmp2); // Add tmp1 to U_0 with carry output to tmp2
    __ cad(U_1, U_1, tmp2, tmp2); // Add carry to U_1 with carry output to tmp2
    __ add(U_2, U_2, tmp2);
  }
6951 
6952   // Poly1305, RFC 7539
6953   // void com.sun.crypto.provider.Poly1305.processMultipleBlocks(byte[] input, int offset, int length, long[] aLimbs, long[] rLimbs)
6954 
6955   // Arguments:
6956   //    c_rarg0:   input_start -- where the input is stored
6957   //    c_rarg1:   length
6958   //    c_rarg2:   acc_start -- where the output will be stored
6959   //    c_rarg3:   r_start -- where the randomly generated 128-bit key is stored
6960 
6961   // See https://loup-vaillant.fr/tutorials/poly1305-design for a
6962   // description of the tricks used to simplify and accelerate this
6963   // computation.
6964 
6965   address generate_poly1305_processBlocks() {
6966     __ align(CodeEntryAlignment);
6967     StubId stub_id = StubId::stubgen_poly1305_processBlocks_id;
6968     StubCodeMark mark(this, stub_id);
6969     address start = __ pc();
6970     __ enter();
6971     Label here;
6972 
6973     RegSet saved_regs = RegSet::range(x18, x21);
6974     RegSetIterator<Register> regs = (RegSet::range(x14, x31) - RegSet::range(x22, x27)).begin();
6975     __ push_reg(saved_regs, sp);
6976 
6977     // Arguments
6978     const Register input_start = c_rarg0, length = c_rarg1, acc_start = c_rarg2, r_start = c_rarg3;
6979 
6980     // R_n is the 128-bit randomly-generated key, packed into two
6981     // registers. The caller passes this key to us as long[5], with
6982     // BITS_PER_LIMB = 26.
6983     const Register R_0 = *regs, R_1 = *++regs;
6984     poly1305_pack_26(R_0, R_1, r_start, t1, t2);
6985 
6986     // RR_n is (R_n >> 2) * 5
6987     const Register RR_0 = *++regs, RR_1 = *++regs;
6988     __ srli(t1, R_0, 2);
6989     __ shadd(RR_0, t1, t1, t2, 2);
6990     __ srli(t1, R_1, 2);
6991     __ shadd(RR_1, t1, t1, t2, 2);
6992 
6993     // U_n is the current checksum
6994     const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
6995     poly1305_pack_26(U_0, U_1, U_2, acc_start, t1, t2);
6996 
6997     static constexpr int BLOCK_LENGTH = 16;
6998     Label DONE, LOOP;
6999 
7000     __ mv(t1, BLOCK_LENGTH);
7001     __ blt(length, t1, DONE); {
7002       __ bind(LOOP);
7003 
7004       // S_n is to be the sum of U_n and the next block of data
7005       const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
7006       __ ld(S_0, Address(input_start, 0));
7007       __ ld(S_1, Address(input_start, wordSize));
7008 
7009       __ cad(S_0, S_0, U_0, t1); // Add U_0 to S_0 with carry output to t1
7010       __ cadc(S_1, S_1, U_1, t1); // Add U_1 with carry to S_1 with carry output to t1
7011       __ add(S_2, U_2, t1);
7012 
7013       __ addi(S_2, S_2, 1);
7014 
7015       const Register U_0HI = *++regs, U_1HI = *++regs;
7016 
7017       // NB: this logic depends on some of the special properties of
7018       // Poly1305 keys. In particular, because we know that the top
7019       // four bits of R_0 and R_1 are zero, we can add together
7020       // partial products without any risk of needing to propagate a
7021       // carry out.
7022       __ wide_mul(U_0, U_0HI, S_0, R_0);
7023       __ wide_madd(U_0, U_0HI, S_1, RR_1, t1, t2);
7024       __ wide_madd(U_0, U_0HI, S_2, RR_0, t1, t2);
7025 
7026       __ wide_mul(U_1, U_1HI, S_0, R_1);
7027       __ wide_madd(U_1, U_1HI, S_1, R_0, t1, t2);
7028       __ wide_madd(U_1, U_1HI, S_2, RR_1, t1, t2);
7029 
7030       __ andi(U_2, R_0, right_2_bits);
7031       __ mul(U_2, S_2, U_2);
7032 
7033       // Partial reduction mod 2**130 - 5
7034       __ cad(U_1, U_1, U_0HI, t1); // Add U_0HI to U_1 with carry output to t1
7035       __ adc(U_2, U_2, U_1HI, t1);
7036       // Sum is now in U_2:U_1:U_0.
7037 
7038       // U_2:U_1:U_0: += (U_2 >> 2) * 5
7039       poly1305_reduce(U_2, U_1, U_0, t1, t2);
7040 
7041       __ subi(length, length, BLOCK_LENGTH);
7042       __ addi(input_start, input_start, BLOCK_LENGTH);
7043       __ mv(t1, BLOCK_LENGTH);
7044       __ bge(length, t1, LOOP);
7045     }
7046 
7047     // Further reduce modulo 2^130 - 5
7048     poly1305_reduce(U_2, U_1, U_0, t1, t2);
7049 
7050     // Unpack the sum into five 26-bit limbs and write to memory.
7051     // First 26 bits is the first limb
7052     __ slli(t1, U_0, 38); // Take lowest 26 bits
7053     __ srli(t1, t1, 38);
7054     __ sd(t1, Address(acc_start)); // First 26-bit limb
7055 
7056     // 27-52 bits of U_0 is the second limb
7057     __ slli(t1, U_0, 12); // Take next 27-52 bits
7058     __ srli(t1, t1, 38);
7059     __ sd(t1, Address(acc_start, sizeof (jlong))); // Second 26-bit limb
7060 
7061     // Getting 53-64 bits of U_0 and 1-14 bits of U_1 in one register
7062     __ srli(t1, U_0, 52);
7063     __ slli(t2, U_1, 50);
7064     __ srli(t2, t2, 38);
7065     __ add(t1, t1, t2);
7066     __ sd(t1, Address(acc_start, 2 * sizeof (jlong))); // Third 26-bit limb
7067 
7068     // Storing 15-40 bits of U_1
7069     __ slli(t1, U_1, 24); // Already used up 14 bits
7070     __ srli(t1, t1, 38); // Clear all other bits from t1
7071     __ sd(t1, Address(acc_start, 3 * sizeof (jlong))); // Fourth 26-bit limb
7072 
7073     // Storing 41-64 bits of U_1 and first three bits from U_2 in one register
7074     __ srli(t1, U_1, 40);
7075     __ andi(t2, U_2, right_3_bits);
7076     __ slli(t2, t2, 24);
7077     __ add(t1, t1, t2);
7078     __ sd(t1, Address(acc_start, 4 * sizeof (jlong))); // Fifth 26-bit limb
7079 
7080     __ bind(DONE);
7081     __ pop_reg(saved_regs, sp);
7082     __ leave(); // Required for proper stackwalking
7083     __ ret();
7084 
7085     return start;
7086   }
7087 
7088   address generate_arrays_hashcode_powers_of_31() {
7089     assert(UseRVV, "sanity");
7090     const int lmul = 2;
7091     const int stride = MaxVectorSize / sizeof(jint) * lmul;
7092     __ align(CodeEntryAlignment);
7093     StubCodeMark mark(this, "StubRoutines", "arrays_hashcode_powers_of_31");
7094     address start = __ pc();
7095     for (int i = stride; i >= 0; i--) {
7096         jint power_of_31 = 1;
7097         for (int j = i; j > 0; j--) {
7098           power_of_31 = java_multiply(power_of_31, 31);
7099         }
7100         __ emit_int32(power_of_31);
7101     }
7102 
7103     return start;
7104   }
7105 
7106 #endif // COMPILER2
7107 
7108   /**
7109    *  Arguments:
7110    *
7111    * Inputs:
7112    *   c_rarg0   - int crc
7113    *   c_rarg1   - byte* buf
7114    *   c_rarg2   - int length
7115    *
7116    * Output:
7117    *   c_rarg0   - int crc result
7118    */
7119   address generate_updateBytesCRC32() {
7120     assert(UseCRC32Intrinsics, "what are we doing here?");
7121 
7122     __ align(CodeEntryAlignment);
7123     StubId stub_id = StubId::stubgen_updateBytesCRC32_id;
7124     StubCodeMark mark(this, stub_id);
7125 
7126     address start = __ pc();
7127 
7128     // input parameters
7129     const Register crc    = c_rarg0;  // crc
7130     const Register buf    = c_rarg1;  // source java byte array address
7131     const Register len    = c_rarg2;  // length
7132 
7133     BLOCK_COMMENT("Entry:");
7134     __ enter(); // required for proper stackwalking of RuntimeStub frame
7135 
7136     __ kernel_crc32(crc, buf, len,
7137                     c_rarg3, c_rarg4, c_rarg5, c_rarg6, // tmp's for tables
7138                     c_rarg7, t2, t3, t4, t5, t6);       // misc tmps
7139 
7140     __ leave(); // required for proper stackwalking of RuntimeStub frame
7141     __ ret();
7142 
7143     return start;
7144   }
7145 
7146   // exception handler for upcall stubs
  // Generates a stub that verifies the oop in x10 and calls
  // UpcallLinker::handle_uncaught_exception; the call is not expected to
  // return. Returns the stub's entry address.
  address generate_upcall_stub_exception_handler() {
    StubId stub_id = StubId::stubgen_upcall_stub_exception_handler_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    // Native caller has no idea how to handle exceptions,
    // so we just crash here. Up to callee to catch exceptions.
    __ verify_oop(x10); // the exception oop is passed in a0 (x10)
    __ rt_call(CAST_FROM_FN_PTR(address, UpcallLinker::handle_uncaught_exception));
    // Trap if the runtime call ever returns.
    __ should_not_reach_here();

    return start;
  }
7160 
7161   // load Method* target of MethodHandle
7162   // j_rarg0 = jobject receiver
7163   // xmethod = Method* result
  // Generates a stub that resolves the global jobject in j_rarg0, follows
  // MethodHandle.form -> LambdaForm.vmentry -> MemberName.method ->
  // ResolvedMethodName.vmtarget and leaves the resulting Method* in xmethod.
  // t0 and t1 are used as temporaries. Returns the stub's entry address.
  address generate_upcall_stub_load_target() {

    StubId stub_id = StubId::stubgen_upcall_stub_load_target_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    __ resolve_global_jobject(j_rarg0, t0, t1);
    // Load target method from receiver
    // xmethod is reused to hold each intermediate oop along the chain.
    __ load_heap_oop(xmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), t0, t1);
    __ load_heap_oop(xmethod, Address(xmethod, java_lang_invoke_LambdaForm::vmentry_offset()), t0, t1);
    __ load_heap_oop(xmethod, Address(xmethod, java_lang_invoke_MemberName::method_offset()), t0, t1);
    // The final field is loaded as T_ADDRESS rather than as a heap oop.
    __ access_load_at(T_ADDRESS, IN_HEAP, xmethod,
                      Address(xmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()),
                      noreg, noreg);
    __ sd(xmethod, Address(xthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized

    __ ret();

    return start;
  }
7184 
7185 #undef __
7186 
7187   // Initialization
  // Intentionally empty: this generator emits nothing for the preuniverse
  // stub group.
  void generate_preuniverse_stubs() {
    // preuniverse stubs are not needed for riscv
  }
7191 
7192   void generate_initial_stubs() {
7193     // Generate initial stubs and initializes the entry points
7194 
7195     // entry points that exist in all platforms Note: This is code
7196     // that could be shared among different platforms - however the
7197     // benefit seems to be smaller than the disadvantage of having a
7198     // much more complicated generator structure. See also comment in
7199     // stubRoutines.hpp.
7200 
7201     StubRoutines::_forward_exception_entry = generate_forward_exception();
7202 
7203     if (UnsafeMemoryAccess::_table == nullptr) {
7204       UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
7205     }
7206 
7207     StubRoutines::_call_stub_entry =
7208       generate_call_stub(StubRoutines::_call_stub_return_address);
7209 
7210     // is referenced by megamorphic call
7211     StubRoutines::_catch_exception_entry = generate_catch_exception();
7212 
7213     if (UseCRC32Intrinsics) {
7214       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
7215     }
7216 
7217     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
7218         vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
7219       StubRoutines::_hf2f = generate_float16ToFloat();
7220       StubRoutines::_f2hf = generate_floatToFloat16();
7221     }
7222   }
7223 
  // Generates the four continuation stubs (thaw, return barrier, return
  // barrier for exceptions, preempt) unconditionally and records their
  // addresses in StubRoutines.
  void generate_continuation_stubs() {
    // Continuation stubs:
    StubRoutines::_cont_thaw             = generate_cont_thaw();
    StubRoutines::_cont_returnBarrier    = generate_cont_returnBarrier();
    StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
    StubRoutines::_cont_preempt_stub     = generate_cont_preempt_stub();
  }
7231 
  // Generates the final group of stubs: the optional verify_oop subroutine,
  // the arraycopy stubs, the method entry barrier, the secondary-supers
  // lookup stubs (COMPILER2 builds only) and the two upcall stubs. Marks the
  // riscv stub routines as completed at the end.
  void generate_final_stubs() {
    // support for verify_oop (must happen after universe_init)
    if (VerifyOops) {
      StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
    }

    // arraycopy stubs used by compilers
    generate_arraycopy_stubs();

    StubRoutines::_method_entry_barrier = generate_method_entry_barrier();

#ifdef COMPILER2
    if (UseSecondarySupersTable) {
      // The slow path stub is always generated with the table; the lookup
      // stub itself only when the test is not inlined.
      StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
      if (!InlineSecondarySupersTest) {
        generate_lookup_secondary_supers_table_stub();
      }
    }
#endif // COMPILER2

    StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
    StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();

    StubRoutines::riscv::set_completed();
  }
7257 
  // Generates the compiler intrinsic stubs. The whole body is compiled only
  // for COMPILER2 builds; each stub (or stub pair) is generated only when its
  // controlling flag is set, except for the long-string compare and string
  // indexOf stubs at the end, which are generated unconditionally.
  void generate_compiler_stubs() {
#ifdef COMPILER2
    if (UseMulAddIntrinsic) {
      StubRoutines::_mulAdd = generate_mulAdd();
    }

    if (UseMultiplyToLenIntrinsic) {
      StubRoutines::_multiplyToLen = generate_multiplyToLen();
    }

    if (UseSquareToLenIntrinsic) {
      StubRoutines::_squareToLen = generate_squareToLen();
    }

    // The Montgomery stubs come from a dedicated generator object; the
    // StubCodeMark is created here rather than inside that generator.
    if (UseMontgomeryMultiplyIntrinsic) {
      StubId stub_id = StubId::stubgen_montgomeryMultiply_id;
      StubCodeMark mark(this, stub_id);
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
      StubRoutines::_montgomeryMultiply = g.generate_multiply();
    }

    if (UseMontgomerySquareIntrinsic) {
      StubId stub_id = StubId::stubgen_montgomerySquare_id;
      StubCodeMark mark(this, stub_id);
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
      StubRoutines::_montgomerySquare = g.generate_square();
    }

    if (UseAESIntrinsics) {
      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
    }

    if (UseAESCTRIntrinsics) {
      StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
    }

    if (UseGHASHIntrinsics) {
      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
    }

    if (UsePoly1305Intrinsics) {
      StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
    }

    // The BigInteger shift workers are generated whenever RVV is enabled.
    if (UseRVV) {
      StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift();
      StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
    }

    // The powers-of-31 table generator asserts UseRVV, hence the extra check.
    if (UseVectorizedHashCodeIntrinsic && UseRVV) {
      StubRoutines::riscv::_arrays_hashcode_powers_of_31 = generate_arrays_hashcode_powers_of_31();
    }

    // For each hash below, the single-block and multi-block (MB) variants
    // come from the same generator function, selected by stub id.
    if (UseSHA256Intrinsics) {
      Sha2Generator sha2(_masm, this);
      StubRoutines::_sha256_implCompress   = sha2.generate_sha256_implCompress(StubId::stubgen_sha256_implCompress_id);
      StubRoutines::_sha256_implCompressMB = sha2.generate_sha256_implCompress(StubId::stubgen_sha256_implCompressMB_id);
    }

    if (UseSHA512Intrinsics) {
      Sha2Generator sha2(_masm, this);
      StubRoutines::_sha512_implCompress   = sha2.generate_sha512_implCompress(StubId::stubgen_sha512_implCompress_id);
      StubRoutines::_sha512_implCompressMB = sha2.generate_sha512_implCompress(StubId::stubgen_sha512_implCompressMB_id);
    }

    if (UseMD5Intrinsics) {
      StubRoutines::_md5_implCompress   = generate_md5_implCompress(StubId::stubgen_md5_implCompress_id);
      StubRoutines::_md5_implCompressMB = generate_md5_implCompress(StubId::stubgen_md5_implCompressMB_id);
    }

    if (UseChaCha20Intrinsics) {
      StubRoutines::_chacha20Block = generate_chacha20Block();
    }

    if (UseSHA1Intrinsics) {
      StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(StubId::stubgen_sha1_implCompress_id);
      StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(StubId::stubgen_sha1_implCompressMB_id);
    }

    if (UseBASE64Intrinsics) {
      StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
      StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
    }

    if (UseAdler32Intrinsics) {
      StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
    }

    // Not guarded by a flag here; the results are not assigned in this function.
    generate_compare_long_strings();

    generate_string_indexof_stubs();

#endif // COMPILER2
  }
7355 
7356  public:
7357   StubGenerator(CodeBuffer* code, BlobId blob_id, AOTStubData* stub_data) : StubCodeGenerator(code, blob_id, stub_data) {
7358     switch(blob_id) {
7359     case BlobId::stubgen_preuniverse_id:
7360       generate_preuniverse_stubs();
7361       break;
7362     case BlobId::stubgen_initial_id:
7363       generate_initial_stubs();
7364       break;
7365     case BlobId::stubgen_continuation_id:
7366       generate_continuation_stubs();
7367       break;
7368     case BlobId::stubgen_compiler_id:
7369       generate_compiler_stubs();
7370       break;
7371     case BlobId::stubgen_final_id:
7372       generate_final_stubs();
7373       break;
7374     default:
7375       fatal("unexpected blob id: %s", StubInfo::name(blob_id));
7376       break;
7377     };
7378   }
7379 }; // end class declaration
7380 
7381 void StubGenerator_generate(CodeBuffer* code, BlobId blob_id, AOTStubData* stub_data) {
7382   StubGenerator g(code, blob_id, stub_data);
7383 }