1 /*
   2  * Copyright (c) 2003, 2024, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2020, Red Hat Inc. All rights reserved.
   4  * Copyright (c) 2020, 2023, Huawei Technologies Co., Ltd. All rights reserved.
   5  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   6  *
   7  * This code is free software; you can redistribute it and/or modify it
   8  * under the terms of the GNU General Public License version 2 only, as
   9  * published by the Free Software Foundation.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  *
  25  */
  26 
  27 #include "precompiled.hpp"
  28 #include "asm/macroAssembler.hpp"
  29 #include "asm/macroAssembler.inline.hpp"
  30 #include "compiler/oopMap.hpp"
  31 #include "gc/shared/barrierSet.hpp"
  32 #include "gc/shared/barrierSetAssembler.hpp"
  33 #include "interpreter/interpreter.hpp"
  34 #include "memory/universe.hpp"
  35 #include "nativeInst_riscv.hpp"
  36 #include "oops/instanceOop.hpp"
  37 #include "oops/method.hpp"
  38 #include "oops/objArrayKlass.hpp"
  39 #include "oops/oop.inline.hpp"
  40 #include "prims/methodHandles.hpp"
  41 #include "prims/upcallLinker.hpp"
  42 #include "runtime/continuation.hpp"
  43 #include "runtime/continuationEntry.inline.hpp"
  44 #include "runtime/frame.inline.hpp"
  45 #include "runtime/handles.inline.hpp"
  46 #include "runtime/javaThread.hpp"
  47 #include "runtime/sharedRuntime.hpp"
  48 #include "runtime/stubCodeGenerator.hpp"
  49 #include "runtime/stubRoutines.hpp"
  50 #include "utilities/align.hpp"
  51 #include "utilities/powerOfTwo.hpp"
  52 #ifdef COMPILER2
  53 #include "opto/runtime.hpp"
  54 #endif
  55 
  56 // Declaration and definition of StubGenerator (no .hpp file).
  57 // For a more detailed description of the stub routine structure
  58 // see the comment in stubRoutines.hpp
  59 
  60 #undef __
  61 #define __ _masm->
  62 
  63 #ifdef PRODUCT
  64 #define BLOCK_COMMENT(str) /* nothing */
  65 #else
  66 #define BLOCK_COMMENT(str) __ block_comment(str)
  67 #endif
  68 
  69 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  70 
  71 // Stub Code definitions
  72 
  73 class StubGenerator: public StubCodeGenerator {
  74  private:
  75 
  76 #ifdef PRODUCT
  77 #define inc_counter_np(counter) ((void)0)
  78 #else
  79   void inc_counter_np_(uint& counter) {
  80     __ incrementw(ExternalAddress((address)&counter));
  81   }
  82 #define inc_counter_np(counter) \
  83   BLOCK_COMMENT("inc_counter " #counter); \
  84   inc_counter_np_(counter);
  85 #endif
  86 
  87   // Call stubs are used to call Java from C
  88   //
  89   // Arguments:
  90   //    c_rarg0:   call wrapper address                   address
  91   //    c_rarg1:   result                                 address
  92   //    c_rarg2:   result type                            BasicType
  93   //    c_rarg3:   method                                 Method*
  94   //    c_rarg4:   (interpreter) entry point              address
  95   //    c_rarg5:   parameters                             intptr_t*
  96   //    c_rarg6:   parameter size (in words)              int
  97   //    c_rarg7:   thread                                 Thread*
  98   //
  99   // There is no return from the stub itself as any Java result
 100   // is written to result
 101   //
 102   // we save x1 (ra) as the return PC at the base of the frame and
 103   // link x8 (fp) below it as the frame pointer, then install sp (x2)
 104   // into fp.
 105   //
 106   // we save x10-x17, which accounts for all the c arguments.
 107   //
 108   // TODO: strictly do we need to save them all? they are treated as
 109   // volatile by C so could we omit saving the ones we are going to
 110   // place in global registers (thread? method?) or those we only use
 111   // during setup of the Java call?
 112   //
 113   // we don't need to save x5 which C uses as an indirect result location
 114   // return register.
 115   //
 116   // we don't need to save x6-x7 and x28-x31 which both C and Java treat as
 117   // volatile
 118   //
 119   // we save x9, x18-x27, f8-f9, and f18-f27 which Java uses as temporary
 120   // registers and C expects to be callee-save
 121   //
 122   // so the stub frame looks like this when we enter Java code
 123   //
 124   //     [ return_from_Java     ] <--- sp
 125   //     [ argument word n      ]
 126   //      ...
 127   // -35 [ argument word 1      ]
 128   // -34 [ saved FRM in Floating-point Control and Status Register ] <--- sp_after_call
 129   // -33 [ saved f27            ]
 130   // -32 [ saved f26            ]
 131   // -31 [ saved f25            ]
 132   // -30 [ saved f24            ]
 133   // -29 [ saved f23            ]
 134   // -28 [ saved f22            ]
 135   // -27 [ saved f21            ]
 136   // -26 [ saved f20            ]
 137   // -25 [ saved f19            ]
 138   // -24 [ saved f18            ]
 139   // -23 [ saved f9             ]
 140   // -22 [ saved f8             ]
 141   // -21 [ saved x27            ]
 142   // -20 [ saved x26            ]
 143   // -19 [ saved x25            ]
 144   // -18 [ saved x24            ]
 145   // -17 [ saved x23            ]
 146   // -16 [ saved x22            ]
 147   // -15 [ saved x21            ]
 148   // -14 [ saved x20            ]
 149   // -13 [ saved x19            ]
 150   // -12 [ saved x18            ]
 151   // -11 [ saved x9             ]
 152   // -10 [ call wrapper   (x10) ]
 153   //  -9 [ result         (x11) ]
 154   //  -8 [ result type    (x12) ]
 155   //  -7 [ method         (x13) ]
 156   //  -6 [ entry point    (x14) ]
 157   //  -5 [ parameters     (x15) ]
 158   //  -4 [ parameter size (x16) ]
 159   //  -3 [ thread         (x17) ]
 160   //  -2 [ saved fp       (x8)  ]
 161   //  -1 [ saved ra       (x1)  ]
 162   //   0 [                      ] <--- fp == saved sp (x2)
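       //
       // For orientation (not part of the generated stub): the VM enters this
       // stub through the CallStub function pointer declared in stubRoutines.hpp.
       // A rough, illustrative sketch of the call site in javaCalls.cpp -- the
       // parameter names here are assumptions, not the authoritative signature:
       //
       //   StubRoutines::call_stub()(
       //       (address)&link,             // c_rarg0: call wrapper
       //       result_val_address,         // c_rarg1: where to store the result
       //       result_type,                // c_rarg2: BasicType of the result
       //       method(),                   // c_rarg3: Method*
       //       entry_point,                // c_rarg4: interpreter entry point
       //       args->parameters(),         // c_rarg5: outgoing Java arguments
       //       args->size_of_parameters(), // c_rarg6: argument count in words
       //       thread);                    // c_rarg7: current JavaThread
       //
       // i.e. the eight C argument registers saved below carry exactly these values.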
 163 
 164   // Call stub stack layout word offsets from fp
 165   enum call_stub_layout {
 166     sp_after_call_off  = -34,
 167 
 168     frm_off            = sp_after_call_off,
 169     f27_off            = -33,
 170     f26_off            = -32,
 171     f25_off            = -31,
 172     f24_off            = -30,
 173     f23_off            = -29,
 174     f22_off            = -28,
 175     f21_off            = -27,
 176     f20_off            = -26,
 177     f19_off            = -25,
 178     f18_off            = -24,
 179     f9_off             = -23,
 180     f8_off             = -22,
 181 
 182     x27_off            = -21,
 183     x26_off            = -20,
 184     x25_off            = -19,
 185     x24_off            = -18,
 186     x23_off            = -17,
 187     x22_off            = -16,
 188     x21_off            = -15,
 189     x20_off            = -14,
 190     x19_off            = -13,
 191     x18_off            = -12,
 192     x9_off             = -11,
 193 
 194     call_wrapper_off   = -10,
 195     result_off         = -9,
 196     result_type_off    = -8,
 197     method_off         = -7,
 198     entry_point_off    = -6,
 199     parameters_off     = -5,
 200     parameter_size_off = -4,
 201     thread_off         = -3,
 202     fp_f               = -2,
 203     retaddr_off        = -1,
 204   };
 205 
 206   address generate_call_stub(address& return_address) {
 207     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 208            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 209            "adjust this code");
 210 
 211     StubCodeMark mark(this, "StubRoutines", "call_stub");
 212     address start = __ pc();
 213 
 214     const Address sp_after_call (fp, sp_after_call_off  * wordSize);
 215 
 216     const Address frm_save      (fp, frm_off           * wordSize);
 217     const Address call_wrapper  (fp, call_wrapper_off   * wordSize);
 218     const Address result        (fp, result_off         * wordSize);
 219     const Address result_type   (fp, result_type_off    * wordSize);
 220     const Address method        (fp, method_off         * wordSize);
 221     const Address entry_point   (fp, entry_point_off    * wordSize);
 222     const Address parameters    (fp, parameters_off     * wordSize);
 223     const Address parameter_size(fp, parameter_size_off * wordSize);
 224 
 225     const Address thread        (fp, thread_off         * wordSize);
 226 
 227     const Address f27_save      (fp, f27_off            * wordSize);
 228     const Address f26_save      (fp, f26_off            * wordSize);
 229     const Address f25_save      (fp, f25_off            * wordSize);
 230     const Address f24_save      (fp, f24_off            * wordSize);
 231     const Address f23_save      (fp, f23_off            * wordSize);
 232     const Address f22_save      (fp, f22_off            * wordSize);
 233     const Address f21_save      (fp, f21_off            * wordSize);
 234     const Address f20_save      (fp, f20_off            * wordSize);
 235     const Address f19_save      (fp, f19_off            * wordSize);
 236     const Address f18_save      (fp, f18_off            * wordSize);
 237     const Address f9_save       (fp, f9_off             * wordSize);
 238     const Address f8_save       (fp, f8_off             * wordSize);
 239 
 240     const Address x27_save      (fp, x27_off            * wordSize);
 241     const Address x26_save      (fp, x26_off            * wordSize);
 242     const Address x25_save      (fp, x25_off            * wordSize);
 243     const Address x24_save      (fp, x24_off            * wordSize);
 244     const Address x23_save      (fp, x23_off            * wordSize);
 245     const Address x22_save      (fp, x22_off            * wordSize);
 246     const Address x21_save      (fp, x21_off            * wordSize);
 247     const Address x20_save      (fp, x20_off            * wordSize);
 248     const Address x19_save      (fp, x19_off            * wordSize);
 249     const Address x18_save      (fp, x18_off            * wordSize);
 250 
 251     const Address x9_save       (fp, x9_off             * wordSize);
 252 
 253     // stub code
 254 
 255     address riscv_entry = __ pc();
 256 
 257     // set up frame and move sp to end of save area
 258     __ enter();
 259     __ addi(sp, fp, sp_after_call_off * wordSize);
 260 
 261     // save register parameters and Java temporary/global registers
 262     // n.b. we save thread even though it gets installed in
 263     // xthread because we want to sanity check it later
 264     __ sd(c_rarg7, thread);
 265     __ sw(c_rarg6, parameter_size);
 266     __ sd(c_rarg5, parameters);
 267     __ sd(c_rarg4, entry_point);
 268     __ sd(c_rarg3, method);
 269     __ sd(c_rarg2, result_type);
 270     __ sd(c_rarg1, result);
 271     __ sd(c_rarg0, call_wrapper);
 272 
 273     __ sd(x9, x9_save);
 274 
 275     __ sd(x18, x18_save);
 276     __ sd(x19, x19_save);
 277     __ sd(x20, x20_save);
 278     __ sd(x21, x21_save);
 279     __ sd(x22, x22_save);
 280     __ sd(x23, x23_save);
 281     __ sd(x24, x24_save);
 282     __ sd(x25, x25_save);
 283     __ sd(x26, x26_save);
 284     __ sd(x27, x27_save);
 285 
 286     __ fsd(f8,  f8_save);
 287     __ fsd(f9,  f9_save);
 288     __ fsd(f18, f18_save);
 289     __ fsd(f19, f19_save);
 290     __ fsd(f20, f20_save);
 291     __ fsd(f21, f21_save);
 292     __ fsd(f22, f22_save);
 293     __ fsd(f23, f23_save);
 294     __ fsd(f24, f24_save);
 295     __ fsd(f25, f25_save);
 296     __ fsd(f26, f26_save);
 297     __ fsd(f27, f27_save);
 298 
 299     __ frrm(t0);
 300     __ sd(t0, frm_save);
 301     // Set frm to the state we need. We do want Round to Nearest. We
 302     // don't want non-IEEE rounding modes.
 303     Label skip_fsrmi;
 304     guarantee(__ RoundingMode::rne == 0, "must be");
 305     __ beqz(t0, skip_fsrmi);
 306     __ fsrmi(__ RoundingMode::rne);
 307     __ bind(skip_fsrmi);
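         // e.g. (illustrative) if the C caller left frm == 0b001 (round towards
         // zero), t0 above is non-zero and the fsrmi resets it to 0b000 (round to
         // nearest, ties to even), as Java floating-point semantics require; the
         // saved value is written back before returning (see "restore frm" below).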
 308 
 309     // install Java thread in global register now we have saved
 310     // whatever value it held
 311     __ mv(xthread, c_rarg7);
 312 
 313     // And method
 314     __ mv(xmethod, c_rarg3);
 315 
 316     // set up the heapbase register
 317     __ reinit_heapbase();
 318 
 319 #ifdef ASSERT
 320     // make sure we have no pending exceptions
 321     {
 322       Label L;
 323       __ ld(t0, Address(xthread, in_bytes(Thread::pending_exception_offset())));
 324       __ beqz(t0, L);
 325       __ stop("StubRoutines::call_stub: entered with pending exception");
 326       __ BIND(L);
 327     }
 328 #endif
 329     // pass parameters if any
 330     __ mv(esp, sp);
 331     __ slli(t0, c_rarg6, LogBytesPerWord);
 332     __ sub(t0, sp, t0); // Move SP out of the way
 333     __ andi(sp, t0, -2 * wordSize);
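         // Illustrative arithmetic: with 3 parameters and wordSize == 8, t0 above
         // is sp - 24, and the andi then rounds that down to a 16-byte boundary,
         // so sp keeps the RISC-V ABI stack alignment before the parameter words
         // are pushed in the loop below.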
 334 
 335     BLOCK_COMMENT("pass parameters if any");
 336     Label parameters_done;
 337     // parameter count is still in c_rarg6
 338     // and parameter pointer identifying param 1 is in c_rarg5
 339     __ beqz(c_rarg6, parameters_done);
 340 
 341     address loop = __ pc();
 342     __ ld(t0, Address(c_rarg5, 0));
 343     __ addi(c_rarg5, c_rarg5, wordSize);
 344     __ addi(c_rarg6, c_rarg6, -1);
 345     __ push_reg(t0);
 346     __ bgtz(c_rarg6, loop);
 347 
 348     __ BIND(parameters_done);
 349 
 350     // call Java entry -- passing Method* and current sp
 351     //      xmethod: Method*
 352     //      x19_sender_sp: sender sp
 353     BLOCK_COMMENT("call Java function");
 354     __ mv(x19_sender_sp, sp);
 355     __ jalr(c_rarg4);
 356 
 357     // save current address for use by exception handling code
 358 
 359     return_address = __ pc();
 360 
 361     // store result depending on type (everything that is not
 362     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 363     // n.b. this assumes Java returns an integral result in x10
 364     // and a floating result in j_farg0
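         // n.b. T_OBJECT deliberately shares the is_long path below: an oop result
         // is a full 64-bit value in x10 and is stored with sd, just like T_LONG.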
 365     __ ld(j_rarg2, result);
 366     Label is_long, is_float, is_double, exit;
 367     __ ld(j_rarg1, result_type);
 368     __ mv(t0, (u1)T_OBJECT);
 369     __ beq(j_rarg1, t0, is_long);
 370     __ mv(t0, (u1)T_LONG);
 371     __ beq(j_rarg1, t0, is_long);
 372     __ mv(t0, (u1)T_FLOAT);
 373     __ beq(j_rarg1, t0, is_float);
 374     __ mv(t0, (u1)T_DOUBLE);
 375     __ beq(j_rarg1, t0, is_double);
 376 
 377     // handle T_INT case
 378     __ sw(x10, Address(j_rarg2));
 379 
 380     __ BIND(exit);
 381 
 382     // pop parameters
 383     __ addi(esp, fp, sp_after_call_off * wordSize);
 384 
 385 #ifdef ASSERT
 386     // verify that threads correspond
 387     {
 388       Label L, S;
 389       __ ld(t0, thread);
 390       __ bne(xthread, t0, S);
 391       __ get_thread(t0);
 392       __ beq(xthread, t0, L);
 393       __ BIND(S);
 394       __ stop("StubRoutines::call_stub: threads must correspond");
 395       __ BIND(L);
 396     }
 397 #endif
 398 
 399     __ pop_cont_fastpath(xthread);
 400 
 401     // restore callee-save registers
 402     __ fld(f27, f27_save);
 403     __ fld(f26, f26_save);
 404     __ fld(f25, f25_save);
 405     __ fld(f24, f24_save);
 406     __ fld(f23, f23_save);
 407     __ fld(f22, f22_save);
 408     __ fld(f21, f21_save);
 409     __ fld(f20, f20_save);
 410     __ fld(f19, f19_save);
 411     __ fld(f18, f18_save);
 412     __ fld(f9,  f9_save);
 413     __ fld(f8,  f8_save);
 414 
 415     __ ld(x27, x27_save);
 416     __ ld(x26, x26_save);
 417     __ ld(x25, x25_save);
 418     __ ld(x24, x24_save);
 419     __ ld(x23, x23_save);
 420     __ ld(x22, x22_save);
 421     __ ld(x21, x21_save);
 422     __ ld(x20, x20_save);
 423     __ ld(x19, x19_save);
 424     __ ld(x18, x18_save);
 425 
 426     __ ld(x9, x9_save);
 427 
 428     // restore frm
 429     Label skip_fsrm;
 430     __ ld(t0, frm_save);
 431     __ frrm(t1);
 432     __ beq(t0, t1, skip_fsrm);
 433     __ fsrm(t0);
 434     __ bind(skip_fsrm);
 435 
 436     __ ld(c_rarg0, call_wrapper);
 437     __ ld(c_rarg1, result);
 438     __ ld(c_rarg2, result_type);
 439     __ ld(c_rarg3, method);
 440     __ ld(c_rarg4, entry_point);
 441     __ ld(c_rarg5, parameters);
 442     __ ld(c_rarg6, parameter_size);
 443     __ ld(c_rarg7, thread);
 444 
 445     // leave frame and return to caller
 446     __ leave();
 447     __ ret();
 448 
 449     // handle return types different from T_INT
 450 
 451     __ BIND(is_long);
 452     __ sd(x10, Address(j_rarg2, 0));
 453     __ j(exit);
 454 
 455     __ BIND(is_float);
 456     __ fsw(j_farg0, Address(j_rarg2, 0), t0);
 457     __ j(exit);
 458 
 459     __ BIND(is_double);
 460     __ fsd(j_farg0, Address(j_rarg2, 0), t0);
 461     __ j(exit);
 462 
 463     return start;
 464   }
 465 
 466   // Return point for a Java call if there's an exception thrown in
 467   // Java code.  The exception is caught and transformed into a
 468   // pending exception stored in JavaThread that can be tested from
 469   // within the VM.
 470   //
 471   // Note: Usually the parameters are removed by the callee. In case
 472   // of an exception crossing an activation frame boundary, that is
 473   // not the case if the callee is compiled code => need to setup the
 474   // sp.
 475   //
 476   // x10: exception oop
 477 
 478   address generate_catch_exception() {
 479     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 480     address start = __ pc();
 481 
 482     // same as in generate_call_stub():
 483     const Address thread(fp, thread_off * wordSize);
 484 
 485 #ifdef ASSERT
 486     // verify that threads correspond
 487     {
 488       Label L, S;
 489       __ ld(t0, thread);
 490       __ bne(xthread, t0, S);
 491       __ get_thread(t0);
 492       __ beq(xthread, t0, L);
 493       __ bind(S);
 494       __ stop("StubRoutines::catch_exception: threads must correspond");
 495       __ bind(L);
 496     }
 497 #endif
 498 
 499     // set pending exception
 500     __ verify_oop(x10);
 501 
 502     __ sd(x10, Address(xthread, Thread::pending_exception_offset()));
 503     __ mv(t0, (address)__FILE__);
 504     __ sd(t0, Address(xthread, Thread::exception_file_offset()));
 505     __ mv(t0, (int)__LINE__);
 506     __ sw(t0, Address(xthread, Thread::exception_line_offset()));
 507 
 508     // complete return to VM
 509     assert(StubRoutines::_call_stub_return_address != nullptr,
 510            "_call_stub_return_address must have been generated before");
 511     __ j(StubRoutines::_call_stub_return_address);
 512 
 513     return start;
 514   }
 515 
 516   // Continuation point for runtime calls returning with a pending
 517   // exception.  The pending exception check happened in the runtime
 518   // or native call stub.  The pending exception in Thread is
 519   // converted into a Java-level exception.
 520   //
 521   // Contract with Java-level exception handlers:
 522   // x10: exception
 523   // x13: throwing pc
 524   //
 525   // NOTE: At entry of this stub, exception-pc must be in RA !!
 526 
 527   // NOTE: this is always used as a jump target within generated code
 528   // so it just needs to be generated code with no prolog
 529 
 530   address generate_forward_exception() {
 531     StubCodeMark mark(this, "StubRoutines", "forward exception");
 532     address start = __ pc();
 533 
 534     // Upon entry, RA points to the return address returning into
 535     // Java (interpreted or compiled) code; i.e., the return address
 536     // becomes the throwing pc.
 537     //
 538     // Arguments pushed before the runtime call are still on the stack
 539     // but the exception handler will reset the stack pointer ->
 540     // ignore them.  A potential result in registers can be ignored as
 541     // well.
 542 
 543 #ifdef ASSERT
 544     // make sure this code is only executed if there is a pending exception
 545     {
 546       Label L;
 547       __ ld(t0, Address(xthread, Thread::pending_exception_offset()));
 548       __ bnez(t0, L);
 549       __ stop("StubRoutines::forward exception: no pending exception (1)");
 550       __ bind(L);
 551     }
 552 #endif
 553 
 554     // compute exception handler into x9
 555 
 556     // call the VM to find the handler address associated with the
 557     // caller address. pass thread in x10 and caller pc (ret address)
 558     // in x11. n.b. the caller pc is in ra, unlike x86 where it is on
 559     // the stack.
 560     __ mv(c_rarg1, ra);
 561     // ra will be trashed by the VM call so we move it to x9
 562     // (callee-saved) because we also need to pass it to the handler
 563     // returned by this call.
 564     __ mv(x9, ra);
 565     BLOCK_COMMENT("call exception_handler_for_return_address");
 566     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 567                          SharedRuntime::exception_handler_for_return_address),
 568                     xthread, c_rarg1);
 569     // we should not really care that ra is no longer the callee
 570     // address. we saved the value the handler needs in x9 so we can
 571     // just copy it to x13. however, the C2 handler will push its own
 572     // frame and then call into the VM, and the VM code asserts that
 573     // the PC for the frame above the handler belongs to a compiled
 574     // Java method. So, we restore ra here to satisfy that assert.
 575     __ mv(ra, x9);
 576     // setup x10 & x13 & clear pending exception
 577     __ mv(x13, x9);
 578     __ mv(x9, x10);
 579     __ ld(x10, Address(xthread, Thread::pending_exception_offset()));
 580     __ sd(zr, Address(xthread, Thread::pending_exception_offset()));
 581 
 582 #ifdef ASSERT
 583     // make sure exception is set
 584     {
 585       Label L;
 586       __ bnez(x10, L);
 587       __ stop("StubRoutines::forward exception: no pending exception (2)");
 588       __ bind(L);
 589     }
 590 #endif
 591 
 592     // continue at exception handler
 593     // x10: exception
 594     // x13: throwing pc
 595     // x9: exception handler
 596     __ verify_oop(x10);
 597     __ jr(x9);
 598 
 599     return start;
 600   }
 601 
 602   // Non-destructive plausibility checks for oops
 603   //
 604   // Arguments:
 605   //    x10: oop to verify
 606   //    t0: error message
 607   //
 608   // Stack after saving c_rarg3:
 609   //    [tos + 0]: saved c_rarg3
 610   //    [tos + 1]: saved c_rarg2
 611   //    [tos + 2]: saved ra
 612   //    [tos + 3]: saved t1
 613   //    [tos + 4]: saved x10
 614   //    [tos + 5]: saved t0
 615   address generate_verify_oop() {
 616 
 617     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 618     address start = __ pc();
 619 
 620     Label exit, error;
 621 
 622     __ push_reg(RegSet::of(c_rarg2, c_rarg3), sp); // save c_rarg2 and c_rarg3
 623 
 624     __ la(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 625     __ ld(c_rarg3, Address(c_rarg2));
 626     __ add(c_rarg3, c_rarg3, 1);
 627     __ sd(c_rarg3, Address(c_rarg2));
 628 
 629     // object is in x10
 630     // make sure object is 'reasonable'
 631     __ beqz(x10, exit); // if obj is null it is OK
 632 
 633     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
 634     bs_asm->check_oop(_masm, x10, c_rarg2, c_rarg3, error);
 635 
 636     // return if everything seems ok
 637     __ bind(exit);
 638 
 639     __ pop_reg(RegSet::of(c_rarg2, c_rarg3), sp);  // pop c_rarg2 and c_rarg3
 640     __ ret();
 641 
 642     // handle errors
 643     __ bind(error);
 644     __ pop_reg(RegSet::of(c_rarg2, c_rarg3), sp); // pop c_rarg2 and c_rarg3
 645 
 646     __ push_reg(RegSet::range(x0, x31), sp);
 647     // debug(char* msg, int64_t pc, int64_t regs[])
 648     __ mv(c_rarg0, t0);             // pass address of error message
 649     __ mv(c_rarg1, ra);             // pass return address
 650     __ mv(c_rarg2, sp);             // pass address of regs on stack
 651 #ifndef PRODUCT
 652     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 653 #endif
 654     BLOCK_COMMENT("call MacroAssembler::debug");
 655     __ rt_call(CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 656     __ ebreak();
 657 
 658     return start;
 659   }
 660 
 661   // The inner part of zero_words().
 662   //
 663   // Inputs:
 664   // x28: the HeapWord-aligned base address of an array to zero.
 665   // x29: the count in HeapWords, x29 > 0.
 666   //
 667   // Returns x28 and x29, adjusted for the caller to clear.
 668   // x28: the base address of the tail of words left to clear.
 669   // x29: the number of words in the tail.
 670   //      x29 < MacroAssembler::zero_words_block_size.
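       //
       // Worked example (illustrative, with UseBlockZeroing disabled and
       // zero_words_block_size == 8): entering with x29 == 70, the loop below
       // stores zeroes for 64 words and returns with x28 advanced by 64 words
       // and x29 == 6, leaving the tail for the caller to clear.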
 671 
 672   address generate_zero_blocks() {
 673     Label done;
 674 
 675     const Register base = x28, cnt = x29, tmp1 = x30, tmp2 = x31;
 676 
 677     __ align(CodeEntryAlignment);
 678     StubCodeMark mark(this, "StubRoutines", "zero_blocks");
 679     address start = __ pc();
 680 
 681     if (UseBlockZeroing) {
 682       // Ensure count >= 2*CacheLineSize so that it still deserves a cbo.zero
 683       // after alignment.
 684       Label small;
 685       int low_limit = MAX2(2 * CacheLineSize, BlockZeroingLowLimit) / wordSize;
 686       __ mv(tmp1, low_limit);
 687       __ blt(cnt, tmp1, small);
 688       __ zero_dcache_blocks(base, cnt, tmp1, tmp2);
 689       __ bind(small);
 690     }
 691 
 692     {
 693       // Clear the remaining blocks.
 694       Label loop;
 695       __ mv(tmp1, MacroAssembler::zero_words_block_size);
 696       __ blt(cnt, tmp1, done);
 697       __ bind(loop);
 698       for (int i = 0; i < MacroAssembler::zero_words_block_size; i++) {
 699         __ sd(zr, Address(base, i * wordSize));
 700       }
 701       __ add(base, base, MacroAssembler::zero_words_block_size * wordSize);
 702       __ sub(cnt, cnt, MacroAssembler::zero_words_block_size);
 703       __ bge(cnt, tmp1, loop);
 704       __ bind(done);
 705     }
 706 
 707     __ ret();
 708 
 709     return start;
 710   }
 711 
 712   typedef enum {
 713     copy_forwards = 1,
 714     copy_backwards = -1
 715   } copy_direction;
 716 
 717   // Bulk copy of blocks of 8 words.
 718   //
 719   // count is a count of words.
 720   //
 721   // Precondition: count >= 8
 722   //
 723   // Postconditions:
 724   //
 725   // The least significant bit of count contains the remaining count
 726   // of words to copy.  The rest of count is trash.
 727   //
 728   // s and d are adjusted to point to the remaining words to copy
 729   //
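       // Worked example (illustrative): for count == 30 the main loop plus drain
       // move 24 words, bit 2 of the now-negative count triggers the 4-word tail,
       // bit 1 the 2-word tail, and bit 0 is clear, so nothing is left over. For
       // count == 9 only the initial 8-word block is copied and bit 0 of count is
       // set, leaving one word for the caller to finish.
       //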
 730   void generate_copy_longs(Label &start, Register s, Register d, Register count,
 731                            copy_direction direction) {
 732     int unit = wordSize * direction;
 733     int bias = wordSize;
 734 
 735     const Register tmp_reg0 = x13, tmp_reg1 = x14, tmp_reg2 = x15, tmp_reg3 = x16,
 736       tmp_reg4 = x17, tmp_reg5 = x7, tmp_reg6 = x28, tmp_reg7 = x29;
 737 
 738     const Register stride = x30;
 739 
 740     assert_different_registers(t0, tmp_reg0, tmp_reg1, tmp_reg2, tmp_reg3,
 741       tmp_reg4, tmp_reg5, tmp_reg6, tmp_reg7);
 742     assert_different_registers(s, d, count, t0);
 743 
 744     Label again, drain;
 745     const char* stub_name = nullptr;
 746     if (direction == copy_forwards) {
 747       stub_name = "forward_copy_longs";
 748     } else {
 749       stub_name = "backward_copy_longs";
 750     }
 751     StubCodeMark mark(this, "StubRoutines", stub_name);
 752     __ align(CodeEntryAlignment);
 753     __ bind(start);
 754 
 755     if (direction == copy_forwards) {
 756       __ sub(s, s, bias);
 757       __ sub(d, d, bias);
 758     }
 759 
 760 #ifdef ASSERT
 761     // Make sure we are never given < 8 words
 762     {
 763       Label L;
 764 
 765       __ mv(t0, 8);
 766       __ bge(count, t0, L);
 767       __ stop("generate_copy_longs called with < 8 words");
 768       __ bind(L);
 769     }
 770 #endif
 771 
 772     __ ld(tmp_reg0, Address(s, 1 * unit));
 773     __ ld(tmp_reg1, Address(s, 2 * unit));
 774     __ ld(tmp_reg2, Address(s, 3 * unit));
 775     __ ld(tmp_reg3, Address(s, 4 * unit));
 776     __ ld(tmp_reg4, Address(s, 5 * unit));
 777     __ ld(tmp_reg5, Address(s, 6 * unit));
 778     __ ld(tmp_reg6, Address(s, 7 * unit));
 779     __ ld(tmp_reg7, Address(s, 8 * unit));
 780     __ addi(s, s, 8 * unit);
 781 
 782     __ sub(count, count, 16);
 783     __ bltz(count, drain);
 784 
 785     __ bind(again);
 786 
 787     __ sd(tmp_reg0, Address(d, 1 * unit));
 788     __ sd(tmp_reg1, Address(d, 2 * unit));
 789     __ sd(tmp_reg2, Address(d, 3 * unit));
 790     __ sd(tmp_reg3, Address(d, 4 * unit));
 791     __ sd(tmp_reg4, Address(d, 5 * unit));
 792     __ sd(tmp_reg5, Address(d, 6 * unit));
 793     __ sd(tmp_reg6, Address(d, 7 * unit));
 794     __ sd(tmp_reg7, Address(d, 8 * unit));
 795 
 796     __ ld(tmp_reg0, Address(s, 1 * unit));
 797     __ ld(tmp_reg1, Address(s, 2 * unit));
 798     __ ld(tmp_reg2, Address(s, 3 * unit));
 799     __ ld(tmp_reg3, Address(s, 4 * unit));
 800     __ ld(tmp_reg4, Address(s, 5 * unit));
 801     __ ld(tmp_reg5, Address(s, 6 * unit));
 802     __ ld(tmp_reg6, Address(s, 7 * unit));
 803     __ ld(tmp_reg7, Address(s, 8 * unit));
 804 
 805     __ addi(s, s, 8 * unit);
 806     __ addi(d, d, 8 * unit);
 807 
 808     __ sub(count, count, 8);
 809     __ bgez(count, again);
 810 
 811     // Drain
 812     __ bind(drain);
 813 
 814     __ sd(tmp_reg0, Address(d, 1 * unit));
 815     __ sd(tmp_reg1, Address(d, 2 * unit));
 816     __ sd(tmp_reg2, Address(d, 3 * unit));
 817     __ sd(tmp_reg3, Address(d, 4 * unit));
 818     __ sd(tmp_reg4, Address(d, 5 * unit));
 819     __ sd(tmp_reg5, Address(d, 6 * unit));
 820     __ sd(tmp_reg6, Address(d, 7 * unit));
 821     __ sd(tmp_reg7, Address(d, 8 * unit));
 822     __ addi(d, d, 8 * unit);
 823 
 824     {
 825       Label L1, L2;
 826       __ test_bit(t0, count, 2);
 827       __ beqz(t0, L1);
 828 
 829       __ ld(tmp_reg0, Address(s, 1 * unit));
 830       __ ld(tmp_reg1, Address(s, 2 * unit));
 831       __ ld(tmp_reg2, Address(s, 3 * unit));
 832       __ ld(tmp_reg3, Address(s, 4 * unit));
 833       __ addi(s, s, 4 * unit);
 834 
 835       __ sd(tmp_reg0, Address(d, 1 * unit));
 836       __ sd(tmp_reg1, Address(d, 2 * unit));
 837       __ sd(tmp_reg2, Address(d, 3 * unit));
 838       __ sd(tmp_reg3, Address(d, 4 * unit));
 839       __ addi(d, d, 4 * unit);
 840 
 841       __ bind(L1);
 842 
 843       if (direction == copy_forwards) {
 844         __ addi(s, s, bias);
 845         __ addi(d, d, bias);
 846       }
 847 
 848       __ test_bit(t0, count, 1);
 849       __ beqz(t0, L2);
 850       if (direction == copy_backwards) {
 851         __ addi(s, s, 2 * unit);
 852         __ ld(tmp_reg0, Address(s));
 853         __ ld(tmp_reg1, Address(s, wordSize));
 854         __ addi(d, d, 2 * unit);
 855         __ sd(tmp_reg0, Address(d));
 856         __ sd(tmp_reg1, Address(d, wordSize));
 857       } else {
 858         __ ld(tmp_reg0, Address(s));
 859         __ ld(tmp_reg1, Address(s, wordSize));
 860         __ addi(s, s, 2 * unit);
 861         __ sd(tmp_reg0, Address(d));
 862         __ sd(tmp_reg1, Address(d, wordSize));
 863         __ addi(d, d, 2 * unit);
 864       }
 865       __ bind(L2);
 866     }
 867 
 868     __ ret();
 869   }
 870 
 871   Label copy_f, copy_b;
 872 
 873   typedef void (MacroAssembler::*copy_insn)(Register Rd, const Address &adr, Register temp);
 874 
 875   void copy_memory_v(Register s, Register d, Register count, int step) {
 876     bool is_backward = step < 0;
 877     int granularity = uabs(step);
 878 
 879     const Register src = x30, dst = x31, vl = x14, cnt = x15, tmp1 = x16, tmp2 = x17;
 880     assert_different_registers(s, d, cnt, vl, tmp1, tmp2);
 881     Assembler::SEW sew = Assembler::elembytes_to_sew(granularity);
 882     Label loop_forward, loop_backward, done;
 883 
 884     __ mv(dst, d);
 885     __ mv(src, s);
 886     __ mv(cnt, count);
 887 
 888     __ bind(loop_forward);
 889     __ vsetvli(vl, cnt, sew, Assembler::m8);
 890     if (is_backward) {
 891       __ bne(vl, cnt, loop_backward);
 892     }
 893 
 894     __ vlex_v(v0, src, sew);
 895     __ sub(cnt, cnt, vl);
 896     if (sew != Assembler::e8) {
 897       // when sew == e8 (e.g., elem size is 1 byte), slli R, R, 0 is a nop and unnecessary
 898       __ slli(vl, vl, sew);
 899     }
 900     __ add(src, src, vl);
 901 
 902     __ vsex_v(v0, dst, sew);
 903     __ add(dst, dst, vl);
 904     __ bnez(cnt, loop_forward);
 905 
 906     if (is_backward) {
 907       __ j(done);
 908 
 909       __ bind(loop_backward);
 910       __ sub(t0, cnt, vl);
 911       if (sew != Assembler::e8) {
 912         // when sew == e8 (e.g., elem size is 1 byte), slli R, R, 0 is a nop and unnecessary
 913         __ slli(t0, t0, sew);
 914       }
 915       __ add(tmp1, s, t0);
 916       __ vlex_v(v0, tmp1, sew);
 917       __ add(tmp2, d, t0);
 918       __ vsex_v(v0, tmp2, sew);
 919       __ sub(cnt, cnt, vl);
 920       __ bnez(cnt, loop_forward);
 921       __ bind(done);
 922     }
 923   }
 924 
 925   // All-singing all-dancing memory copy.
 926   //
 927   // Copy count units of memory from s to d.  The size of a unit is
 928   // step, which can be positive or negative depending on the direction
 929   // of copy.
 930   //
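       // For example (illustrative), generate_conjoint_copy() below passes a
       // negative step, e.g. copy_memory(decorators, T_BYTE, aligned, s, d, count, -4)
       // for 4-byte elements, which makes every loop here walk src/dst downwards so
       // overlapping regions are copied safely back to front.
       //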
 931   void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
 932                    Register s, Register d, Register count, int step) {
 933     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
 934     if (UseRVV && (!is_reference_type(type) || bs_asm->supports_rvv_arraycopy())) {
 935       return copy_memory_v(s, d, count, step);
 936     }
 937 
 938     bool is_backwards = step < 0;
 939     int granularity = uabs(step);
 940 
 941     const Register src = x30, dst = x31, cnt = x15, tmp3 = x16, tmp4 = x17, tmp5 = x14, tmp6 = x13;
 942     const Register gct1 = x28, gct2 = x29, gct3 = t2;
 943 
 944     Label same_aligned;
 945     Label copy_big, copy32_loop, copy8_loop, copy_small, done;
 946 
 947     // The size of copy32_loop body increases significantly with ZGC GC barriers.
 948     // Need conditional far branches to reach a point beyond the loop in this case.
 949     bool is_far = UseZGC && ZGenerational;
 950 
 951     __ beqz(count, done, is_far);
 952     __ slli(cnt, count, exact_log2(granularity));
 953     if (is_backwards) {
 954       __ add(src, s, cnt);
 955       __ add(dst, d, cnt);
 956     } else {
 957       __ mv(src, s);
 958       __ mv(dst, d);
 959     }
 960 
 961     if (is_aligned) {
 962       __ addi(t0, cnt, -32);
 963       __ bgez(t0, copy32_loop);
 964       __ addi(t0, cnt, -8);
 965       __ bgez(t0, copy8_loop, is_far);
 966       __ j(copy_small);
 967     } else {
 968       __ mv(t0, 16);
 969       __ blt(cnt, t0, copy_small, is_far);
 970 
 971       __ xorr(t0, src, dst);
 972       __ andi(t0, t0, 0b111);
 973       __ bnez(t0, copy_small, is_far);
 974 
 975       __ bind(same_aligned);
 976       __ andi(t0, src, 0b111);
 977       __ beqz(t0, copy_big);
 978       if (is_backwards) {
 979         __ addi(src, src, step);
 980         __ addi(dst, dst, step);
 981       }
 982       bs_asm->copy_load_at(_masm, decorators, type, granularity, tmp3, Address(src), gct1);
 983       bs_asm->copy_store_at(_masm, decorators, type, granularity, Address(dst), tmp3, gct1, gct2, gct3);
 984       if (!is_backwards) {
 985         __ addi(src, src, step);
 986         __ addi(dst, dst, step);
 987       }
 988       __ addi(cnt, cnt, -granularity);
 989       __ beqz(cnt, done, is_far);
 990       __ j(same_aligned);
 991 
 992       __ bind(copy_big);
 993       __ mv(t0, 32);
 994       __ blt(cnt, t0, copy8_loop, is_far);
 995     }
 996 
 997     __ bind(copy32_loop);
 998     if (is_backwards) {
 999       __ addi(src, src, -wordSize * 4);
1000       __ addi(dst, dst, -wordSize * 4);
1001     }
1002     // we load 32 bytes first, then store them, so the direction here doesn't matter
1003     bs_asm->copy_load_at(_masm, decorators, type, 8, tmp3, Address(src),     gct1);
1004     bs_asm->copy_load_at(_masm, decorators, type, 8, tmp4, Address(src, 8),  gct1);
1005     bs_asm->copy_load_at(_masm, decorators, type, 8, tmp5, Address(src, 16), gct1);
1006     bs_asm->copy_load_at(_masm, decorators, type, 8, tmp6, Address(src, 24), gct1);
1007 
1008     bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst),     tmp3, gct1, gct2, gct3);
1009     bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst, 8),  tmp4, gct1, gct2, gct3);
1010     bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst, 16), tmp5, gct1, gct2, gct3);
1011     bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst, 24), tmp6, gct1, gct2, gct3);
1012 
1013     if (!is_backwards) {
1014       __ addi(src, src, wordSize * 4);
1015       __ addi(dst, dst, wordSize * 4);
1016     }
1017     __ addi(t0, cnt, -(32 + wordSize * 4));
1018     __ addi(cnt, cnt, -wordSize * 4);
1019     __ bgez(t0, copy32_loop); // cnt >= 32, do next loop
1020 
1021     __ beqz(cnt, done); // if that's all - done
1022 
1023     __ addi(t0, cnt, -8); // if not - copy the remainder
1024     __ bltz(t0, copy_small); // cnt < 8, go to copy_small, else fall through to copy8_loop
1025 
1026     __ bind(copy8_loop);
1027     if (is_backwards) {
1028       __ addi(src, src, -wordSize);
1029       __ addi(dst, dst, -wordSize);
1030     }
1031     bs_asm->copy_load_at(_masm, decorators, type, 8, tmp3, Address(src), gct1);
1032     bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst), tmp3, gct1, gct2, gct3);
1033 
1034     if (!is_backwards) {
1035       __ addi(src, src, wordSize);
1036       __ addi(dst, dst, wordSize);
1037     }
1038     __ addi(t0, cnt, -(8 + wordSize));
1039     __ addi(cnt, cnt, -wordSize);
1040     __ bgez(t0, copy8_loop); // cnt >= 8, do next loop
1041 
1042     __ beqz(cnt, done); // if that's all - done
1043 
1044     __ bind(copy_small);
1045     if (is_backwards) {
1046       __ addi(src, src, step);
1047       __ addi(dst, dst, step);
1048     }
1049 
1050     bs_asm->copy_load_at(_masm, decorators, type, granularity, tmp3, Address(src), gct1);
1051     bs_asm->copy_store_at(_masm, decorators, type, granularity, Address(dst), tmp3, gct1, gct2, gct3);
1052 
1053     if (!is_backwards) {
1054       __ addi(src, src, step);
1055       __ addi(dst, dst, step);
1056     }
1057     __ addi(cnt, cnt, -granularity);
1058     __ bgtz(cnt, copy_small);
1059 
1060     __ bind(done);
1061   }
1062 
1063   // Scan over array at a for count oops, verifying each one.
1064   // Preserves a and count, clobbers t0 and t1.
1065   void verify_oop_array(size_t size, Register a, Register count, Register temp) {
1066     Label loop, end;
1067     __ mv(t1, zr);
1068     __ slli(t0, count, exact_log2(size));
1069     __ bind(loop);
1070     __ bgeu(t1, t0, end);
1071 
1072     __ add(temp, a, t1);
1073     if (size == (size_t)wordSize) {
1074       __ ld(temp, Address(temp, 0));
1075       __ verify_oop(temp);
1076     } else {
1077       __ lwu(temp, Address(temp, 0));
1078       __ decode_heap_oop(temp); // calls verify_oop
1079     }
1080     __ add(t1, t1, size);
1081     __ j(loop);
1082     __ bind(end);
1083   }
1084 
1085   // Arguments:
1086   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1087   //             ignored
1088   //   is_oop  - true => oop array, so generate store check code
1089   //   name    - stub name string
1090   //
1091   // Inputs:
1092   //   c_rarg0   - source array address
1093   //   c_rarg1   - destination array address
1094   //   c_rarg2   - element count, treated as ssize_t, can be zero
1095   //
1096   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1097   // the hardware handle it.  The two dwords within qwords that span
1098   // cache line boundaries will still be loaded and stored atomically.
1099   //
1100   // Side Effects:
1101   //   disjoint_int_copy_entry is set to the no-overlap entry point
1102   //   used by generate_conjoint_int_oop_copy().
1103   //
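       // For example (illustrative), the element-width wrappers further below all
       // delegate here, along the lines of
       //   generate_disjoint_copy(sizeof (jint), aligned, /*is_oop*/ false, entry, name);
       // and the address written through 'entry' becomes the nooverlap_target that
       // generate_conjoint_copy() jumps to when the ranges cannot overlap.
       //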
1104   address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address* entry,
1105                                  const char* name, bool dest_uninitialized = false) {
1106     const Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1107     RegSet saved_reg = RegSet::of(s, d, count);
1108     __ align(CodeEntryAlignment);
1109     StubCodeMark mark(this, "StubRoutines", name);
1110     address start = __ pc();
1111     __ enter();
1112 
1113     if (entry != nullptr) {
1114       *entry = __ pc();
1115       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1116       BLOCK_COMMENT("Entry:");
1117     }
1118 
1119     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1120     if (dest_uninitialized) {
1121       decorators |= IS_DEST_UNINITIALIZED;
1122     }
1123     if (aligned) {
1124       decorators |= ARRAYCOPY_ALIGNED;
1125     }
1126 
1127     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1128     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
1129 
1130     if (is_oop) {
1131       // save regs before copy_memory
1132       __ push_reg(RegSet::of(d, count), sp);
1133     }
1134 
1135     {
1136       // UnsafeMemoryAccess page error: continue after unsafe access
1137       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1138       UnsafeMemoryAccessMark umam(this, add_entry, true);
1139       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
1140     }
1141 
1142     if (is_oop) {
1143       __ pop_reg(RegSet::of(d, count), sp);
1144       if (VerifyOops) {
1145         verify_oop_array(size, d, count, t2);
1146       }
1147     }
1148 
1149     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, t0, RegSet());
1150 
1151     __ leave();
1152     __ mv(x10, zr); // return 0
1153     __ ret();
1154     return start;
1155   }
1156 
1157   // Arguments:
1158   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1159   //             ignored
1160   //   is_oop  - true => oop array, so generate store check code
1161   //   name    - stub name string
1162   //
1163   // Inputs:
1164   //   c_rarg0   - source array address
1165   //   c_rarg1   - destination array address
1166   //   c_rarg2   - element count, treated as ssize_t, can be zero
1167   //
1168   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1169   // the hardware handle it.  The two dwords within qwords that span
1170   // cache line boundaries will still be loaded and stored atomically.
1171   //
1172   address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
1173                                  address* entry, const char* name,
1174                                  bool dest_uninitialized = false) {
1175     const Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1176     RegSet saved_regs = RegSet::of(s, d, count);
1177     StubCodeMark mark(this, "StubRoutines", name);
1178     address start = __ pc();
1179     __ enter();
1180 
1181     if (entry != nullptr) {
1182       *entry = __ pc();
1183       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1184       BLOCK_COMMENT("Entry:");
1185     }
1186 
1187     // use fwd copy when (d-s) above_equal (count*size)
1188     __ sub(t0, d, s);
1189     __ slli(t1, count, exact_log2(size));
1190     Label L_continue;
1191     __ bltu(t0, t1, L_continue);
1192     __ j(nooverlap_target);
1193     __ bind(L_continue);
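         // Worked example (illustrative): copying 10 jints with d == s + 64 gives
         // d - s == 64 >= 40 == count * size, so the ranges cannot overlap and we
         // tail-call the forward (disjoint) stub; if d < s the unsigned difference
         // is huge, so that case also takes the forward copy, which is safe even
         // when the ranges overlap. Only when d lies inside [s, s + count * size)
         // does control fall through to the backward copy generated below.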
1194 
1195     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1196     if (dest_uninitialized) {
1197       decorators |= IS_DEST_UNINITIALIZED;
1198     }
1199     if (aligned) {
1200       decorators |= ARRAYCOPY_ALIGNED;
1201     }
1202 
1203     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1204     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
1205 
1206     if (is_oop) {
1207       // save regs before copy_memory
1208       __ push_reg(RegSet::of(d, count), sp);
1209     }
1210 
1211     {
1212       // UnsafeMemoryAccess page error: continue after unsafe access
1213       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1214       UnsafeMemoryAccessMark umam(this, add_entry, true);
1215       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
1216     }
1217 
1218     if (is_oop) {
1219       __ pop_reg(RegSet::of(d, count), sp);
1220       if (VerifyOops) {
1221         verify_oop_array(size, d, count, t2);
1222       }
1223     }
1224     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, t0, RegSet());
1225     __ leave();
1226     __ mv(x10, zr); // return 0
1227     __ ret();
1228     return start;
1229   }
1230 
1231   // Arguments:
1232   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1233   //             ignored
1234   //   name    - stub name string
1235   //
1236   // Inputs:
1237   //   c_rarg0   - source array address
1238   //   c_rarg1   - destination array address
1239   //   c_rarg2   - element count, treated as ssize_t, can be zero
1240   //
1241   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1242   // we let the hardware handle it.  The one to eight bytes within words,
1243   // dwords or qwords that span cache line boundaries will still be loaded
1244   // and stored atomically.
1245   //
1246   // Side Effects:
1247   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1255   //   used by generate_conjoint_byte_copy().
1256   //
1257   address generate_disjoint_byte_copy(bool aligned, address* entry, const char* name) {
1258     const bool not_oop = false;
1259     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1260   }
1261 
1262   // Arguments:
1263   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1264   //             ignored
1265   //   name    - stub name string
1266   //
1267   // Inputs:
1268   //   c_rarg0   - source array address
1269   //   c_rarg1   - destination array address
1270   //   c_rarg2   - element count, treated as ssize_t, can be zero
1271   //
1272   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1273   // we let the hardware handle it.  The one to eight bytes within words,
1274   // dwords or qwords that span cache line boundaries will still be loaded
1275   // and stored atomically.
1276   //
1277   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1278                                       address* entry, const char* name) {
1279     const bool not_oop = false;
1280     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1281   }
1282 
1283   // Arguments:
1284   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1285   //             ignored
1286   //   name    - stub name string
1287   //
1288   // Inputs:
1289   //   c_rarg0   - source array address
1290   //   c_rarg1   - destination array address
1291   //   c_rarg2   - element count, treated as ssize_t, can be zero
1292   //
1293   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1294   // let the hardware handle it.  The two or four words within dwords
1295   // or qwords that span cache line boundaries will still be loaded
1296   // and stored atomically.
1297   //
1298   // Side Effects:
1299   //   disjoint_short_copy_entry is set to the no-overlap entry point
1300   //   used by generate_conjoint_short_copy().
1301   //
1302   address generate_disjoint_short_copy(bool aligned,
1303                                        address* entry, const char* name) {
1304     const bool not_oop = false;
1305     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1306   }
1307 
1308   // Arguments:
1309   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1310   //             ignored
1311   //   name    - stub name string
1312   //
1313   // Inputs:
1314   //   c_rarg0   - source array address
1315   //   c_rarg1   - destination array address
1316   //   c_rarg2   - element count, treated as ssize_t, can be zero
1317   //
1318   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1319   // let the hardware handle it.  The two or four words within dwords
1320   // or qwords that span cache line boundaries will still be loaded
1321   // and stored atomically.
1322   //
1323   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1324                                        address* entry, const char* name) {
1325     const bool not_oop = false;
1326     return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
1327   }
1328 
1329   // Arguments:
1330   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1331   //             ignored
1332   //   name    - stub name string
1333   //
1334   // Inputs:
1335   //   c_rarg0   - source array address
1336   //   c_rarg1   - destination array address
1337   //   c_rarg2   - element count, treated as ssize_t, can be zero
1338   //
1339   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1340   // the hardware handle it.  The two dwords within qwords that span
1341   // cache line boundaries will still be loaded and stored atomically.
1342   //
1343   // Side Effects:
1344   //   disjoint_int_copy_entry is set to the no-overlap entry point
1345   //   used by generate_conjoint_int_oop_copy().
1346   //
1347   address generate_disjoint_int_copy(bool aligned, address* entry,
1348                                      const char* name, bool dest_uninitialized = false) {
1349     const bool not_oop = false;
1350     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1351   }
1352 
1353   // Arguments:
1354   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1355   //             ignored
1356   //   name    - stub name string
1357   //
1358   // Inputs:
1359   //   c_rarg0   - source array address
1360   //   c_rarg1   - destination array address
1361   //   c_rarg2   - element count, treated as ssize_t, can be zero
1362   //
1363   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1364   // the hardware handle it.  The two dwords within qwords that span
1365   // cache line boundaries will still be loaded and stored atomically.
1366   //
1367   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1368                                      address* entry, const char* name,
1369                                      bool dest_uninitialized = false) {
1370     const bool not_oop = false;
1371     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1372   }
1373 
1374 
1375   // Arguments:
1376   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1377   //             ignored
1378   //   name    - stub name string
1379   //
1380   // Inputs:
1381   //   c_rarg0   - source array address
1382   //   c_rarg1   - destination array address
1383   //   c_rarg2   - element count, treated as size_t, can be zero
1384   //
1385   // Side Effects:
1386   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1387   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1388   //
1389   address generate_disjoint_long_copy(bool aligned, address* entry,
1390                                       const char* name, bool dest_uninitialized = false) {
1391     const bool not_oop = false;
1392     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1393   }
1394 
1395   // Arguments:
1396   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1397   //             ignored
1398   //   name    - stub name string
1399   //
1400   // Inputs:
1401   //   c_rarg0   - source array address
1402   //   c_rarg1   - destination array address
1403   //   c_rarg2   - element count, treated as size_t, can be zero
1404   //
1405   address generate_conjoint_long_copy(bool aligned,
1406                                       address nooverlap_target, address* entry,
1407                                       const char* name, bool dest_uninitialized = false) {
1408     const bool not_oop = false;
1409     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1410   }
1411 
1412   // Arguments:
1413   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1414   //             ignored
1415   //   name    - stub name string
1416   //
1417   // Inputs:
1418   //   c_rarg0   - source array address
1419   //   c_rarg1   - destination array address
1420   //   c_rarg2   - element count, treated as size_t, can be zero
1421   //
1422   // Side Effects:
1423   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1424   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1425   //
1426   address generate_disjoint_oop_copy(bool aligned, address* entry,
1427                                      const char* name, bool dest_uninitialized) {
1428     const bool is_oop = true;
1429     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1430     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1431   }
1432 
1433   // Arguments:
1434   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1435   //             ignored
1436   //   name    - stub name string
1437   //
1438   // Inputs:
1439   //   c_rarg0   - source array address
1440   //   c_rarg1   - destination array address
1441   //   c_rarg2   - element count, treated as size_t, can be zero
1442   //
1443   address generate_conjoint_oop_copy(bool aligned,
1444                                      address nooverlap_target, address* entry,
1445                                      const char* name, bool dest_uninitialized) {
1446     const bool is_oop = true;
1447     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1448     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1449                                   name, dest_uninitialized);
1450   }
1451 
1452   // Helper for generating a dynamic type check.
1453   // Smashes t0, t1.
1454   void generate_type_check(Register sub_klass,
1455                            Register super_check_offset,
1456                            Register super_klass,
1457                            Label& L_success) {
1458     assert_different_registers(sub_klass, super_check_offset, super_klass);
1459 
1460     BLOCK_COMMENT("type_check:");
1461 
1462     Label L_miss;
1463 
1464     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, nullptr, super_check_offset);
1465     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, nullptr);
1466 
1467     // Fall through on failure!
1468     __ BIND(L_miss);
1469   }
1470 
1471   //
1472   //  Generate checkcasting array copy stub
1473   //
1474   //  Input:
1475   //    c_rarg0   - source array address
1476   //    c_rarg1   - destination array address
1477   //    c_rarg2   - element count, treated as ssize_t, can be zero
1478   //    c_rarg3   - size_t ckoff (super_check_offset)
1479   //    c_rarg4   - oop ckval (super_klass)
1480   //
1481   //  Output:
1482   //    x10 ==  0  -  success
1483   //    x10 == -1^K - failure, where K is partial transfer count
1484   //
1485   address generate_checkcast_copy(const char* name, address* entry,
1486                                   bool dest_uninitialized = false) {
1487     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1488 
1489     // Input registers (after setup_arg_regs)
1490     const Register from        = c_rarg0;   // source array address
1491     const Register to          = c_rarg1;   // destination array address
1492     const Register count       = c_rarg2;   // elements count
1493     const Register ckoff       = c_rarg3;   // super_check_offset
1494     const Register ckval       = c_rarg4;   // super_klass
1495 
1496     RegSet wb_pre_saved_regs   = RegSet::range(c_rarg0, c_rarg4);
1497     RegSet wb_post_saved_regs  = RegSet::of(count);
1498 
1499     // Registers used as temps (x7, x9, x18 are save-on-entry)
1500     const Register count_save  = x19;       // orig elements count
1501     const Register start_to    = x18;       // destination array start address
1502     const Register copied_oop  = x7;        // actual oop copied
1503     const Register r9_klass    = x9;        // oop._klass
1504 
1505     // Registers used as gc temps (x15, x16, x17 are save-on-call)
1506     const Register gct1 = x15, gct2 = x16, gct3 = x17;
1507 
1508     //---------------------------------------------------------------
1509     // Assembler stub will be used for this call to arraycopy
1510     // if the two arrays are subtypes of Object[] but the
1511     // destination array type is not equal to or a supertype
1512     // of the source type.  Each element must be separately
1513     // checked.
1514 
1515     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1516                                copied_oop, r9_klass, count_save);
1517 
1518     __ align(CodeEntryAlignment);
1519     StubCodeMark mark(this, "StubRoutines", name);
1520     address start = __ pc();
1521 
1522     __ enter(); // required for proper stackwalking of RuntimeStub frame
1523 
1524     // Caller of this entry point must set up the argument registers.
1525     if (entry != nullptr) {
1526       *entry = __ pc();
1527       BLOCK_COMMENT("Entry:");
1528     }
1529 
1530     // Empty array:  Nothing to do
1531     __ beqz(count, L_done);
1532 
1533     __ push_reg(RegSet::of(x7, x9, x18, x19), sp);
1534 
1535 #ifdef ASSERT
1536     BLOCK_COMMENT("assert consistent ckoff/ckval");
1537     // The ckoff and ckval must be mutually consistent,
1538     // even though caller generates both.
1539     { Label L;
1540       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1541       __ lwu(start_to, Address(ckval, sco_offset));
1542       __ beq(ckoff, start_to, L);
1543       __ stop("super_check_offset inconsistent");
1544       __ bind(L);
1545     }
1546 #endif //ASSERT
1547 
1548     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1549     if (dest_uninitialized) {
1550       decorators |= IS_DEST_UNINITIALIZED;
1551     }
1552 
1553     bool is_oop = true;
1554     int element_size = UseCompressedOops ? 4 : 8;
1555 
1556     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1557     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
1558 
1559     // save the original count
1560     __ mv(count_save, count);
1561 
1562     // Copy from low to high addresses
1563     __ mv(start_to, to);              // Save destination array start address
1564     __ j(L_load_element);
1565 
1566     // ======== begin loop ========
1567     // (Loop is rotated; its entry is L_load_element.)
1568     // Loop control:
1569     //   for count to 0 do
1570     //     copied_oop = load_heap_oop(from++)
1571     //     ... generate_type_check ...
1572     //     store_heap_oop(to++, copied_oop)
1573     //   end
1574 
1575     __ align(OptoLoopAlignment);
1576 
1577     __ BIND(L_store_element);
1578     bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
1579                       Address(to, 0), copied_oop,
1580                       gct1, gct2, gct3);
1581     __ add(to, to, UseCompressedOops ? 4 : 8);
1582     __ sub(count, count, 1);
1583     __ beqz(count, L_do_card_marks);
1584 
1585     // ======== loop entry is here ========
1586     __ BIND(L_load_element);
1587     bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
1588                      copied_oop, Address(from, 0),
1589                      gct1);
1590     __ add(from, from, UseCompressedOops ? 4 : 8);
1591     __ beqz(copied_oop, L_store_element);
1592 
1593     __ load_klass(r9_klass, copied_oop); // query the object klass
1594     generate_type_check(r9_klass, ckoff, ckval, L_store_element);
1595     // ======== end loop ========
1596 
1597     // It was a real error; we must depend on the caller to finish the job.
1598     // Register count = remaining oops, count_orig = total oops.
1599     // Emit GC store barriers for the oops we have copied and report
1600     // their number to the caller.
1601 
1602     __ sub(count, count_save, count);     // K = partially copied oop count
1603     __ xori(count, count, -1);                   // report (-1^K) to caller
1604     __ beqz(count, L_done_pop);
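    // Illustrative note (added commentary, not generated code): the failure
    // value is the bitwise complement of the number of elements successfully
    // copied.  For example, if 10 elements were requested and the type check
    // failed after 4 had been stored, then count == 6 on loop exit, so
    //   K   = count_save - count = 10 - 6 = 4
    //   x10 = -1 ^ K = ~4 = -5
    // and the caller can recover K as ~x10.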
1605 
1606     __ BIND(L_do_card_marks);
1607     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, t0, wb_post_saved_regs);
1608 
1609     __ bind(L_done_pop);
1610     __ pop_reg(RegSet::of(x7, x9, x18, x19), sp);
1611     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1612 
1613     __ bind(L_done);
1614     __ mv(x10, count);
1615     __ leave();
1616     __ ret();
1617 
1618     return start;
1619   }
1620 
1621   // Perform range checks on the proposed arraycopy.
1622   // Kills temp, but nothing else.
1623   // Also, clean the sign bits of src_pos and dst_pos.
1624   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1625                               Register src_pos, // source position (c_rarg1)
1626                               Register dst,     // destination array oop (c_rarg2)
1627                               Register dst_pos, // destination position (c_rarg3)
1628                               Register length,
1629                               Register temp,
1630                               Label& L_failed) {
1631     BLOCK_COMMENT("arraycopy_range_checks:");
1632 
1633     assert_different_registers(t0, temp);
1634 
1635     // if [src_pos + length > arrayOop(src)->length()] then FAIL
1636     __ lwu(t0, Address(src, arrayOopDesc::length_offset_in_bytes()));
1637     __ addw(temp, length, src_pos);
1638     __ bgtu(temp, t0, L_failed);
1639 
1640     // if [dst_pos + length > arrayOop(dst)->length()] then FAIL
1641     __ lwu(t0, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1642     __ addw(temp, length, dst_pos);
1643     __ bgtu(temp, t0, L_failed);
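    // Rough C-level sketch of the two checks above (illustrative only; the
    // generated code uses 32-bit unsigned arithmetic via lwu/addw/bgtu):
    //   if ((uint32_t)(src_pos + length) > (uint32_t)arrayOop(src)->length()) goto L_failed;
    //   if ((uint32_t)(dst_pos + length) > (uint32_t)arrayOop(dst)->length()) goto L_failed;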
1644 
1645     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1646     __ zero_extend(src_pos, src_pos, 32);
1647     __ zero_extend(dst_pos, dst_pos, 32);
1648 
1649     BLOCK_COMMENT("arraycopy_range_checks done");
1650   }
1651 
1652   //
1653   //  Generate 'unsafe' array copy stub
1654   //  Though just as safe as the other stubs, it takes an unscaled
1655   //  size_t argument instead of an element count.
1656   //
1657   //  Input:
1658   //    c_rarg0   - source array address
1659   //    c_rarg1   - destination array address
1660   //    c_rarg2   - byte count, treated as ssize_t, can be zero
1661   //
1662   // Examines the alignment of the operands and dispatches
1663   // to a long, int, short, or byte copy loop.
1664   //
1665   address generate_unsafe_copy(const char* name,
1666                                address byte_copy_entry,
1667                                address short_copy_entry,
1668                                address int_copy_entry,
1669                                address long_copy_entry) {
1670     assert_cond(byte_copy_entry != nullptr && short_copy_entry != nullptr &&
1671                 int_copy_entry != nullptr && long_copy_entry != nullptr);
1672     Label L_long_aligned, L_int_aligned, L_short_aligned;
1673     const Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1674 
1675     __ align(CodeEntryAlignment);
1676     StubCodeMark mark(this, "StubRoutines", name);
1677     address start = __ pc();
1678     __ enter(); // required for proper stackwalking of RuntimeStub frame
1679 
1680     // bump this on entry, not on exit:
1681     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1682 
1683     __ orr(t0, s, d);
1684     __ orr(t0, t0, count);
1685 
1686     __ andi(t0, t0, BytesPerLong - 1);
1687     __ beqz(t0, L_long_aligned);
1688     __ andi(t0, t0, BytesPerInt - 1);
1689     __ beqz(t0, L_int_aligned);
1690     __ test_bit(t0, t0, 0);
1691     __ beqz(t0, L_short_aligned);
1692     __ j(RuntimeAddress(byte_copy_entry));
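    // The dispatch above is equivalent to the following sketch (illustrative,
    // where t = s | d | count combines all low-order alignment bits):
    //   if ((t & 7) == 0)      goto L_long_aligned;
    //   else if ((t & 3) == 0) goto L_int_aligned;
    //   else if ((t & 1) == 0) goto L_short_aligned;
    //   else                   tail-call byte_copy_entry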
1693 
1694     __ BIND(L_short_aligned);
1695     __ srli(count, count, LogBytesPerShort);  // size => short_count
1696     __ j(RuntimeAddress(short_copy_entry));
1697     __ BIND(L_int_aligned);
1698     __ srli(count, count, LogBytesPerInt);    // size => int_count
1699     __ j(RuntimeAddress(int_copy_entry));
1700     __ BIND(L_long_aligned);
1701     __ srli(count, count, LogBytesPerLong);   // size => long_count
1702     __ j(RuntimeAddress(long_copy_entry));
1703 
1704     return start;
1705   }
1706 
1707   //
1708   //  Generate generic array copy stubs
1709   //
1710   //  Input:
1711   //    c_rarg0    -  src oop
1712   //    c_rarg1    -  src_pos (32-bits)
1713   //    c_rarg2    -  dst oop
1714   //    c_rarg3    -  dst_pos (32-bits)
1715   //    c_rarg4    -  element count (32-bits)
1716   //
1717   //  Output:
1718   //    x10 ==  0  -  success
1719   //    x10 == -1^K - failure, where K is partial transfer count
1720   //
1721   address generate_generic_copy(const char* name,
1722                                 address byte_copy_entry, address short_copy_entry,
1723                                 address int_copy_entry, address oop_copy_entry,
1724                                 address long_copy_entry, address checkcast_copy_entry) {
1725     assert_cond(byte_copy_entry != nullptr && short_copy_entry != nullptr &&
1726                 int_copy_entry != nullptr && oop_copy_entry != nullptr &&
1727                 long_copy_entry != nullptr && checkcast_copy_entry != nullptr);
1728     Label L_failed, L_failed_0, L_objArray;
1729     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
1730 
1731     // Input registers
1732     const Register src        = c_rarg0;  // source array oop
1733     const Register src_pos    = c_rarg1;  // source position
1734     const Register dst        = c_rarg2;  // destination array oop
1735     const Register dst_pos    = c_rarg3;  // destination position
1736     const Register length     = c_rarg4;
1737 
1738     // Registers used as temps
1739     const Register dst_klass = c_rarg5;
1740 
1741     __ align(CodeEntryAlignment);
1742 
1743     StubCodeMark mark(this, "StubRoutines", name);
1744 
1745     address start = __ pc();
1746 
1747     __ enter(); // required for proper stackwalking of RuntimeStub frame
1748 
1749     // bump this on entry, not on exit:
1750     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
1751 
1752     //-----------------------------------------------------------------------
1753     // Assembler stub will be used for this call to arraycopy
1754     // if the following conditions are met:
1755     //
1756     // (1) src and dst must not be null.
1757     // (2) src_pos must not be negative.
1758     // (3) dst_pos must not be negative.
1759     // (4) length  must not be negative.
1760     // (5) src klass and dst klass should be the same and not null.
1761     // (6) src and dst should be arrays.
1762     // (7) src_pos + length must not exceed length of src.
1763     // (8) dst_pos + length must not exceed length of dst.
1764     //
1765 
1766     // if src is null then return -1
1767     __ beqz(src, L_failed);
1768 
1769     // if [src_pos < 0] then return -1
1770     __ sign_extend(t0, src_pos, 32);
1771     __ bltz(t0, L_failed);
1772 
1773     // if dst is null then return -1
1774     __ beqz(dst, L_failed);
1775 
1776     // if [dst_pos < 0] then return -1
1777     __ sign_extend(t0, dst_pos, 32);
1778     __ bltz(t0, L_failed);
1779 
1780     // registers used as temp
1781     const Register scratch_length    = x28; // elements count to copy
1782     const Register scratch_src_klass = x29; // array klass
1783     const Register lh                = x30; // layout helper
1784 
1785     // if [length < 0] then return -1
1786     __ sign_extend(scratch_length, length, 32);    // length (elements count, 32-bits value)
1787     __ bltz(scratch_length, L_failed);
1788 
1789     __ load_klass(scratch_src_klass, src);
1790 #ifdef ASSERT
1791     {
1792       BLOCK_COMMENT("assert klasses not null {");
1793       Label L1, L2;
1794       __ bnez(scratch_src_klass, L2);   // it is broken if klass is null
1795       __ bind(L1);
1796       __ stop("broken null klass");
1797       __ bind(L2);
1798       __ load_klass(t0, dst, t1);
1799       __ beqz(t0, L1);     // this would be broken also
1800       BLOCK_COMMENT("} assert klasses not null done");
1801     }
1802 #endif
1803 
1804     // Load layout helper (32-bits)
1805     //
1806     //  |array_tag|     | header_size | element_type |     |log2_element_size|
1807     // 32        30    24            16              8     2                 0
1808     //
1809     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
1810     //
1811 
1812     const int lh_offset = in_bytes(Klass::layout_helper_offset());
1813 
1814     // Handle objArrays completely differently...
1815     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
1816     __ lw(lh, Address(scratch_src_klass, lh_offset));
1817     __ mv(t0, objArray_lh);
1818     __ beq(lh, t0, L_objArray);
1819 
1820     // if [src->klass() != dst->klass()] then return -1
1821     __ load_klass(t1, dst);
1822     __ bne(t1, scratch_src_klass, L_failed);
1823 
1824     // if !src->is_Array() then return -1
1825     // i.e. fail if the layout helper is non-negative (lh >= 0)
1826     __ bgez(lh, L_failed);
1827 
1828     // At this point, it is known to be a typeArray (array_tag 0x3).
1829 #ifdef ASSERT
1830     {
1831       BLOCK_COMMENT("assert primitive array {");
1832       Label L;
1833       __ mv(t1, (int32_t)(Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift));
1834       __ bge(lh, t1, L);
1835       __ stop("must be a primitive array");
1836       __ bind(L);
1837       BLOCK_COMMENT("} assert primitive array done");
1838     }
1839 #endif
1840 
1841     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
1842                            t1, L_failed);
1843 
1844     // TypeArrayKlass
1845     //
1846     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize)
1847     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize)
1848     //
1849 
1850     const Register t0_offset = t0;    // array offset
1851     const Register x30_elsize = lh;   // element size
1852 
1853     // Get array_header_in_bytes()
1854     int lh_header_size_width = exact_log2(Klass::_lh_header_size_mask + 1);
1855     int lh_header_size_msb = Klass::_lh_header_size_shift + lh_header_size_width;
1856     __ slli(t0_offset, lh, XLEN - lh_header_size_msb);          // shift left to drop bits 24..31 above the header_size field
1857     __ srli(t0_offset, t0_offset, XLEN - lh_header_size_width); // array_offset
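    // Equivalently (illustrative note): the two shifts above compute
    //   t0_offset = (lh >> Klass::_lh_header_size_shift) & Klass::_lh_header_size_mask,
    // i.e. the array header size in bytes encoded in the layout helper.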
1858 
1859     __ add(src, src, t0_offset);           // src array offset
1860     __ add(dst, dst, t0_offset);           // dst array offset
1861     BLOCK_COMMENT("choose copy loop based on element size");
1862 
1863     // next registers should be set before the jump to corresponding stub
1864     const Register from     = c_rarg0;  // source array address
1865     const Register to       = c_rarg1;  // destination array address
1866     const Register count    = c_rarg2;  // elements count
1867 
1868     // 'from', 'to', 'count' registers should be set in such order
1869     // since they are the same as 'src', 'src_pos', 'dst'.
1870 
1871     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
1872 
1873     // The possible values of elsize are 0-3, i.e. exact_log2(element
1874     // size in bytes).  We do a simple bitwise binary search.
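    // Decision tree (illustrative): bit 1 of elsize separates {byte, short}
    // (elsize 0 or 1) from {int, long} (elsize 2 or 3); bit 0 then selects
    // within each pair, so
    //   0b00 -> byte copy, 0b01 -> short copy, 0b10 -> int copy, 0b11 -> long copy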
1875   __ BIND(L_copy_bytes);
1876     __ test_bit(t0, x30_elsize, 1);
1877     __ bnez(t0, L_copy_ints);
1878     __ test_bit(t0, x30_elsize, 0);
1879     __ bnez(t0, L_copy_shorts);
1880     __ add(from, src, src_pos); // src_addr
1881     __ add(to, dst, dst_pos); // dst_addr
1882     __ sign_extend(count, scratch_length, 32); // length
1883     __ j(RuntimeAddress(byte_copy_entry));
1884 
1885   __ BIND(L_copy_shorts);
1886     __ shadd(from, src_pos, src, t0, 1); // src_addr
1887     __ shadd(to, dst_pos, dst, t0, 1); // dst_addr
1888     __ sign_extend(count, scratch_length, 32); // length
1889     __ j(RuntimeAddress(short_copy_entry));
1890 
1891   __ BIND(L_copy_ints);
1892     __ test_bit(t0, x30_elsize, 0);
1893     __ bnez(t0, L_copy_longs);
1894     __ shadd(from, src_pos, src, t0, 2); // src_addr
1895     __ shadd(to, dst_pos, dst, t0, 2); // dst_addr
1896     __ sign_extend(count, scratch_length, 32); // length
1897     __ j(RuntimeAddress(int_copy_entry));
1898 
1899   __ BIND(L_copy_longs);
1900 #ifdef ASSERT
1901     {
1902       BLOCK_COMMENT("assert long copy {");
1903       Label L;
1904       __ andi(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> x30_elsize
1905       __ sign_extend(lh, lh, 32);
1906       __ mv(t0, LogBytesPerLong);
1907       __ beq(x30_elsize, t0, L);
1908       __ stop("must be long copy, but elsize is wrong");
1909       __ bind(L);
1910       BLOCK_COMMENT("} assert long copy done");
1911     }
1912 #endif
1913     __ shadd(from, src_pos, src, t0, 3); // src_addr
1914     __ shadd(to, dst_pos, dst, t0, 3); // dst_addr
1915     __ sign_extend(count, scratch_length, 32); // length
1916     __ j(RuntimeAddress(long_copy_entry));
1917 
1918     // ObjArrayKlass
1919   __ BIND(L_objArray);
1920     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
1921 
1922     Label L_plain_copy, L_checkcast_copy;
1923     // test array classes for subtyping
1924     __ load_klass(t2, dst);
1925     __ bne(scratch_src_klass, t2, L_checkcast_copy); // usual case is exact equality
1926 
1927     // Identically typed arrays can be copied without element-wise checks.
1928     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
1929                            t1, L_failed);
1930 
1931     __ shadd(from, src_pos, src, t0, LogBytesPerHeapOop);
1932     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
1933     __ shadd(to, dst_pos, dst, t0, LogBytesPerHeapOop);
1934     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
1935     __ sign_extend(count, scratch_length, 32); // length
1936   __ BIND(L_plain_copy);
1937     __ j(RuntimeAddress(oop_copy_entry));
1938 
1939   __ BIND(L_checkcast_copy);
1940     // live at this point:  scratch_src_klass, scratch_length, t2 (dst_klass)
1941     {
1942       // Before looking at dst.length, make sure dst is also an objArray.
1943       __ lwu(t0, Address(t2, lh_offset));
1944       __ mv(t1, objArray_lh);
1945       __ bne(t0, t1, L_failed);
1946 
1947       // It is safe to examine both src.length and dst.length.
1948       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
1949                              t2, L_failed);
1950 
1951       __ load_klass(dst_klass, dst); // reload
1952 
1953       // Marshal the base address arguments now, freeing registers.
1954       __ shadd(from, src_pos, src, t0, LogBytesPerHeapOop);
1955       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
1956       __ shadd(to, dst_pos, dst, t0, LogBytesPerHeapOop);
1957       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
1958       __ sign_extend(count, length, 32);      // length (reloaded)
1959       const Register sco_temp = c_rarg3;      // this register is free now
1960       assert_different_registers(from, to, count, sco_temp,
1961                                  dst_klass, scratch_src_klass);
1962 
1963       // Generate the type check.
1964       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
1965       __ lwu(sco_temp, Address(dst_klass, sco_offset));
1966 
1967       // Smashes t0, t1
1968       generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy);
1969 
1970       // Fetch destination element klass from the ObjArrayKlass header.
1971       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
1972       __ ld(dst_klass, Address(dst_klass, ek_offset));
1973       __ lwu(sco_temp, Address(dst_klass, sco_offset));
1974 
1975       // the checkcast_copy loop needs two extra arguments:
1976       assert(c_rarg3 == sco_temp, "#3 already in place");
1977       // Set up arguments for checkcast_copy_entry.
1978       __ mv(c_rarg4, dst_klass);  // dst.klass.element_klass
1979       __ j(RuntimeAddress(checkcast_copy_entry));
1980     }
1981 
1982   __ BIND(L_failed);
1983     __ mv(x10, -1);
1984     __ leave();   // required for proper stackwalking of RuntimeStub frame
1985     __ ret();
1986 
1987     return start;
1988   }
1989 
1990   //
1991   // Generate stub for array fill. If "aligned" is true, the
1992   // "to" address is assumed to be heapword aligned.
1993   //
1994   // Arguments for generated stub:
1995   //   to:    c_rarg0
1996   //   value: c_rarg1
1997   //   count: c_rarg2 treated as signed
1998   //
1999   address generate_fill(BasicType t, bool aligned, const char* name) {
2000     __ align(CodeEntryAlignment);
2001     StubCodeMark mark(this, "StubRoutines", name);
2002     address start = __ pc();
2003 
2004     BLOCK_COMMENT("Entry:");
2005 
2006     const Register to        = c_rarg0;  // destination array address
2007     const Register value     = c_rarg1;  // value
2008     const Register count     = c_rarg2;  // elements count
2009 
2010     const Register bz_base   = x28;      // base for block_zero routine
2011     const Register cnt_words = x29;      // temp register
2012     const Register tmp_reg   = t1;
2013 
2014     __ enter();
2015 
2016     Label L_fill_elements, L_exit1;
2017 
2018     int shift = -1;
2019     switch (t) {
2020       case T_BYTE:
2021         shift = 0;
2022 
2023         // Zero extend value
2024         // 8 bit -> 16 bit
2025         __ andi(value, value, 0xff);
2026         __ mv(tmp_reg, value);
2027         __ slli(tmp_reg, tmp_reg, 8);
2028         __ orr(value, value, tmp_reg);
2029 
2030         // 16 bit -> 32 bit
2031         __ mv(tmp_reg, value);
2032         __ slli(tmp_reg, tmp_reg, 16);
2033         __ orr(value, value, tmp_reg);
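        // Worked example (illustrative): for value == 0x5A the two steps above
        // yield 0x5A5A and then 0x5A5A5A5A; the widening to 64 bits happens
        // later, just before fill_words.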
2034 
2035         __ mv(tmp_reg, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2036         __ bltu(count, tmp_reg, L_fill_elements);
2037         break;
2038       case T_SHORT:
2039         shift = 1;
2040         // Zero extend value
2041         // 16 bit -> 32 bit
2042         __ andi(value, value, 0xffff);
2043         __ mv(tmp_reg, value);
2044         __ slli(tmp_reg, tmp_reg, 16);
2045         __ orr(value, value, tmp_reg);
2046 
2047         // Short arrays (< 8 bytes) fill by element
2048         __ mv(tmp_reg, 8 >> shift);
2049         __ bltu(count, tmp_reg, L_fill_elements);
2050         break;
2051       case T_INT:
2052         shift = 2;
2053 
2054         // Short arrays (< 8 bytes) fill by element
2055         __ mv(tmp_reg, 8 >> shift);
2056         __ bltu(count, tmp_reg, L_fill_elements);
2057         break;
2058       default: ShouldNotReachHere();
2059     }
2060 
2061     // Align the destination address to an 8-byte boundary.
2062     Label L_skip_align1, L_skip_align2, L_skip_align4;
2063     if (!aligned) {
2064       switch (t) {
2065         case T_BYTE:
2066           // One byte misalignment happens only for byte arrays.
2067           __ test_bit(t0, to, 0);
2068           __ beqz(t0, L_skip_align1);
2069           __ sb(value, Address(to, 0));
2070           __ addi(to, to, 1);
2071           __ addiw(count, count, -1);
2072           __ bind(L_skip_align1);
2073           // Fallthrough
2074         case T_SHORT:
2075           // Two bytes misalignment happens only for byte and short (char) arrays.
2076           __ test_bit(t0, to, 1);
2077           __ beqz(t0, L_skip_align2);
2078           __ sh(value, Address(to, 0));
2079           __ addi(to, to, 2);
2080           __ addiw(count, count, -(2 >> shift));
2081           __ bind(L_skip_align2);
2082           // Fallthrough
2083         case T_INT:
2084           // Align to 8 bytes, we know we are 4 byte aligned to start.
2085           __ test_bit(t0, to, 2);
2086           __ beqz(t0, L_skip_align4);
2087           __ sw(value, Address(to, 0));
2088           __ addi(to, to, 4);
2089           __ addiw(count, count, -(4 >> shift));
2090           __ bind(L_skip_align4);
2091           break;
2092         default: ShouldNotReachHere();
2093       }
2094     }
2095 
2096     //
2097     //  Fill large chunks
2098     //
2099     __ srliw(cnt_words, count, 3 - shift); // number of words
2100 
2101     // 32 bit -> 64 bit
2102     __ andi(value, value, 0xffffffff);
2103     __ mv(tmp_reg, value);
2104     __ slli(tmp_reg, tmp_reg, 32);
2105     __ orr(value, value, tmp_reg);
2106 
2107     __ slli(tmp_reg, cnt_words, 3 - shift);
2108     __ subw(count, count, tmp_reg);
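    // Worked example (illustrative): a T_SHORT fill (shift == 1) of count == 13
    // elements gives cnt_words = 13 >> 2 = 3 (three 8-byte words, 12 shorts),
    // leaving count = 13 - 12 = 1 element for the tail store below.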
2109     {
2110       __ fill_words(to, cnt_words, value);
2111     }
2112 
2113     // Remaining count is less than 8 bytes. Fill it by a single store.
2114     // Note that the total length is no less than 8 bytes.
2115     if (t == T_BYTE || t == T_SHORT) {
2116       __ beqz(count, L_exit1);
2117       __ shadd(to, count, to, tmp_reg, shift); // points to the end
2118       __ sd(value, Address(to, -8)); // overwrite some elements
2119       __ bind(L_exit1);
2120       __ leave();
2121       __ ret();
2122     }
2123 
2124     // Handle fills of less than 8 bytes.
2125     Label L_fill_2, L_fill_4, L_exit2;
2126     __ bind(L_fill_elements);
2127     switch (t) {
2128       case T_BYTE:
2129         __ test_bit(t0, count, 0);
2130         __ beqz(t0, L_fill_2);
2131         __ sb(value, Address(to, 0));
2132         __ addi(to, to, 1);
2133         __ bind(L_fill_2);
2134         __ test_bit(t0, count, 1);
2135         __ beqz(t0, L_fill_4);
2136         __ sh(value, Address(to, 0));
2137         __ addi(to, to, 2);
2138         __ bind(L_fill_4);
2139         __ test_bit(t0, count, 2);
2140         __ beqz(t0, L_exit2);
2141         __ sw(value, Address(to, 0));
2142         break;
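        // Worked example (illustrative): for a T_BYTE fill with count == 7 the
        // three tests above emit sb + sh + sw, writing 1 + 2 + 4 = 7 bytes.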
2143       case T_SHORT:
2144         __ test_bit(t0, count, 0);
2145         __ beqz(t0, L_fill_4);
2146         __ sh(value, Address(to, 0));
2147         __ addi(to, to, 2);
2148         __ bind(L_fill_4);
2149         __ test_bit(t0, count, 1);
2150         __ beqz(t0, L_exit2);
2151         __ sw(value, Address(to, 0));
2152         break;
2153       case T_INT:
2154         __ beqz(count, L_exit2);
2155         __ sw(value, Address(to, 0));
2156         break;
2157       default: ShouldNotReachHere();
2158     }
2159     __ bind(L_exit2);
2160     __ leave();
2161     __ ret();
2162     return start;
2163   }
2164 
2165   void generate_arraycopy_stubs() {
2166     address entry                     = nullptr;
2167     address entry_jbyte_arraycopy     = nullptr;
2168     address entry_jshort_arraycopy    = nullptr;
2169     address entry_jint_arraycopy      = nullptr;
2170     address entry_oop_arraycopy       = nullptr;
2171     address entry_jlong_arraycopy     = nullptr;
2172     address entry_checkcast_arraycopy = nullptr;
2173 
2174     generate_copy_longs(copy_f, c_rarg0, c_rarg1, t1, copy_forwards);
2175     generate_copy_longs(copy_b, c_rarg0, c_rarg1, t1, copy_backwards);
2176 
2177     StubRoutines::riscv::_zero_blocks = generate_zero_blocks();
2178 
2179     //*** jbyte
2180     // Always need aligned and unaligned versions
2181     StubRoutines::_jbyte_disjoint_arraycopy          = generate_disjoint_byte_copy(false, &entry,
2182                                                                                    "jbyte_disjoint_arraycopy");
2183     StubRoutines::_jbyte_arraycopy                   = generate_conjoint_byte_copy(false, entry,
2184                                                                                    &entry_jbyte_arraycopy,
2185                                                                                    "jbyte_arraycopy");
2186     StubRoutines::_arrayof_jbyte_disjoint_arraycopy  = generate_disjoint_byte_copy(true, &entry,
2187                                                                                    "arrayof_jbyte_disjoint_arraycopy");
2188     StubRoutines::_arrayof_jbyte_arraycopy           = generate_conjoint_byte_copy(true, entry, nullptr,
2189                                                                                    "arrayof_jbyte_arraycopy");
2190 
2191     //*** jshort
2192     // Always need aligned and unaligned versions
2193     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2194                                                                                     "jshort_disjoint_arraycopy");
2195     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2196                                                                                     &entry_jshort_arraycopy,
2197                                                                                     "jshort_arraycopy");
2198     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2199                                                                                     "arrayof_jshort_disjoint_arraycopy");
2200     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, nullptr,
2201                                                                                     "arrayof_jshort_arraycopy");
2202 
2203     //*** jint
2204     // Aligned versions
2205     StubRoutines::_arrayof_jint_disjoint_arraycopy   = generate_disjoint_int_copy(true, &entry,
2206                                                                                   "arrayof_jint_disjoint_arraycopy");
2207     StubRoutines::_arrayof_jint_arraycopy            = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2208                                                                                   "arrayof_jint_arraycopy");
2209     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2210     // entry_jint_arraycopy always points to the unaligned version
2211     StubRoutines::_jint_disjoint_arraycopy           = generate_disjoint_int_copy(false, &entry,
2212                                                                                   "jint_disjoint_arraycopy");
2213     StubRoutines::_jint_arraycopy                    = generate_conjoint_int_copy(false, entry,
2214                                                                                   &entry_jint_arraycopy,
2215                                                                                   "jint_arraycopy");
2216 
2217     //*** jlong
2218     // It is always aligned
2219     StubRoutines::_arrayof_jlong_disjoint_arraycopy  = generate_disjoint_long_copy(true, &entry,
2220                                                                                    "arrayof_jlong_disjoint_arraycopy");
2221     StubRoutines::_arrayof_jlong_arraycopy           = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2222                                                                                    "arrayof_jlong_arraycopy");
2223     StubRoutines::_jlong_disjoint_arraycopy          = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2224     StubRoutines::_jlong_arraycopy                   = StubRoutines::_arrayof_jlong_arraycopy;
2225 
2226     //*** oops
2227     {
2228       // With compressed oops we need unaligned versions; notice that
2229       // we overwrite entry_oop_arraycopy.
2230       bool aligned = !UseCompressedOops;
2231 
2232       StubRoutines::_arrayof_oop_disjoint_arraycopy
2233         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2234                                      /*dest_uninitialized*/false);
2235       StubRoutines::_arrayof_oop_arraycopy
2236         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2237                                      /*dest_uninitialized*/false);
2238       // Aligned versions without pre-barriers
2239       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2240         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2241                                      /*dest_uninitialized*/true);
2242       StubRoutines::_arrayof_oop_arraycopy_uninit
2243         = generate_conjoint_oop_copy(aligned, entry, nullptr, "arrayof_oop_arraycopy_uninit",
2244                                      /*dest_uninitialized*/true);
2245     }
2246 
2247     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2248     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2249     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2250     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2251 
2252     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2253     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", nullptr,
2254                                                                         /*dest_uninitialized*/true);
2255 
2256 
2257     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2258                                                               entry_jbyte_arraycopy,
2259                                                               entry_jshort_arraycopy,
2260                                                               entry_jint_arraycopy,
2261                                                               entry_jlong_arraycopy);
2262 
2263     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2264                                                                entry_jbyte_arraycopy,
2265                                                                entry_jshort_arraycopy,
2266                                                                entry_jint_arraycopy,
2267                                                                entry_oop_arraycopy,
2268                                                                entry_jlong_arraycopy,
2269                                                                entry_checkcast_arraycopy);
2270 
2271     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2272     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2273     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2274     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2275     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2276     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2277   }
2278 
2279   // Code for comparing 16 bytes of strings with the same encoding
2280   void compare_string_16_bytes_same(Label &DIFF1, Label &DIFF2) {
2281     const Register result = x10, str1 = x11, cnt1 = x12, str2 = x13, tmp1 = x28, tmp2 = x29, tmp4 = x7, tmp5 = x31;
2282     __ ld(tmp5, Address(str1));
2283     __ addi(str1, str1, 8);
2284     __ xorr(tmp4, tmp1, tmp2);
2285     __ ld(cnt1, Address(str2));
2286     __ addi(str2, str2, 8);
2287     __ bnez(tmp4, DIFF1);
2288     __ ld(tmp1, Address(str1));
2289     __ addi(str1, str1, 8);
2290     __ xorr(tmp4, tmp5, cnt1);
2291     __ ld(tmp2, Address(str2));
2292     __ addi(str2, str2, 8);
2293     __ bnez(tmp4, DIFF2);
2294   }
2295 
2296   // Code for comparing 8 characters of strings with Latin1 and UTF-16 encodings
2297   void compare_string_8_x_LU(Register tmpL, Register tmpU, Register strL, Register strU, Label& DIFF) {
2298     const Register tmp = x30, tmpLval = x12;
2299     __ ld(tmpLval, Address(strL));
2300     __ addi(strL, strL, wordSize);
2301     __ ld(tmpU, Address(strU));
2302     __ addi(strU, strU, wordSize);
2303     __ inflate_lo32(tmpL, tmpLval);
2304     __ xorr(tmp, tmpU, tmpL);
2305     __ bnez(tmp, DIFF);
2306 
2307     __ ld(tmpU, Address(strU));
2308     __ addi(strU, strU, wordSize);
2309     __ inflate_hi32(tmpL, tmpLval);
2310     __ xorr(tmp, tmpU, tmpL);
2311     __ bnez(tmp, DIFF);
2312   }
2313 
2314   // x10  = result
2315   // x11  = str1
2316   // x12  = cnt1
2317   // x13  = str2
2318   // x14  = cnt2
2319   // x28  = tmp1
2320   // x29  = tmp2
2321   // x30  = tmp3
2322   address generate_compare_long_string_different_encoding(bool isLU) {
2323     __ align(CodeEntryAlignment);
2324     StubCodeMark mark(this, "StubRoutines", isLU ? "compare_long_string_different_encoding LU" : "compare_long_string_different_encoding UL");
2325     address entry = __ pc();
2326     Label SMALL_LOOP, TAIL, LOAD_LAST, DONE, CALCULATE_DIFFERENCE;
2327     const Register result = x10, str1 = x11, str2 = x13, cnt2 = x14,
2328                    tmp1 = x28, tmp2 = x29, tmp3 = x30, tmp4 = x12;
2329 
2330     // cnt2 == number of characters left to compare
2331     // Check the first 4 symbols, which have already been loaded
2332     __ inflate_lo32(tmp3, isLU ? tmp1 : tmp2);
2333     __ mv(isLU ? tmp1 : tmp2, tmp3);
2334     __ addi(str1, str1, isLU ? wordSize / 2 : wordSize);
2335     __ addi(str2, str2, isLU ? wordSize : wordSize / 2);
2336     __ sub(cnt2, cnt2, wordSize / 2); // Already loaded 4 symbols
2337 
2338     __ xorr(tmp3, tmp1, tmp2);
2339     __ bnez(tmp3, CALCULATE_DIFFERENCE);
2340 
2341     Register strU = isLU ? str2 : str1,
2342              strL = isLU ? str1 : str2,
2343              tmpU = isLU ? tmp2 : tmp1, // where to keep U for comparison
2344              tmpL = isLU ? tmp1 : tmp2; // where to keep L for comparison
2345 
2346     // To make the main loop 8-byte aligned on strL, load another 4 bytes from strL first.
2347     // cnt2 is >= 68 here, so there is no need to check it for >= 0
2348     __ lwu(tmpL, Address(strL));
2349     __ addi(strL, strL, wordSize / 2);
2350     __ ld(tmpU, Address(strU));
2351     __ addi(strU, strU, wordSize);
2352     __ inflate_lo32(tmp3, tmpL);
2353     __ mv(tmpL, tmp3);
2354     __ xorr(tmp3, tmpU, tmpL);
2355     __ bnez(tmp3, CALCULATE_DIFFERENCE);
2356     __ addi(cnt2, cnt2, -wordSize / 2);
2357 
2358     // we are now 8-bytes aligned on strL
2359     __ sub(cnt2, cnt2, wordSize * 2);
2360     __ bltz(cnt2, TAIL);
2361     __ bind(SMALL_LOOP); // smaller loop
2362       __ sub(cnt2, cnt2, wordSize * 2);
2363       compare_string_8_x_LU(tmpL, tmpU, strL, strU, CALCULATE_DIFFERENCE);
2364       compare_string_8_x_LU(tmpL, tmpU, strL, strU, CALCULATE_DIFFERENCE);
2365       __ bgez(cnt2, SMALL_LOOP);
2366       __ addi(t0, cnt2, wordSize * 2);
2367       __ beqz(t0, DONE);
2368     __ bind(TAIL);  // 1..15 characters left
2369       // Aligned access. Load bytes in portions - 4, 2, 1.
2370 
2371       __ addi(t0, cnt2, wordSize);
2372       __ addi(cnt2, cnt2, wordSize * 2); // amount of characters left to process
2373       __ bltz(t0, LOAD_LAST);
2374       // at least 8 characters remain, so we can do one compare_string_8_x_LU
2375       compare_string_8_x_LU(tmpL, tmpU, strL, strU, CALCULATE_DIFFERENCE);
2376       __ addi(cnt2, cnt2, -wordSize);
2377       __ beqz(cnt2, DONE);  // no character left
2378       __ bind(LOAD_LAST);   // cnt2 = 1..7 characters left
2379 
2380       __ addi(cnt2, cnt2, -wordSize); // cnt2 is now an offset in strL which points to last 8 bytes
2381       __ slli(t0, cnt2, 1);     // t0 is now an offset in strU which points to last 16 bytes
2382       __ add(strL, strL, cnt2); // Address of last 8 bytes in Latin1 string
2383       __ add(strU, strU, t0);   // Address of last 16 bytes in UTF-16 string
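      // Worked example (illustrative): with 3 characters left, cnt2 becomes
      // 3 - 8 = -5, so strL steps back 5 bytes and strU steps back 10 bytes;
      // the two loads below then re-compare 5 already-matched characters
      // together with the 3 remaining ones.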
2384       __ load_int_misaligned(tmpL, Address(strL), t0, false);
2385       __ load_long_misaligned(tmpU, Address(strU), t0, 2);
2386       __ inflate_lo32(tmp3, tmpL);
2387       __ mv(tmpL, tmp3);
2388       __ xorr(tmp3, tmpU, tmpL);
2389       __ bnez(tmp3, CALCULATE_DIFFERENCE);
2390 
2391       __ addi(strL, strL, wordSize / 2); // Address of last 4 bytes in Latin1 string
2392       __ addi(strU, strU, wordSize);   // Address of last 8 bytes in UTF-16 string
2393       __ load_int_misaligned(tmpL, Address(strL), t0, false);
2394       __ load_long_misaligned(tmpU, Address(strU), t0, 2);
2395       __ inflate_lo32(tmp3, tmpL);
2396       __ mv(tmpL, tmp3);
2397       __ xorr(tmp3, tmpU, tmpL);
2398       __ bnez(tmp3, CALCULATE_DIFFERENCE);
2399       __ j(DONE); // no character left
2400 
2401       // Find the first different characters in the longwords and
2402       // compute their difference.
2403     __ bind(CALCULATE_DIFFERENCE);
2404       __ ctzc_bit(tmp4, tmp3);
2405       __ srl(tmp1, tmp1, tmp4);
2406       __ srl(tmp2, tmp2, tmp4);
2407       __ andi(tmp1, tmp1, 0xFFFF);
2408       __ andi(tmp2, tmp2, 0xFFFF);
2409       __ sub(result, tmp1, tmp2);
2410     __ bind(DONE);
2411       __ ret();
2412     return entry;
2413   }
2414 
2415   address generate_method_entry_barrier() {
2416     __ align(CodeEntryAlignment);
2417     StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");
2418 
2419     Label deoptimize_label;
2420 
2421     address start = __ pc();
2422 
2423     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
2424 
2425     if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
2426       BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
2427       Address thread_epoch_addr(xthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
2428       __ la(t1, ExternalAddress(bs_asm->patching_epoch_addr()));
2429       __ lwu(t1, t1);
2430       __ sw(t1, thread_epoch_addr);
2431       __ membar(__ LoadLoad);
2432     }
2433 
2434     __ set_last_Java_frame(sp, fp, ra);
2435 
2436     __ enter();
2437     __ add(t1, sp, wordSize);
2438 
2439     __ sub(sp, sp, 4 * wordSize);
2440 
2441     __ push_call_clobbered_registers();
2442 
2443     __ mv(c_rarg0, t1);
2444     __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
2445 
2446     __ reset_last_Java_frame(true);
2447 
2448     __ mv(t0, x10);
2449 
2450     __ pop_call_clobbered_registers();
2451 
2452     __ bnez(t0, deoptimize_label);
2453 
2454     __ leave();
2455     __ ret();
2456 
2457     __ BIND(deoptimize_label);
2458 
2459     __ ld(t0, Address(sp, 0));
2460     __ ld(fp, Address(sp, wordSize));
2461     __ ld(ra, Address(sp, wordSize * 2));
2462     __ ld(t1, Address(sp, wordSize * 3));
2463 
2464     __ mv(sp, t0);
2465     __ jr(t1);
2466 
2467     return start;
2468   }
2469 
2470   // x10  = result
2471   // x11  = str1
2472   // x12  = cnt1
2473   // x13  = str2
2474   // x14  = cnt2
2475   // x28  = tmp1
2476   // x29  = tmp2
2477   // x30  = tmp3
2478   // x31  = tmp4
2479   address generate_compare_long_string_same_encoding(bool isLL) {
2480     __ align(CodeEntryAlignment);
2481     StubCodeMark mark(this, "StubRoutines", isLL ?
2482                       "compare_long_string_same_encoding LL" : "compare_long_string_same_encoding UU");
2483     address entry = __ pc();
2484     Label SMALL_LOOP, CHECK_LAST, DIFF2, TAIL,
2485           LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF;
2486     const Register result = x10, str1 = x11, cnt1 = x12, str2 = x13, cnt2 = x14,
2487                    tmp1 = x28, tmp2 = x29, tmp3 = x30, tmp4 = x7, tmp5 = x31;
2488     RegSet spilled_regs = RegSet::of(tmp4, tmp5);
2489 
2490     // cnt1/cnt2 contain the number of characters to compare; cnt1 can be re-used
2491     // Update the cnt2 counter to account for the 8 bytes already loaded
2492     __ sub(cnt2, cnt2, wordSize / (isLL ? 1 : 2));
2493     // update pointers, because of previous read
2494     __ add(str1, str1, wordSize);
2495     __ add(str2, str2, wordSize);
2496     // less than 16 bytes left?
2497     __ sub(cnt2, cnt2, isLL ? 16 : 8);
2498     __ push_reg(spilled_regs, sp);
2499     __ bltz(cnt2, TAIL);
2500     __ bind(SMALL_LOOP);
2501       compare_string_16_bytes_same(DIFF, DIFF2);
2502       __ sub(cnt2, cnt2, isLL ? 16 : 8);
2503       __ bgez(cnt2, SMALL_LOOP);
2504     __ bind(TAIL);
2505       __ addi(cnt2, cnt2, isLL ? 16 : 8);
2506       __ beqz(cnt2, LAST_CHECK_AND_LENGTH_DIFF);
2507       __ sub(cnt2, cnt2, isLL ? 8 : 4);
2508       __ blez(cnt2, CHECK_LAST);
2509       __ xorr(tmp4, tmp1, tmp2);
2510       __ bnez(tmp4, DIFF);
2511       __ ld(tmp1, Address(str1));
2512       __ addi(str1, str1, 8);
2513       __ ld(tmp2, Address(str2));
2514       __ addi(str2, str2, 8);
2515       __ sub(cnt2, cnt2, isLL ? 8 : 4);
2516     __ bind(CHECK_LAST);
2517       if (!isLL) {
2518         __ add(cnt2, cnt2, cnt2); // now in bytes
2519       }
2520       __ xorr(tmp4, tmp1, tmp2);
2521       __ bnez(tmp4, DIFF);
2522       __ add(str1, str1, cnt2);
2523       __ load_long_misaligned(tmp5, Address(str1), tmp3, isLL ? 1 : 2);
2524       __ add(str2, str2, cnt2);
2525       __ load_long_misaligned(cnt1, Address(str2), tmp3, isLL ? 1 : 2);
2526       __ xorr(tmp4, tmp5, cnt1);
2527       __ beqz(tmp4, LENGTH_DIFF);
2528       // Find the first different characters in the longwords and
2529       // compute their difference.
2530     __ bind(DIFF2);
2531       __ ctzc_bit(tmp3, tmp4, isLL); // count zero from lsb to msb
2532       __ srl(tmp5, tmp5, tmp3);
2533       __ srl(cnt1, cnt1, tmp3);
2534       if (isLL) {
2535         __ andi(tmp5, tmp5, 0xFF);
2536         __ andi(cnt1, cnt1, 0xFF);
2537       } else {
2538         __ andi(tmp5, tmp5, 0xFFFF);
2539         __ andi(cnt1, cnt1, 0xFFFF);
2540       }
2541       __ sub(result, tmp5, cnt1);
2542       __ j(LENGTH_DIFF);
2543     __ bind(DIFF);
2544       __ ctzc_bit(tmp3, tmp4, isLL); // count zero from lsb to msb
2545       __ srl(tmp1, tmp1, tmp3);
2546       __ srl(tmp2, tmp2, tmp3);
2547       if (isLL) {
2548         __ andi(tmp1, tmp1, 0xFF);
2549         __ andi(tmp2, tmp2, 0xFF);
2550       } else {
2551         __ andi(tmp1, tmp1, 0xFFFF);
2552         __ andi(tmp2, tmp2, 0xFFFF);
2553       }
2554       __ sub(result, tmp1, tmp2);
2555       __ j(LENGTH_DIFF);
2556     __ bind(LAST_CHECK_AND_LENGTH_DIFF);
2557       __ xorr(tmp4, tmp1, tmp2);
2558       __ bnez(tmp4, DIFF);
2559     __ bind(LENGTH_DIFF);
2560       __ pop_reg(spilled_regs, sp);
2561       __ ret();
2562     return entry;
2563   }
2564 
2565   void generate_compare_long_strings() {
2566     StubRoutines::riscv::_compare_long_string_LL = generate_compare_long_string_same_encoding(true);
2567     StubRoutines::riscv::_compare_long_string_UU = generate_compare_long_string_same_encoding(false);
2568     StubRoutines::riscv::_compare_long_string_LU = generate_compare_long_string_different_encoding(true);
2569     StubRoutines::riscv::_compare_long_string_UL = generate_compare_long_string_different_encoding(false);
2570   }
2571 
2572   // x10 result
2573   // x11 src
2574   // x12 src count
2575   // x13 pattern
2576   // x14 pattern count
2577   address generate_string_indexof_linear(bool needle_isL, bool haystack_isL)
2578   {
2579     const char* stubName = needle_isL
2580            ? (haystack_isL ? "indexof_linear_ll" : "indexof_linear_ul")
2581            : "indexof_linear_uu";
2582     __ align(CodeEntryAlignment);
2583     StubCodeMark mark(this, "StubRoutines", stubName);
2584     address entry = __ pc();
2585 
2586     int needle_chr_size = needle_isL ? 1 : 2;
2587     int haystack_chr_size = haystack_isL ? 1 : 2;
2588     int needle_chr_shift = needle_isL ? 0 : 1;
2589     int haystack_chr_shift = haystack_isL ? 0 : 1;
2590     bool isL = needle_isL && haystack_isL;
2591     // parameters
2592     Register result = x10, haystack = x11, haystack_len = x12, needle = x13, needle_len = x14;
2593     // temporary registers
2594     Register mask1 = x20, match_mask = x21, first = x22, trailing_zeros = x23, mask2 = x24, tmp = x25;
2595     // redefinitions
2596     Register ch1 = x28, ch2 = x29;
2597     RegSet spilled_regs = RegSet::range(x20, x25) + RegSet::range(x28, x29);
2598 
2599     __ push_reg(spilled_regs, sp);
2600 
2601     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
2602           L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
2603           L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
2604           L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
2605           L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
2606           L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
2607 
2608     __ ld(ch1, Address(needle));
2609     __ ld(ch2, Address(haystack));
2610     // src.length - pattern.length
2611     __ sub(haystack_len, haystack_len, needle_len);
2612 
2613     // first is needle[0]
2614     __ andi(first, ch1, needle_isL ? 0xFF : 0xFFFF, first);
2615     uint64_t mask0101 = UCONST64(0x0101010101010101);
2616     uint64_t mask0001 = UCONST64(0x0001000100010001);
2617     __ mv(mask1, haystack_isL ? mask0101 : mask0001);
2618     __ mul(first, first, mask1);
2619     uint64_t mask7f7f = UCONST64(0x7f7f7f7f7f7f7f7f);
2620     uint64_t mask7fff = UCONST64(0x7fff7fff7fff7fff);
2621     __ mv(mask2, haystack_isL ? mask7f7f : mask7fff);
2622     if (needle_isL != haystack_isL) {
2623       __ mv(tmp, ch1);
2624     }
2625     __ sub(haystack_len, haystack_len, wordSize / haystack_chr_size - 1);
2626     __ blez(haystack_len, L_SMALL);
2627 
2628     if (needle_isL != haystack_isL) {
2629       __ inflate_lo32(ch1, tmp, match_mask, trailing_zeros);
2630     }
2631     // xorr, sub, orr, notr, andr
2632     // compare and set match_mask[i] with 0x80/0x8000 (Latin1/UTF16) if ch2[i] == first[i]
2633     // eg:
2634     // first:        aa aa aa aa aa aa aa aa
2635     // ch2:          aa aa li nx jd ka aa aa
2636     // match_mask:   80 80 00 00 00 00 80 80
2637     __ compute_match_mask(ch2, first, match_mask, mask1, mask2);
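    // Illustrative note: this is the classic SWAR zero-lane test.  With
    // x = ch2 ^ first, any lane where ch2[i] == first[i] becomes zero, and
    //   match_mask = (x - mask1) & ~(x | mask2)
    // sets the top bit of (at least) the lowest such lane; candidate positions
    // are then verified by the compare loops below.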
2638 
2639     // search first char of needle, if success, goto L_HAS_ZERO;
2640     __ bnez(match_mask, L_HAS_ZERO);
2641     __ sub(haystack_len, haystack_len, wordSize / haystack_chr_size);
2642     __ add(result, result, wordSize / haystack_chr_size);
2643     __ add(haystack, haystack, wordSize);
2644     __ bltz(haystack_len, L_POST_LOOP);
2645 
2646     __ bind(L_LOOP);
2647     __ ld(ch2, Address(haystack));
2648     __ compute_match_mask(ch2, first, match_mask, mask1, mask2);
2649     __ bnez(match_mask, L_HAS_ZERO);
2650 
2651     __ bind(L_LOOP_PROCEED);
2652     __ sub(haystack_len, haystack_len, wordSize / haystack_chr_size);
2653     __ add(haystack, haystack, wordSize);
2654     __ add(result, result, wordSize / haystack_chr_size);
2655     __ bgez(haystack_len, L_LOOP);
2656 
2657     __ bind(L_POST_LOOP);
2658     __ mv(ch2, -wordSize / haystack_chr_size);
2659     __ ble(haystack_len, ch2, NOMATCH); // no extra characters to check
2660     __ ld(ch2, Address(haystack));
2661     __ slli(haystack_len, haystack_len, LogBitsPerByte + haystack_chr_shift);
2662     __ neg(haystack_len, haystack_len);
2663     __ xorr(ch2, first, ch2);
2664     __ sub(match_mask, ch2, mask1);
2665     __ orr(ch2, ch2, mask2);
2666     __ mv(trailing_zeros, -1); // all bits set
2667     __ j(L_SMALL_PROCEED);
2668 
2669     __ align(OptoLoopAlignment);
2670     __ bind(L_SMALL);
2671     __ slli(haystack_len, haystack_len, LogBitsPerByte + haystack_chr_shift);
2672     __ neg(haystack_len, haystack_len);
2673     if (needle_isL != haystack_isL) {
2674       __ inflate_lo32(ch1, tmp, match_mask, trailing_zeros);
2675     }
2676     __ xorr(ch2, first, ch2);
2677     __ sub(match_mask, ch2, mask1);
2678     __ orr(ch2, ch2, mask2);
2679     __ mv(trailing_zeros, -1); // all bits set
2680 
2681     __ bind(L_SMALL_PROCEED);
2682     __ srl(trailing_zeros, trailing_zeros, haystack_len); // mask. zeroes on useless bits.
2683     __ notr(ch2, ch2);
2684     __ andr(match_mask, match_mask, ch2);
2685     __ andr(match_mask, match_mask, trailing_zeros); // clear useless bits and check
2686     __ beqz(match_mask, NOMATCH);
2687 
2688     __ bind(L_SMALL_HAS_ZERO_LOOP);
2689     __ ctzc_bit(trailing_zeros, match_mask, haystack_isL, ch2, tmp); // count trailing zeros
2690     __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15);
2691     __ mv(ch2, wordSize / haystack_chr_size);
2692     __ ble(needle_len, ch2, L_SMALL_CMP_LOOP_LAST_CMP2);
2693     __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL);
2694     __ mv(trailing_zeros, wordSize / haystack_chr_size);
2695     __ bne(ch1, ch2, L_SMALL_CMP_LOOP_NOMATCH);
2696 
2697     __ bind(L_SMALL_CMP_LOOP);
2698     __ shadd(first, trailing_zeros, needle, first, needle_chr_shift);
2699     __ shadd(ch2, trailing_zeros, haystack, ch2, haystack_chr_shift);
2700     needle_isL ? __ lbu(first, Address(first)) : __ lhu(first, Address(first));
2701     haystack_isL ? __ lbu(ch2, Address(ch2)) : __ lhu(ch2, Address(ch2));
2702     __ add(trailing_zeros, trailing_zeros, 1);
2703     __ bge(trailing_zeros, needle_len, L_SMALL_CMP_LOOP_LAST_CMP);
2704     __ beq(first, ch2, L_SMALL_CMP_LOOP);
2705 
2706     __ bind(L_SMALL_CMP_LOOP_NOMATCH);
2707     __ beqz(match_mask, NOMATCH);
2708     __ ctzc_bit(trailing_zeros, match_mask, haystack_isL, tmp, ch2);
2709     __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15);
2710     __ add(result, result, 1);
2711     __ add(haystack, haystack, haystack_chr_size);
2712     __ j(L_SMALL_HAS_ZERO_LOOP);
2713 
2714     __ align(OptoLoopAlignment);
2715     __ bind(L_SMALL_CMP_LOOP_LAST_CMP);
2716     __ bne(first, ch2, L_SMALL_CMP_LOOP_NOMATCH);
2717     __ j(DONE);
2718 
2719     __ align(OptoLoopAlignment);
2720     __ bind(L_SMALL_CMP_LOOP_LAST_CMP2);
2721     __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL);
2722     __ bne(ch1, ch2, L_SMALL_CMP_LOOP_NOMATCH);
2723     __ j(DONE);
2724 
2725     __ align(OptoLoopAlignment);
2726     __ bind(L_HAS_ZERO);
2727     __ ctzc_bit(trailing_zeros, match_mask, haystack_isL, tmp, ch2);
2728     __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15);
2729     __ slli(needle_len, needle_len, BitsPerByte * wordSize / 2);
2730     __ orr(haystack_len, haystack_len, needle_len); // restore needle_len(32bits)
2731     __ sub(result, result, 1); // array index from 0, so result -= 1
2732 
2733     __ bind(L_HAS_ZERO_LOOP);
2734     __ mv(needle_len, wordSize / haystack_chr_size);
2735     __ srli(ch2, haystack_len, BitsPerByte * wordSize / 2);
2736     __ bge(needle_len, ch2, L_CMP_LOOP_LAST_CMP2);
2737     // load next 8 bytes from haystack, and increase result index
2738     __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL);
2739     __ add(result, result, 1);
2740     __ mv(trailing_zeros, wordSize / haystack_chr_size);
2741     __ bne(ch1, ch2, L_CMP_LOOP_NOMATCH);
2742 
2743     // compare one char
2744     __ bind(L_CMP_LOOP);
2745     __ shadd(needle_len, trailing_zeros, needle, needle_len, needle_chr_shift);
2746     needle_isL ? __ lbu(needle_len, Address(needle_len)) : __ lhu(needle_len, Address(needle_len));
2747     __ shadd(ch2, trailing_zeros, haystack, ch2, haystack_chr_shift);
2748     haystack_isL ? __ lbu(ch2, Address(ch2)) : __ lhu(ch2, Address(ch2));
2749     __ add(trailing_zeros, trailing_zeros, 1); // next char index
2750     __ srli(tmp, haystack_len, BitsPerByte * wordSize / 2);
2751     __ bge(trailing_zeros, tmp, L_CMP_LOOP_LAST_CMP);
2752     __ beq(needle_len, ch2, L_CMP_LOOP);
2753 
2754     __ bind(L_CMP_LOOP_NOMATCH);
2755     __ beqz(match_mask, L_HAS_ZERO_LOOP_NOMATCH);
2756     __ ctzc_bit(trailing_zeros, match_mask, haystack_isL, needle_len, ch2); // find next "first" char index
2757     __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15);
2758     __ add(haystack, haystack, haystack_chr_size);
2759     __ j(L_HAS_ZERO_LOOP);
2760 
2761     __ align(OptoLoopAlignment);
2762     __ bind(L_CMP_LOOP_LAST_CMP);
2763     __ bne(needle_len, ch2, L_CMP_LOOP_NOMATCH);
2764     __ j(DONE);
2765 
2766     __ align(OptoLoopAlignment);
2767     __ bind(L_CMP_LOOP_LAST_CMP2);
2768     __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL);
2769     __ add(result, result, 1);
2770     __ bne(ch1, ch2, L_CMP_LOOP_NOMATCH);
2771     __ j(DONE);
2772 
2773     __ align(OptoLoopAlignment);
2774     __ bind(L_HAS_ZERO_LOOP_NOMATCH);
2775     // 1) Restore the "result" index. The index was wordSize/str2_chr_size * N until
2776     // the L_HAS_ZERO block. The byte octet was analyzed in L_HAS_ZERO_LOOP,
2777     // so result was increased by at most wordSize/str2_chr_size - 1 and the
2778     // respective high bits weren't changed. L_LOOP_PROCEED will increase
2779     // result by the number of analyzed characters, so we can just reset the lower
2780     // bits of result here: clear the 2 lower bits for UU/UL and 3 bits for LL.
2781     // 2) Restore the needle_len and haystack_len values from the "compressed" haystack_len.
2782     // 3) Advance haystack to the next haystack octet. result & 7/3 is the
2783     // index of the last analyzed substring inside the current octet, so haystack is at
2784     // the respective start address; we need to advance it to the next octet.
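         // (For example, with LL: if result == 13 then result & -8 == 8 and result & 7 == 5,
         //  i.e. 5 characters inside the current octet had already been re-examined, so
         //  haystack is moved back by 5 characters to the octet start. Illustrative values only.)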
2785     __ andi(match_mask, result, wordSize / haystack_chr_size - 1);
2786     __ srli(needle_len, haystack_len, BitsPerByte * wordSize / 2);
2787     __ andi(result, result, haystack_isL ? -8 : -4);
2788     __ slli(tmp, match_mask, haystack_chr_shift);
2789     __ sub(haystack, haystack, tmp);
2790     __ sign_extend(haystack_len, haystack_len, 32);
2791     __ j(L_LOOP_PROCEED);
2792 
2793     __ align(OptoLoopAlignment);
2794     __ bind(NOMATCH);
2795     __ mv(result, -1);
2796 
2797     __ bind(DONE);
2798     __ pop_reg(spilled_regs, sp);
2799     __ ret();
2800     return entry;
2801   }
2802 
2803   void generate_string_indexof_stubs()
2804   {
2805     StubRoutines::riscv::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
2806     StubRoutines::riscv::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
2807     StubRoutines::riscv::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
2808   }
2809 
2810 #ifdef COMPILER2
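       // Fast path for UseSecondarySupersTable (one stub per possible hash slot):
       // checks whether r_super_klass, whose hashed slot is super_klass_index, occurs
       // among r_sub_klass's secondary supers, using the 64-bit bitmap and hashed
       // secondary-supers array consulted by MacroAssembler::lookup_secondary_supers_table.
       // (Descriptive note; see the slow-path stub below for the linear-probing fallback.)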
2811   address generate_lookup_secondary_supers_table_stub(u1 super_klass_index) {
2812     StubCodeMark mark(this, "StubRoutines", "lookup_secondary_supers_table");
2813 
2814     address start = __ pc();
2815     const Register
2816       r_super_klass  = x10,
2817       r_array_base   = x11,
2818       r_array_length = x12,
2819       r_array_index  = x13,
2820       r_sub_klass    = x14,
2821       result         = x15,
2822       r_bitmap       = x16;
2823 
2824     Label L_success;
2825     __ enter();
2826     __ lookup_secondary_supers_table(r_sub_klass, r_super_klass, result,
2827                                      r_array_base, r_array_length, r_array_index,
2828                                      r_bitmap, super_klass_index, /*stub_is_near*/true);
2829     __ leave();
2830     __ ret();
2831 
2832     return start;
2833   }
2834 
2835   // Slow path implementation for UseSecondarySupersTable.
2836   address generate_lookup_secondary_supers_table_slow_path_stub() {
2837     StubCodeMark mark(this, "StubRoutines", "lookup_secondary_supers_table_slow_path");
2838 
2839     address start = __ pc();
2840     const Register
2841       r_super_klass  = x10,        // argument
2842       r_array_base   = x11,        // argument
2843       temp1          = x12,        // tmp
2844       r_array_index  = x13,        // argument
2845       result         = x15,        // argument
2846       r_bitmap       = x16;        // argument
2847 
2848 
2849     __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, result, temp1);
2850     __ ret();
2851 
2852     return start;
2853   }
2854 
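       // mulAdd() multiplies the 32-bit-limb array "in" by the scalar "k" and adds the
       // product into "out" starting at "offset", in the manner of
       // java.math.BigInteger.implMulAdd. A rough scalar sketch of the computation
       // (illustrative only; out_len stands for the length of "out", and the actual work,
       // offset and carry conventions are handled by MacroAssembler::mul_add):
       //
       //   uint64_t carry = 0;
       //   for (int j = len - 1, i = out_len - offset - 1; j >= 0; j--, i--) {
       //     uint64_t p = (uint64_t)in[j] * k + out[i] + carry;
       //     out[i] = (uint32_t)p;
       //     carry = p >> 32;
       //   }
       //   // the final carry becomes the stub's return value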
2855   address generate_mulAdd()
2856   {
2857     __ align(CodeEntryAlignment);
2858     StubCodeMark mark(this, "StubRoutines", "mulAdd");
2859 
2860     address entry = __ pc();
2861 
2862     const Register out     = x10;
2863     const Register in      = x11;
2864     const Register offset  = x12;
2865     const Register len     = x13;
2866     const Register k       = x14;
2867     const Register tmp     = x28;
2868 
2869     BLOCK_COMMENT("Entry:");
2870     __ enter();
2871     __ mul_add(out, in, offset, len, k, tmp);
2872     __ leave();
2873     __ ret();
2874 
2875     return entry;
2876   }
2877 
2878   /**
2879    *  Arguments:
2880    *
2881    *  Input:
2882    *    c_rarg0   - x address
2883    *    c_rarg1   - x length
2884    *    c_rarg2   - y address
2885    *    c_rarg3   - y length
2886    *    c_rarg4   - z address
2887    */
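       //
       // Computes z = x * y, where x, y and z are multi-precision integers stored as
       // int arrays in java.math.BigInteger's layout (most significant int first), with
       // z providing room for xlen + ylen ints. A rough schoolbook sketch of the result
       // (illustrative only; assumes z starts zeroed, the real work is done by
       // MacroAssembler::multiply_to_len):
       //
       //   for (int i = xlen - 1; i >= 0; i--) {
       //     uint64_t carry = 0;
       //     for (int j = ylen - 1; j >= 0; j--) {
       //       uint64_t p = (uint64_t)x[i] * y[j] + z[i + j + 1] + carry;
       //       z[i + j + 1] = (uint32_t)p;
       //       carry = p >> 32;
       //     }
       //     z[i] = (uint32_t)carry;
       //   }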
2888   address generate_multiplyToLen()
2889   {
2890     __ align(CodeEntryAlignment);
2891     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
2892     address entry = __ pc();
2893 
2894     const Register x     = x10;
2895     const Register xlen  = x11;
2896     const Register y     = x12;
2897     const Register ylen  = x13;
2898     const Register z     = x14;
2899 
2900     const Register tmp0  = x15;
2901     const Register tmp1  = x16;
2902     const Register tmp2  = x17;
2903     const Register tmp3  = x7;
2904     const Register tmp4  = x28;
2905     const Register tmp5  = x29;
2906     const Register tmp6  = x30;
2907     const Register tmp7  = x31;
2908 
2909     BLOCK_COMMENT("Entry:");
2910     __ enter(); // required for proper stackwalking of RuntimeStub frame
2911     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
2912     __ leave(); // required for proper stackwalking of RuntimeStub frame
2913     __ ret();
2914 
2915     return entry;
2916   }
2917 
2918   address generate_squareToLen()
2919   {
2920     __ align(CodeEntryAlignment);
2921     StubCodeMark mark(this, "StubRoutines", "squareToLen");
2922     address entry = __ pc();
2923 
2924     const Register x     = x10;
2925     const Register xlen  = x11;
2926     const Register z     = x12;
2927     const Register y     = x14; // == x
2928     const Register ylen  = x15; // == xlen
2929 
2930     const Register tmp0  = x13; // zlen, unused
2931     const Register tmp1  = x16;
2932     const Register tmp2  = x17;
2933     const Register tmp3  = x7;
2934     const Register tmp4  = x28;
2935     const Register tmp5  = x29;
2936     const Register tmp6  = x30;
2937     const Register tmp7  = x31;
2938 
2939     BLOCK_COMMENT("Entry:");
2940     __ enter();
2941     __ mv(y, x);
2942     __ mv(ylen, xlen);
2943     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
2944     __ leave();
2945     __ ret();
2946 
2947     return entry;
2948   }
2949 
2950   // Arguments:
2951   //
2952   // Input:
2953   //   c_rarg0   - newArr address
2954   //   c_rarg1   - oldArr address
2955   //   c_rarg2   - newIdx
2956   //   c_rarg3   - shiftCount
2957   //   c_rarg4   - numIter
2958   //
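       // A scalar sketch of the per-element computation done by the vector loop below
       // (illustrative only; 32-bit unsigned limbs, in the manner of
       // java.math.BigInteger.shiftLeftImplWorker):
       //
       //   for (int i = 0; i < numIter; i++) {
       //     newArr[newIdx + i] = (oldArr[i] << shiftCount) | (oldArr[i + 1] >> (32 - shiftCount));
       //   }
       //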
2959   address generate_bigIntegerLeftShift() {
2960     __ align(CodeEntryAlignment);
2961     StubCodeMark mark(this, "StubRoutines", "bigIntegerLeftShiftWorker");
2962     address entry = __ pc();
2963 
2964     Label loop, exit;
2965 
2966     Register newArr        = c_rarg0;
2967     Register oldArr        = c_rarg1;
2968     Register newIdx        = c_rarg2;
2969     Register shiftCount    = c_rarg3;
2970     Register numIter       = c_rarg4;
2971 
2972     Register shiftRevCount = c_rarg5;
2973     Register oldArrNext    = t1;
2974 
2975     __ beqz(numIter, exit);
2976     __ shadd(newArr, newIdx, newArr, t0, 2);
2977 
2978     __ mv(shiftRevCount, 32);
2979     __ sub(shiftRevCount, shiftRevCount, shiftCount);
2980 
2981     __ bind(loop);
2982     __ addi(oldArrNext, oldArr, 4);
2983     __ vsetvli(t0, numIter, Assembler::e32, Assembler::m4);
2984     __ vle32_v(v0, oldArr);
2985     __ vle32_v(v4, oldArrNext);
2986     __ vsll_vx(v0, v0, shiftCount);
2987     __ vsrl_vx(v4, v4, shiftRevCount);
2988     __ vor_vv(v0, v0, v4);
2989     __ vse32_v(v0, newArr);
2990     __ sub(numIter, numIter, t0);
2991     __ shadd(oldArr, t0, oldArr, t1, 2);
2992     __ shadd(newArr, t0, newArr, t1, 2);
2993     __ bnez(numIter, loop);
2994 
2995     __ bind(exit);
2996     __ ret();
2997 
2998     return entry;
2999   }
3000 
3001   // Arguments:
3002   //
3003   // Input:
3004   //   c_rarg0   - newArr address
3005   //   c_rarg1   - oldArr address
3006   //   c_rarg2   - newIdx
3007   //   c_rarg3   - shiftCount
3008   //   c_rarg4   - numIter
3009   //
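       // A scalar sketch of the per-element computation done by the vector loop below,
       // which walks the array from the high index downwards (illustrative only;
       // 32-bit unsigned limbs, in the manner of java.math.BigInteger.shiftRightImplWorker):
       //
       //   for (int i = numIter - 1; i >= 0; i--) {
       //     newArr[newIdx + i] = (oldArr[i + 1] >> shiftCount) | (oldArr[i] << (32 - shiftCount));
       //   }
       //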
3010   address generate_bigIntegerRightShift() {
3011     __ align(CodeEntryAlignment);
3012     StubCodeMark mark(this, "StubRoutines", "bigIntegerRightShiftWorker");
3013     address entry = __ pc();
3014 
3015     Label loop, exit;
3016 
3017     Register newArr        = c_rarg0;
3018     Register oldArr        = c_rarg1;
3019     Register newIdx        = c_rarg2;
3020     Register shiftCount    = c_rarg3;
3021     Register numIter       = c_rarg4;
3022     Register idx           = numIter;
3023 
3024     Register shiftRevCount = c_rarg5;
3025     Register oldArrNext    = c_rarg6;
3026     Register newArrCur     = t0;
3027     Register oldArrCur     = t1;
3028 
3029     __ beqz(idx, exit);
3030     __ shadd(newArr, newIdx, newArr, t0, 2);
3031 
3032     __ mv(shiftRevCount, 32);
3033     __ sub(shiftRevCount, shiftRevCount, shiftCount);
3034 
3035     __ bind(loop);
3036     __ vsetvli(t0, idx, Assembler::e32, Assembler::m4);
3037     __ sub(idx, idx, t0);
3038     __ shadd(oldArrNext, idx, oldArr, t1, 2);
3039     __ shadd(newArrCur, idx, newArr, t1, 2);
3040     __ addi(oldArrCur, oldArrNext, 4);
3041     __ vle32_v(v0, oldArrCur);
3042     __ vle32_v(v4, oldArrNext);
3043     __ vsrl_vx(v0, v0, shiftCount);
3044     __ vsll_vx(v4, v4, shiftRevCount);
3045     __ vor_vv(v0, v0, v4);
3046     __ vse32_v(v0, newArrCur);
3047     __ bnez(idx, loop);
3048 
3049     __ bind(exit);
3050     __ ret();
3051 
3052     return entry;
3053   }
3054 #endif
3055 
3056 #ifdef COMPILER2
3057   class MontgomeryMultiplyGenerator : public MacroAssembler {
3058 
3059     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
3060       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2, Ri, Rj;
3061 
3062     RegSet _toSave;
3063     bool _squaring;
3064 
3065   public:
3066     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
3067       : MacroAssembler(as->code()), _squaring(squaring) {
3068 
3069       // Register allocation
3070 
3071       RegSetIterator<Register> regs = RegSet::range(x10, x26).begin();
3072       Pa_base = *regs;       // Argument registers
3073       if (squaring) {
3074         Pb_base = Pa_base;
3075       } else {
3076         Pb_base = *++regs;
3077       }
3078       Pn_base = *++regs;
3079       Rlen = *++regs;
3080       inv = *++regs;
3081       Pm_base = *++regs;
3082 
3083                         // Working registers:
3084       Ra =  *++regs;    // The current digit of a, b, n, and m.
3085       Rb =  *++regs;
3086       Rm =  *++regs;
3087       Rn =  *++regs;
3088 
3089       Pa =  *++regs;      // Pointers to the current/next digit of a, b, n, and m.
3090       Pb =  *++regs;
3091       Pm =  *++regs;
3092       Pn =  *++regs;
3093 
3094       tmp0 =  *++regs;    // Three registers which form a
3095       tmp1 =  *++regs;    // triple-precision accumulator.
3096       tmp2 =  *++regs;
3097 
3098       Ri =  x6;         // Inner and outer loop indexes.
3099       Rj =  x7;
3100 
3101       Rhi_ab = x28;     // Product registers: low and high parts
3102       Rlo_ab = x29;     // of a*b and m*n.
3103       Rhi_mn = x30;
3104       Rlo_mn = x31;
3105 
3106       // x18 and up are callee-saved.
3107       _toSave = RegSet::range(x18, *regs) + Pm_base;
3108     }
3109 
3110   private:
3111     void save_regs() {
3112       push_reg(_toSave, sp);
3113     }
3114 
3115     void restore_regs() {
3116       pop_reg(_toSave, sp);
3117     }
3118 
3119     template <typename T>
3120     void unroll_2(Register count, T block) {
3121       Label loop, end, odd;
3122       beqz(count, end);
3123       test_bit(t0, count, 0);
3124       bnez(t0, odd);
3125       align(16);
3126       bind(loop);
3127       (this->*block)();
3128       bind(odd);
3129       (this->*block)();
3130       addi(count, count, -2);
3131       bgtz(count, loop);
3132       bind(end);
3133     }
3134 
3135     template <typename T>
3136     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
3137       Label loop, end, odd;
3138       beqz(count, end);
3139       test_bit(tmp, count, 0);
3140       bnez(tmp, odd);
3141       align(16);
3142       bind(loop);
3143       (this->*block)(d, s, tmp);
3144       bind(odd);
3145       (this->*block)(d, s, tmp);
3146       addi(count, count, -2);
3147       bgtz(count, loop);
3148       bind(end);
3149     }
3150 
3151     void pre1(RegisterOrConstant i) {
3152       block_comment("pre1");
3153       // Pa = Pa_base;
3154       // Pb = Pb_base + i;
3155       // Pm = Pm_base;
3156       // Pn = Pn_base + i;
3157       // Ra = *Pa;
3158       // Rb = *Pb;
3159       // Rm = *Pm;
3160       // Rn = *Pn;
3161       if (i.is_register()) {
3162         slli(t0, i.as_register(), LogBytesPerWord);
3163       } else {
3164         mv(t0, i.as_constant());
3165         slli(t0, t0, LogBytesPerWord);
3166       }
3167 
3168       mv(Pa, Pa_base);
3169       add(Pb, Pb_base, t0);
3170       mv(Pm, Pm_base);
3171       add(Pn, Pn_base, t0);
3172 
3173       ld(Ra, Address(Pa));
3174       ld(Rb, Address(Pb));
3175       ld(Rm, Address(Pm));
3176       ld(Rn, Address(Pn));
3177 
3178       // Zero the m*n result.
3179       mv(Rhi_mn, zr);
3180       mv(Rlo_mn, zr);
3181     }
3182 
3183     // The core multiply-accumulate step of a Montgomery
3184     // multiplication.  The idea is to schedule operations as a
3185     // pipeline so that instructions with long latencies (loads and
3186     // multiplies) have time to complete before their results are
3187     // used.  This most benefits in-order implementations of the
3188     // architecture but out-of-order ones also benefit.
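         // In the commented pseudo-code below, MACC(A, B, t0, t1, t2) denotes a
         // multiply-accumulate into the triple-precision accumulator (t2:t1:t0),
         // i.e. roughly (illustrative):
         //
         //   (t2:t1:t0) += (uint128_t)A * B;
         //
         // realized here as mulhu/mul to form the product halves and acc() (below) to
         // fold them into tmp0/tmp1/tmp2 with carry propagation through t0.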
3189     void step() {
3190       block_comment("step");
3191       // MACC(Ra, Rb, tmp0, tmp1, tmp2);
3192       // Ra = *++Pa;
3193       // Rb = *--Pb;
3194       mulhu(Rhi_ab, Ra, Rb);
3195       mul(Rlo_ab, Ra, Rb);
3196       addi(Pa, Pa, wordSize);
3197       ld(Ra, Address(Pa));
3198       addi(Pb, Pb, -wordSize);
3199       ld(Rb, Address(Pb));
3200       acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2); // The pending m*n from the
3201                                             // previous iteration.
3202       // MACC(Rm, Rn, tmp0, tmp1, tmp2);
3203       // Rm = *++Pm;
3204       // Rn = *--Pn;
3205       mulhu(Rhi_mn, Rm, Rn);
3206       mul(Rlo_mn, Rm, Rn);
3207       addi(Pm, Pm, wordSize);
3208       ld(Rm, Address(Pm));
3209       addi(Pn, Pn, -wordSize);
3210       ld(Rn, Address(Pn));
3211       acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2);
3212     }
3213 
3214     void post1() {
3215       block_comment("post1");
3216 
3217       // MACC(Ra, Rb, tmp0, tmp1, tmp2);
3218       // Ra = *++Pa;
3219       // Rb = *--Pb;
3220       mulhu(Rhi_ab, Ra, Rb);
3221       mul(Rlo_ab, Ra, Rb);
3222       acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2);  // The pending m*n
3223       acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2);
3224 
3225       // *Pm = Rm = tmp0 * inv;
3226       mul(Rm, tmp0, inv);
3227       sd(Rm, Address(Pm));
3228 
3229       // MACC(Rm, Rn, tmp0, tmp1, tmp2);
3230       // tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0;
3231       mulhu(Rhi_mn, Rm, Rn);
3232 
3233 #ifndef PRODUCT
3234       // assert(m[i] * n[0] + tmp0 == 0, "broken Montgomery multiply");
3235       {
3236         mul(Rlo_mn, Rm, Rn);
3237         add(Rlo_mn, tmp0, Rlo_mn);
3238         Label ok;
3239         beqz(Rlo_mn, ok);
3240         stop("broken Montgomery multiply");
3241         bind(ok);
3242       }
3243 #endif
3244       // We have very carefully set things up so that
3245       // m[i]*n[0] + tmp0 == 0 (mod b), so we don't have to calculate
3246       // the lower half of Rm * Rn because we know the result already:
3247       // it must be -tmp0.  tmp0 + (-tmp0) must generate a carry iff
3248       // tmp0 != 0.  So, rather than do a mul and a cad we just set
3249       // the carry flag iff tmp0 is nonzero.
3250       //
3251       // mul(Rlo_mn, Rm, Rn);
3252       // cad(zr, tmp0, Rlo_mn);
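           // In other words (illustrative): t0 = tmp0 - 1 wraps to all-ones when tmp0 == 0,
           // so sltu(t0, tmp0 - 1, tmp0) yields exactly the carry of tmp0 + (-tmp0):
           //   tmp0 == 0  ->  t0 = 0 (no carry)
           //   tmp0 != 0  ->  t0 = 1 (carry)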
3253       addi(t0, tmp0, -1);
3254       sltu(t0, t0, tmp0); // Set carry iff tmp0 is nonzero
3255       cadc(tmp0, tmp1, Rhi_mn, t0);
3256       adc(tmp1, tmp2, zr, t0);
3257       mv(tmp2, zr);
3258     }
3259 
3260     void pre2(Register i, Register len) {
3261       block_comment("pre2");
3262       // Pa = Pa_base + i-len;
3263       // Pb = Pb_base + len;
3264       // Pm = Pm_base + i-len;
3265       // Pn = Pn_base + len;
3266 
3267       sub(Rj, i, len);
3268       // Rj == i-len
3269 
3270       // Ra as temp register
3271       slli(Ra, Rj, LogBytesPerWord);
3272       add(Pa, Pa_base, Ra);
3273       add(Pm, Pm_base, Ra);
3274       slli(Ra, len, LogBytesPerWord);
3275       add(Pb, Pb_base, Ra);
3276       add(Pn, Pn_base, Ra);
3277 
3278       // Ra = *++Pa;
3279       // Rb = *--Pb;
3280       // Rm = *++Pm;
3281       // Rn = *--Pn;
3282       add(Pa, Pa, wordSize);
3283       ld(Ra, Address(Pa));
3284       add(Pb, Pb, -wordSize);
3285       ld(Rb, Address(Pb));
3286       add(Pm, Pm, wordSize);
3287       ld(Rm, Address(Pm));
3288       add(Pn, Pn, -wordSize);
3289       ld(Rn, Address(Pn));
3290 
3291       mv(Rhi_mn, zr);
3292       mv(Rlo_mn, zr);
3293     }
3294 
3295     void post2(Register i, Register len) {
3296       block_comment("post2");
3297       sub(Rj, i, len);
3298 
3299       cad(tmp0, tmp0, Rlo_mn, t0); // The pending m*n, low part
3300 
3301       // As soon as we know the least significant digit of our result,
3302       // store it.
3303       // Pm_base[i-len] = tmp0;
3304       // Rj as temp register
3305       slli(Rj, Rj, LogBytesPerWord);
3306       add(Rj, Pm_base, Rj);
3307       sd(tmp0, Address(Rj));
3308 
3309       // tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0;
3310       cadc(tmp0, tmp1, Rhi_mn, t0); // The pending m*n, high part
3311       adc(tmp1, tmp2, zr, t0);
3312       mv(tmp2, zr);
3313     }
3314 
3315     // A carry in tmp0 after Montgomery multiplication means that we
3316     // should subtract multiples of n from our result in m.  We'll
3317     // keep doing that until there is no carry.
3318     void normalize(Register len) {
3319       block_comment("normalize");
3320       // while (tmp0)
3321       //   tmp0 = sub(Pm_base, Pn_base, tmp0, len);
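           // Here sub() is a multi-precision "m -= n" over len words, done below in
           // two's-complement form (m[i] + ~n[i] + carry, with carry == 1 meaning "no borrow"),
           // and the outgoing carry is folded back into tmp0 (tmp0 = tmp0 - 1 + carry) so the
           // outer loop repeats until the excess carry is gone. (Descriptive note only.)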
3322       Label loop, post, again;
3323       Register cnt = tmp1, i = tmp2; // Re-use registers; we're done with them now
3324       beqz(tmp0, post); {
3325         bind(again); {
3326           mv(i, zr);
3327           mv(cnt, len);
3328           slli(Rn, i, LogBytesPerWord);
3329           add(Rm, Pm_base, Rn);
3330           ld(Rm, Address(Rm));
3331           add(Rn, Pn_base, Rn);
3332           ld(Rn, Address(Rn));
3333           mv(t0, 1); // set carry flag, i.e. no borrow
3334           align(16);
3335           bind(loop); {
3336             notr(Rn, Rn);
3337             add(Rm, Rm, t0);
3338             add(Rm, Rm, Rn);
3339             sltu(t0, Rm, Rn);
3340             slli(Rn, i, LogBytesPerWord); // Rn as temp register
3341             add(Rn, Pm_base, Rn);
3342             sd(Rm, Address(Rn));
3343             add(i, i, 1);
3344             slli(Rn, i, LogBytesPerWord);
3345             add(Rm, Pm_base, Rn);
3346             ld(Rm, Address(Rm));
3347             add(Rn, Pn_base, Rn);
3348             ld(Rn, Address(Rn));
3349             sub(cnt, cnt, 1);
3350           } bnez(cnt, loop);
3351           addi(tmp0, tmp0, -1);
3352           add(tmp0, tmp0, t0);
3353         } bnez(tmp0, again);
3354       } bind(post);
3355     }
3356 
3357     // Move memory at s to d, reversing words.
3358     //    Increments d to end of copied memory
3359     //    Destroys tmp1, tmp2
3360     //    Preserves len
3361     //    Leaves s pointing to the address which was in d at start
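         //    Example (illustrative): with len == 2 and s = { A1:A0, B1:B0 } (two 64-bit
         //    words, each shown as hi:lo 32-bit halves), d receives { B0:B1, A0:A1 }:
         //    the word order is reversed and each word is rotated by 32 bits, converting
         //    between the caller's int ordering and the longword ordering used internally.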
3362     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
3363       assert(tmp1->encoding() < x28->encoding(), "register corruption");
3364       assert(tmp2->encoding() < x28->encoding(), "register corruption");
3365 
3366       shadd(s, len, s, tmp1, LogBytesPerWord);
3367       mv(tmp1, len);
3368       unroll_2(tmp1,  &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
3369       slli(tmp1, len, LogBytesPerWord);
3370       sub(s, d, tmp1);
3371     }
3372     // [63...0] -> [31...0][63...32]
3373     void reverse1(Register d, Register s, Register tmp) {
3374       addi(s, s, -wordSize);
3375       ld(tmp, Address(s));
3376       ror_imm(tmp, tmp, 32, t0);
3377       sd(tmp, Address(d));
3378       addi(d, d, wordSize);
3379     }
3380 
3381     void step_squaring() {
3382       // An extra ACC
3383       step();
3384       acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2);
3385     }
3386 
3387     void last_squaring(Register i) {
3388       Label dont;
3389       // if ((i & 1) == 0) {
3390       test_bit(t0, i, 0);
3391       bnez(t0, dont); {
3392         // MACC(Ra, Rb, tmp0, tmp1, tmp2);
3393         // Ra = *++Pa;
3394         // Rb = *--Pb;
3395         mulhu(Rhi_ab, Ra, Rb);
3396         mul(Rlo_ab, Ra, Rb);
3397         acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2);
3398       } bind(dont);
3399     }
3400 
3401     void extra_step_squaring() {
3402       acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2);  // The pending m*n
3403 
3404       // MACC(Rm, Rn, tmp0, tmp1, tmp2);
3405       // Rm = *++Pm;
3406       // Rn = *--Pn;
3407       mulhu(Rhi_mn, Rm, Rn);
3408       mul(Rlo_mn, Rm, Rn);
3409       addi(Pm, Pm, wordSize);
3410       ld(Rm, Address(Pm));
3411       addi(Pn, Pn, -wordSize);
3412       ld(Rn, Address(Pn));
3413     }
3414 
3415     void post1_squaring() {
3416       acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2);  // The pending m*n
3417 
3418       // *Pm = Rm = tmp0 * inv;
3419       mul(Rm, tmp0, inv);
3420       sd(Rm, Address(Pm));
3421 
3422       // MACC(Rm, Rn, tmp0, tmp1, tmp2);
3423       // tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0;
3424       mulhu(Rhi_mn, Rm, Rn);
3425 
3426 #ifndef PRODUCT
3427       // assert(m[i] * n[0] + tmp0 == 0, "broken Montgomery multiply");
3428       {
3429         mul(Rlo_mn, Rm, Rn);
3430         add(Rlo_mn, tmp0, Rlo_mn);
3431         Label ok;
3432         beqz(Rlo_mn, ok); {
3433           stop("broken Montgomery multiply");
3434         } bind(ok);
3435       }
3436 #endif
3437       // We have very carefully set things up so that
3438       // m[i]*n[0] + tmp0 == 0 (mod b), so we don't have to calculate
3439       // the lower half of Rm * Rn because we know the result already:
3440       // it must be -tmp0.  tmp0 + (-tmp0) must generate a carry iff
3441       // tmp0 != 0.  So, rather than do a mul and a cad we just set
3442       // the carry flag iff tmp0 is nonzero.
3443       //
3444       // mul(Rlo_mn, Rm, Rn);
3445       // cad(zr, tmp0, Rlo_mn);
3446       addi(t0, tmp0, -1);
3447       sltu(t0, t0, tmp0); // Set carry iff tmp0 is nonzero
3448       cadc(tmp0, tmp1, Rhi_mn, t0);
3449       adc(tmp1, tmp2, zr, t0);
3450       mv(tmp2, zr);
3451     }
3452 
3453     // use t0 as carry
3454     void acc(Register Rhi, Register Rlo,
3455              Register tmp0, Register tmp1, Register tmp2) {
3456       cad(tmp0, tmp0, Rlo, t0);
3457       cadc(tmp1, tmp1, Rhi, t0);
3458       adc(tmp2, tmp2, zr, t0);
3459     }
3460 
3461   public:
3462     /**
3463      * Fast Montgomery multiplication.  The derivation of the
3464      * algorithm is in A Cryptographic Library for the Motorola
3465      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
3466      *
3467      * Arguments:
3468      *
3469      * Inputs for multiplication:
3470      *   c_rarg0   - int array elements a
3471      *   c_rarg1   - int array elements b
3472      *   c_rarg2   - int array elements n (the modulus)
3473      *   c_rarg3   - int length
3474      *   c_rarg4   - int inv
3475      *   c_rarg5   - int array elements m (the result)
3476      *
3477      * Inputs for squaring:
3478      *   c_rarg0   - int array elements a
3479      *   c_rarg1   - int array elements n (the modulus)
3480      *   c_rarg2   - int length
3481      *   c_rarg3   - int inv
3482      *   c_rarg4   - int array elements m (the result)
3483      *
3484      */
3485     address generate_multiply() {
3486       Label argh, nothing;
3487       bind(argh);
3488       stop("MontgomeryMultiply total_allocation must be <= 8192");
3489 
3490       align(CodeEntryAlignment);
3491       address entry = pc();
3492 
3493       beqz(Rlen, nothing);
3494 
3495       enter();
3496 
3497       // Make room.
3498       mv(Ra, 512);
3499       bgt(Rlen, Ra, argh);
3500       slli(Ra, Rlen, exact_log2(4 * sizeof(jint)));
3501       sub(Ra, sp, Ra);
3502       andi(sp, Ra, -2 * wordSize);
3503 
3504       srliw(Rlen, Rlen, 1);  // length in longwords = len/2
3505 
3506       {
3507         // Copy input args, reversing as we go.  We use Ra as a
3508         // temporary variable.
3509         reverse(Ra, Pa_base, Rlen, Ri, Rj);
3510         if (!_squaring)
3511           reverse(Ra, Pb_base, Rlen, Ri, Rj);
3512         reverse(Ra, Pn_base, Rlen, Ri, Rj);
3513       }
3514 
3515       // Push all call-saved registers and also Pm_base which we'll need
3516       // at the end.
3517       save_regs();
3518 
3519 #ifndef PRODUCT
3520       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
3521       {
3522         ld(Rn, Address(Pn_base));
3523         mul(Rlo_mn, Rn, inv);
3524         mv(t0, -1);
3525         Label ok;
3526         beq(Rlo_mn, t0, ok);
3527         stop("broken inverse in Montgomery multiply");
3528         bind(ok);
3529       }
3530 #endif
3531 
3532       mv(Pm_base, Ra);
3533 
3534       mv(tmp0, zr);
3535       mv(tmp1, zr);
3536       mv(tmp2, zr);
3537 
3538       block_comment("for (int i = 0; i < len; i++) {");
3539       mv(Ri, zr); {
3540         Label loop, end;
3541         bge(Ri, Rlen, end);
3542 
3543         bind(loop);
3544         pre1(Ri);
3545 
3546         block_comment("  for (j = i; j; j--) {"); {
3547           mv(Rj, Ri);
3548           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
3549         } block_comment("  } // j");
3550 
3551         post1();
3552         addw(Ri, Ri, 1);
3553         blt(Ri, Rlen, loop);
3554         bind(end);
3555         block_comment("} // i");
3556       }
3557 
3558       block_comment("for (int i = len; i < 2*len; i++) {");
3559       mv(Ri, Rlen); {
3560         Label loop, end;
3561         slli(t0, Rlen, 1);
3562         bge(Ri, t0, end);
3563 
3564         bind(loop);
3565         pre2(Ri, Rlen);
3566 
3567         block_comment("  for (j = len*2-i-1; j; j--) {"); {
3568           slliw(Rj, Rlen, 1);
3569           subw(Rj, Rj, Ri);
3570           subw(Rj, Rj, 1);
3571           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
3572         } block_comment("  } // j");
3573 
3574         post2(Ri, Rlen);
3575         addw(Ri, Ri, 1);
3576         slli(t0, Rlen, 1);
3577         blt(Ri, t0, loop);
3578         bind(end);
3579       }
3580       block_comment("} // i");
3581 
3582       normalize(Rlen);
3583 
3584       mv(Ra, Pm_base);  // Save Pm_base in Ra
3585       restore_regs();  // Restore caller's Pm_base
3586 
3587       // Copy our result into caller's Pm_base
3588       reverse(Pm_base, Ra, Rlen, Ri, Rj);
3589 
3590       leave();
3591       bind(nothing);
3592       ret();
3593 
3594       return entry;
3595     }
3596 
3597     /**
3598      *
3599      * Arguments:
3600      *
3601      * Inputs:
3602      *   c_rarg0   - int array elements a
3603      *   c_rarg1   - int array elements n (the modulus)
3604      *   c_rarg2   - int length
3605      *   c_rarg3   - int inv
3606      *   c_rarg4   - int array elements m (the result)
3607      *
3608      */
3609     address generate_square() {
3610       Label argh;
3611       bind(argh);
3612       stop("MontgomeryMultiply total_allocation must be <= 8192");
3613 
3614       align(CodeEntryAlignment);
3615       address entry = pc();
3616 
3617       enter();
3618 
3619       // Make room.
3620       mv(Ra, 512);
3621       bgt(Rlen, Ra, argh);
3622       slli(Ra, Rlen, exact_log2(4 * sizeof(jint)));
3623       sub(Ra, sp, Ra);
3624       andi(sp, Ra, -2 * wordSize);
3625 
3626       srliw(Rlen, Rlen, 1);  // length in longwords = len/2
3627 
3628       {
3629         // Copy input args, reversing as we go.  We use Ra as a
3630         // temporary variable.
3631         reverse(Ra, Pa_base, Rlen, Ri, Rj);
3632         reverse(Ra, Pn_base, Rlen, Ri, Rj);
3633       }
3634 
3635       // Push all call-saved registers and also Pm_base which we'll need
3636       // at the end.
3637       save_regs();
3638 
3639       mv(Pm_base, Ra);
3640 
3641       mv(tmp0, zr);
3642       mv(tmp1, zr);
3643       mv(tmp2, zr);
3644 
3645       block_comment("for (int i = 0; i < len; i++) {");
3646       mv(Ri, zr); {
3647         Label loop, end;
3648         bind(loop);
3649         bge(Ri, Rlen, end);
3650 
3651         pre1(Ri);
3652 
3653         block_comment("for (j = (i+1)/2; j; j--) {"); {
3654           addi(Rj, Ri, 1);
3655           srliw(Rj, Rj, 1);
3656           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
3657         } block_comment("  } // j");
3658 
3659         last_squaring(Ri);
3660 
3661         block_comment("  for (j = i/2; j; j--) {"); {
3662           srliw(Rj, Ri, 1);
3663           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
3664         } block_comment("  } // j");
3665 
3666         post1_squaring();
3667         addi(Ri, Ri, 1);
3668         blt(Ri, Rlen, loop);
3669 
3670         bind(end);
3671         block_comment("} // i");
3672       }
3673 
3674       block_comment("for (int i = len; i < 2*len; i++) {");
3675       mv(Ri, Rlen); {
3676         Label loop, end;
3677         bind(loop);
3678         slli(t0, Rlen, 1);
3679         bge(Ri, t0, end);
3680 
3681         pre2(Ri, Rlen);
3682 
3683         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
3684           slli(Rj, Rlen, 1);
3685           sub(Rj, Rj, Ri);
3686           sub(Rj, Rj, 1);
3687           srliw(Rj, Rj, 1);
3688           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
3689         } block_comment("  } // j");
3690 
3691         last_squaring(Ri);
3692 
3693         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
3694           slli(Rj, Rlen, 1);
3695           sub(Rj, Rj, Ri);
3696           srliw(Rj, Rj, 1);
3697           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
3698         } block_comment("  } // j");
3699 
3700         post2(Ri, Rlen);
3701         addi(Ri, Ri, 1);
3702         slli(t0, Rlen, 1);
3703         blt(Ri, t0, loop);
3704 
3705         bind(end);
3706         block_comment("} // i");
3707       }
3708 
3709       normalize(Rlen);
3710 
3711       mv(Ra, Pm_base);  // Save Pm_base in Ra
3712       restore_regs();  // Restore caller's Pm_base
3713 
3714       // Copy our result into caller's Pm_base
3715       reverse(Pm_base, Ra, Rlen, Ri, Rj);
3716 
3717       leave();
3718       ret();
3719 
3720       return entry;
3721     }
3722   };
3723 
3724 #endif // COMPILER2
3725 
3726   address generate_cont_thaw(Continuation::thaw_kind kind) {
3727     bool return_barrier = Continuation::is_thaw_return_barrier(kind);
3728     bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);
3729 
3730     address start = __ pc();
3731 
3732     if (return_barrier) {
3733       __ ld(sp, Address(xthread, JavaThread::cont_entry_offset()));
3734     }
3735 
3736 #ifndef PRODUCT
3737     {
3738       Label OK;
3739       __ ld(t0, Address(xthread, JavaThread::cont_entry_offset()));
3740       __ beq(sp, t0, OK);
3741       __ stop("incorrect sp");
3742       __ bind(OK);
3743     }
3744 #endif
3745 
3746     if (return_barrier) {
3747       // preserve possible return value from a method returning to the return barrier
3748       __ sub(sp, sp, 2 * wordSize);
3749       __ fsd(f10, Address(sp, 0 * wordSize));
3750       __ sd(x10, Address(sp, 1 * wordSize));
3751     }
3752 
3753     __ mv(c_rarg1, (return_barrier ? 1 : 0));
3754     __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), xthread, c_rarg1);
3755     __ mv(t1, x10); // x10 contains the size of the frames to thaw, 0 if overflow or no more frames
3756 
3757     if (return_barrier) {
3758       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
3759       __ ld(x10, Address(sp, 1 * wordSize));
3760       __ fld(f10, Address(sp, 0 * wordSize));
3761       __ add(sp, sp, 2 * wordSize);
3762     }
3763 
3764 #ifndef PRODUCT
3765     {
3766       Label OK;
3767       __ ld(t0, Address(xthread, JavaThread::cont_entry_offset()));
3768       __ beq(sp, t0, OK);
3769       __ stop("incorrect sp");
3770       __ bind(OK);
3771     }
3772 #endif
3773 
3774     Label thaw_success;
3775     // t1 contains the size of the frames to thaw, 0 if overflow or no more frames
3776     __ bnez(t1, thaw_success);
3777     __ la(t0, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry()));
3778     __ jr(t0);
3779     __ bind(thaw_success);
3780 
3781     // make room for the thawed frames
3782     __ sub(t0, sp, t1);
3783     __ andi(sp, t0, -16); // align
3784 
3785     if (return_barrier) {
3786       // save original return value -- again
3787       __ sub(sp, sp, 2 * wordSize);
3788       __ fsd(f10, Address(sp, 0 * wordSize));
3789       __ sd(x10, Address(sp, 1 * wordSize));
3790     }
3791 
3792     // If we want, we can templatize thaw by kind, and have three different entries
3793     __ mv(c_rarg1, kind);
3794 
3795     __ call_VM_leaf(Continuation::thaw_entry(), xthread, c_rarg1);
3796     __ mv(t1, x10); // x10 is the sp of the yielding frame
3797 
3798     if (return_barrier) {
3799       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
3800       __ ld(x10, Address(sp, 1 * wordSize));
3801       __ fld(f10, Address(sp, 0 * wordSize));
3802       __ add(sp, sp, 2 * wordSize);
3803     } else {
3804       __ mv(x10, zr); // return 0 (success) from doYield
3805     }
3806 
3807     // we're now on the yield frame (which is at an address above us because sp has been pushed down)
3808     __ mv(fp, t1);
3809     __ sub(sp, t1, 2 * wordSize); // now pointing to fp spill
3810 
3811     if (return_barrier_exception) {
3812       __ ld(c_rarg1, Address(fp, -1 * wordSize)); // return address
3813       __ verify_oop(x10);
3814       __ mv(x9, x10); // save return value containing the exception oop in callee-saved x9
3815 
3816       __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), xthread, c_rarg1);
3817 
3818       // see OptoRuntime::generate_exception_blob: x10 -- exception oop, x13 -- exception pc
3819 
3820       __ mv(x11, x10); // the exception handler
3821       __ mv(x10, x9); // restore return value containing the exception oop
3822       __ verify_oop(x10);
3823 
3824       __ leave();
3825       __ mv(x13, ra);
3826       __ jr(x11); // the exception handler
3827     } else {
3828       // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
3829       __ leave();
3830       __ ret();
3831     }
3832 
3833     return start;
3834   }
3835 
3836   address generate_cont_thaw() {
3837     if (!Continuations::enabled()) return nullptr;
3838 
3839     StubCodeMark mark(this, "StubRoutines", "Cont thaw");
3840     address start = __ pc();
3841     generate_cont_thaw(Continuation::thaw_top);
3842     return start;
3843   }
3844 
3845   address generate_cont_returnBarrier() {
3846     if (!Continuations::enabled()) return nullptr;
3847 
3848     // TODO: will probably need multiple return barriers depending on return type
3849     StubCodeMark mark(this, "StubRoutines", "cont return barrier");
3850     address start = __ pc();
3851 
3852     generate_cont_thaw(Continuation::thaw_return_barrier);
3853 
3854     return start;
3855   }
3856 
3857   address generate_cont_returnBarrier_exception() {
3858     if (!Continuations::enabled()) return nullptr;
3859 
3860     StubCodeMark mark(this, "StubRoutines", "cont return barrier exception handler");
3861     address start = __ pc();
3862 
3863     generate_cont_thaw(Continuation::thaw_return_barrier_exception);
3864 
3865     return start;
3866   }
3867 
3868 #if COMPILER2_OR_JVMCI
3869 
3870 #undef __
3871 #define __ this->
3872 
3873   class Sha2Generator : public MacroAssembler {
3874     StubCodeGenerator* _cgen;
3875    public:
3876       Sha2Generator(MacroAssembler* masm, StubCodeGenerator* cgen) : MacroAssembler(masm->code()), _cgen(cgen) {}
3877       address generate_sha256_implCompress(bool multi_block) {
3878         return generate_sha2_implCompress(Assembler::e32, multi_block);
3879       }
3880       address generate_sha512_implCompress(bool multi_block) {
3881         return generate_sha2_implCompress(Assembler::e64, multi_block);
3882       }
3883    private:
3884 
3885     void vleXX_v(Assembler::SEW vset_sew, VectorRegister vr, Register sr) {
3886       if (vset_sew == Assembler::e32) __ vle32_v(vr, sr);
3887       else                            __ vle64_v(vr, sr);
3888     }
3889 
3890     void vseXX_v(Assembler::SEW vset_sew, VectorRegister vr, Register sr) {
3891       if (vset_sew == Assembler::e32) __ vse32_v(vr, sr);
3892       else                            __ vse64_v(vr, sr);
3893     }
3894 
3895     // Overview of the logic in each "quad round".
3896     //
3897     // The code below repeats 16/20 times the logic implementing four rounds
3898     // of the SHA-256/512 core loop as documented by NIST: 16/20 "quad rounds"
3899     // correspond to the 64/80 single rounds.
3900     //
3901     //    // Load four word (u32/64) constants (K[t+3], K[t+2], K[t+1], K[t+0])
3902     //    // Output:
3903     //    //   vTmp1 = {K[t+3], K[t+2], K[t+1], K[t+0]}
3904     //    vl1reXX.v vTmp1, ofs
3905     //
3906     //    // Increment word constant address by stride (16/32 bytes, 4*4B/8B, 128b/256b)
3907     //    addi ofs, ofs, 16/32
3908     //
3909     //    // Add constants to message schedule words:
3910     //    //  Input
3911     //    //    vTmp1 = {K[t+3], K[t+2], K[t+1], K[t+0]}
3912     //    //    vW0 = {W[t+3], W[t+2], W[t+1], W[t+0]}; // Vt0 = W[3:0];
3913     //    //  Output
3914     //    //    vTmp0 = {W[t+3]+K[t+3], W[t+2]+K[t+2], W[t+1]+K[t+1], W[t+0]+K[t+0]}
3915     //    vadd.vv vTmp0, vTmp1, vW0
3916     //
3917     //    //  2 rounds of working variables updates.
3918     //    //     vState1[t+4] <- vState1[t], vState0[t], vTmp0[t]
3919     //    //  Input:
3920     //    //    vState1 = {c[t],d[t],g[t],h[t]}   " = vState1[t] "
3921     //    //    vState0 = {a[t],b[t],e[t],f[t]}
3922     //    //    vTmp0 = {W[t+3]+K[t+3], W[t+2]+K[t+2], W[t+1]+K[t+1], W[t+0]+K[t+0]}
3923     //    //  Output:
3924     //    //    vState1 = {f[t+2],e[t+2],b[t+2],a[t+2]}  " = vState0[t+2] "
3925     //    //        = {h[t+4],g[t+4],d[t+4],c[t+4]}  " = vState1[t+4] "
3926     //    vsha2cl.vv vState1, vState0, vTmp0
3927     //
3928     //    //  2 rounds of working variables updates.
3929     //    //     vState0[t+4] <- vState0[t], vState0[t+2], vTmp0[t]
3930     //    //  Input
3931     //    //   vState0 = {a[t],b[t],e[t],f[t]}       " = vState0[t] "
3932     //    //       = {h[t+2],g[t+2],d[t+2],c[t+2]}   " = vState1[t+2] "
3933     //    //   vState1 = {f[t+2],e[t+2],b[t+2],a[t+2]}   " = vState0[t+2] "
3934     //    //   vTmp0 = {W[t+3]+K[t+3], W[t+2]+K[t+2], W[t+1]+K[t+1], W[t+0]+K[t+0]}
3935     //    //  Output:
3936     //    //   vState0 = {f[t+4],e[t+4],b[t+4],a[t+4]}   " = vState0[t+4] "
3937     //    vsha2ch.vv vState0, vState1, vTmp0
3938     //
3939     //    // Combine 2QW into 1QW
3940     //    //
3941     //    // To generate the next 4 words, "new_vW0"/"vTmp0" from vW0-vW3, vsha2ms needs
3942     //    //     vW0[0..3], vW1[0], vW2[1..3], vW3[0, 2..3]
3943     //    // and it can only take 3 vectors as inputs. Hence we need to combine
3944     //    // vW1[0] and vW2[1..3] in a single vector.
3945     //    //
3946     //    // vmerge Vt4, Vt1, Vt2, V0
3947     //    // Input
3948     //    //  V0 = mask // first word from vW2, 1..3 words from vW1
3949     //    //  vW2 = {Wt-8, Wt-7, Wt-6, Wt-5}
3950     //    //  vW1 = {Wt-12, Wt-11, Wt-10, Wt-9}
3951     //    // Output
3952     //    //  Vt4 = {Wt-12, Wt-7, Wt-6, Wt-5}
3953     //    vmerge.vvm vTmp0, vW2, vW1, v0
3954     //
3955     //    // Generate next Four Message Schedule Words (hence allowing for 4 more rounds)
3956     //    // Input
3957     //    //  vW0 = {W[t+ 3], W[t+ 2], W[t+ 1], W[t+ 0]}     W[ 3: 0]
3958     //    //  vW3 = {W[t+15], W[t+14], W[t+13], W[t+12]}     W[15:12]
3959     //    //  vTmp0 = {W[t+11], W[t+10], W[t+ 9], W[t+ 4]}     W[11: 9,4]
3960     //    // Output (next four message schedule words)
3961     //    //  vW0 = {W[t+19],  W[t+18],  W[t+17],  W[t+16]}  W[19:16]
3962     //    vsha2ms.vv vW0, vTmp0, vW3
3963     //
3964     // BEFORE
3965     //  vW0 - vW3 hold the message schedule words (initially the block words)
3966     //    vW0 = W[ 3: 0]   "oldest"
3967     //    vW1 = W[ 7: 4]
3968     //    vW2 = W[11: 8]
3969     //    vW3 = W[15:12]   "newest"
3970     //
3971     //  vState0 - vState1 hold the working state variables
3972     //    vState0 = {a[t],b[t],e[t],f[t]}   // initially {H5,H4,H1,H0}
3973     //    vState1 = {c[t],d[t],g[t],h[t]}   // initially {H7,H6,H3,H2}
3974     //
3975     // AFTER
3976     //  vW0 - vW3 hold the message schedule words (initially the block words)
3977     //    vW1 = W[ 7: 4]   "oldest"
3978     //    vW2 = W[11: 8]
3979     //    vW3 = W[15:12]
3980     //    vW0 = W[19:16]   "newest"
3981     //
3982     //  vState0 and vState1 hold the working state variables
3983     //    vState0 = {a[t+4],b[t+4],e[t+4],f[t+4]}
3984     //    vState1 = {c[t+4],d[t+4],g[t+4],h[t+4]}
3985     //
3986     //  The group of vectors vW0,vW1,vW2,vW3 is "rotated" by one in each quad-round,
3987     //  hence the uses of those vectors rotate in each round, and we get back to the
3988     //  initial configuration every 4 quad-rounds. We could avoid those changes at
3989     //  the cost of moving those vectors at the end of each quad-rounds.
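         // In sha2_quad_round below, rot1..rot4 are vW0..vW3 under the current rotation
         // (rot1 the oldest group, rot4 the newest), vtemp/vtemp2 are the Kt and Wt+Kt
         // temporaries, and v_abef/v_cdgh are vState0/vState1. gen_words is cleared for
         // the final four quad-rounds (no further Wt are needed) and step_const for the
         // very last one (no further constants to fetch). (Descriptive note.)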
3990     void sha2_quad_round(Assembler::SEW vset_sew, VectorRegister rot1, VectorRegister rot2, VectorRegister rot3, VectorRegister rot4,
3991                          Register scalarconst, VectorRegister vtemp, VectorRegister vtemp2, VectorRegister v_abef, VectorRegister v_cdgh,
3992                          bool gen_words = true, bool step_const = true) {
3993       __ vleXX_v(vset_sew, vtemp, scalarconst);
3994       if (step_const) {
3995         __ addi(scalarconst, scalarconst, vset_sew == Assembler::e32 ? 16 : 32);
3996       }
3997       __ vadd_vv(vtemp2, vtemp, rot1);
3998       __ vsha2cl_vv(v_cdgh, v_abef, vtemp2);
3999       __ vsha2ch_vv(v_abef, v_cdgh, vtemp2);
4000       if (gen_words) {
4001         __ vmerge_vvm(vtemp2, rot3, rot2);
4002         __ vsha2ms_vv(rot1, vtemp2, rot4);
4003       }
4004     }
4005 
4006     const char* stub_name(Assembler::SEW vset_sew, bool multi_block) {
4007       if (vset_sew == Assembler::e32 && !multi_block) return "sha256_implCompress";
4008       if (vset_sew == Assembler::e32 &&  multi_block) return "sha256_implCompressMB";
4009       if (vset_sew == Assembler::e64 && !multi_block) return "sha512_implCompress";
4010       if (vset_sew == Assembler::e64 &&  multi_block) return "sha512_implCompressMB";
4011       ShouldNotReachHere();
4012       return "bad name lookup";
4013     }
4014 
4015     // Arguments:
4016     //
4017     // Inputs:
4018     //   c_rarg0   - byte[]  source+offset
4019     //   c_rarg1   - int[]   SHA.state
4020     //   c_rarg2   - int     offset
4021     //   c_rarg3   - int     limit
4022     //
4023     address generate_sha2_implCompress(Assembler::SEW vset_sew, bool multi_block) {
4024       alignas(64) static const uint32_t round_consts_256[64] = {
4025         0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
4026         0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
4027         0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
4028         0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
4029         0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
4030         0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
4031         0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
4032         0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
4033         0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
4034         0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
4035         0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
4036         0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
4037         0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
4038         0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
4039         0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
4040         0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
4041       };
4042       alignas(64) static const uint64_t round_consts_512[80] = {
4043         0x428a2f98d728ae22l, 0x7137449123ef65cdl, 0xb5c0fbcfec4d3b2fl,
4044         0xe9b5dba58189dbbcl, 0x3956c25bf348b538l, 0x59f111f1b605d019l,
4045         0x923f82a4af194f9bl, 0xab1c5ed5da6d8118l, 0xd807aa98a3030242l,
4046         0x12835b0145706fbel, 0x243185be4ee4b28cl, 0x550c7dc3d5ffb4e2l,
4047         0x72be5d74f27b896fl, 0x80deb1fe3b1696b1l, 0x9bdc06a725c71235l,
4048         0xc19bf174cf692694l, 0xe49b69c19ef14ad2l, 0xefbe4786384f25e3l,
4049         0x0fc19dc68b8cd5b5l, 0x240ca1cc77ac9c65l, 0x2de92c6f592b0275l,
4050         0x4a7484aa6ea6e483l, 0x5cb0a9dcbd41fbd4l, 0x76f988da831153b5l,
4051         0x983e5152ee66dfabl, 0xa831c66d2db43210l, 0xb00327c898fb213fl,
4052         0xbf597fc7beef0ee4l, 0xc6e00bf33da88fc2l, 0xd5a79147930aa725l,
4053         0x06ca6351e003826fl, 0x142929670a0e6e70l, 0x27b70a8546d22ffcl,
4054         0x2e1b21385c26c926l, 0x4d2c6dfc5ac42aedl, 0x53380d139d95b3dfl,
4055         0x650a73548baf63del, 0x766a0abb3c77b2a8l, 0x81c2c92e47edaee6l,
4056         0x92722c851482353bl, 0xa2bfe8a14cf10364l, 0xa81a664bbc423001l,
4057         0xc24b8b70d0f89791l, 0xc76c51a30654be30l, 0xd192e819d6ef5218l,
4058         0xd69906245565a910l, 0xf40e35855771202al, 0x106aa07032bbd1b8l,
4059         0x19a4c116b8d2d0c8l, 0x1e376c085141ab53l, 0x2748774cdf8eeb99l,
4060         0x34b0bcb5e19b48a8l, 0x391c0cb3c5c95a63l, 0x4ed8aa4ae3418acbl,
4061         0x5b9cca4f7763e373l, 0x682e6ff3d6b2b8a3l, 0x748f82ee5defb2fcl,
4062         0x78a5636f43172f60l, 0x84c87814a1f0ab72l, 0x8cc702081a6439ecl,
4063         0x90befffa23631e28l, 0xa4506cebde82bde9l, 0xbef9a3f7b2c67915l,
4064         0xc67178f2e372532bl, 0xca273eceea26619cl, 0xd186b8c721c0c207l,
4065         0xeada7dd6cde0eb1el, 0xf57d4f7fee6ed178l, 0x06f067aa72176fbal,
4066         0x0a637dc5a2c898a6l, 0x113f9804bef90dael, 0x1b710b35131c471bl,
4067         0x28db77f523047d84l, 0x32caab7b40c72493l, 0x3c9ebe0a15c9bebcl,
4068         0x431d67c49c100d4cl, 0x4cc5d4becb3e42b6l, 0x597f299cfc657e2al,
4069         0x5fcb6fab3ad6faecl, 0x6c44198c4a475817l
4070       };
4071       const int const_add = vset_sew == Assembler::e32 ? 16 : 32;
4072 
4073       __ align(CodeEntryAlignment);
4074       StubCodeMark mark(_cgen, "StubRoutines", stub_name(vset_sew, multi_block));
4075       address start = __ pc();
4076 
4077       Register buf   = c_rarg0;
4078       Register state = c_rarg1;
4079       Register ofs   = c_rarg2;
4080       Register limit = c_rarg3;
4081       Register consts =  t2; // caller saved
4082       Register state_c = x28; // caller saved
4083       VectorRegister vindex = v2;
4084       VectorRegister vW0 = v4;
4085       VectorRegister vW1 = v6;
4086       VectorRegister vW2 = v8;
4087       VectorRegister vW3 = v10;
4088       VectorRegister vState0 = v12;
4089       VectorRegister vState1 = v14;
4090       VectorRegister vHash0  = v16;
4091       VectorRegister vHash1  = v18;
4092       VectorRegister vTmp0   = v20;
4093       VectorRegister vTmp1   = v22;
4094 
4095       Label multi_block_loop;
4096 
4097       __ enter();
4098 
4099       address constant_table = vset_sew == Assembler::e32 ? (address)round_consts_256 : (address)round_consts_512;
4100       la(consts, ExternalAddress(constant_table));
4101 
4102       // Register use in this function:
4103       //
4104       // VECTORS
4105       //  vW0 - vW3 (512/1024 bits / 4*128/256 bits / 4*4*32/64 bits) hold the message
4106       //             schedule words (Wt). They start with the message block
4107       //             content (W0 to W15), then further words in the message
4108       //             schedule generated via vsha2ms from previous Wt.
4109       //   Initially:
4110       //     vW0 = W[  3:0] = { W3,  W2,  W1,  W0}
4111       //     vW1 = W[  7:4] = { W7,  W6,  W5,  W4}
4112       //     vW2 = W[ 11:8] = {W11, W10,  W9,  W8}
4113       //     vW3 = W[15:12] = {W15, W14, W13, W12}
4114       //
4115       //  vState0 - vState1 hold the working state variables (a, b, ..., h)
4116       //    vState0 = {f[t],e[t],b[t],a[t]}
4117       //    vState1 = {h[t],g[t],d[t],c[t]}
4118       //   Initially:
4119       //    vState0 = {H5i-1, H4i-1, H1i-1, H0i-1}
4120       //    vState1 = {H7i-1, H6i-1, H3i-1, H2i-1}
4121       //
4122       //  v0 = masks for vrgather/vmerge. Single value during the 16 rounds.
4123       //
4124       //  vTmp0 = temporary, Wt+Kt
4125       //  vTmp1 = temporary, Kt
4126       //
4127       //  vHash0/vHash1 = hold the initial values of the hash, byte-swapped.
4128       //
4129       // During most of the function the vector state is configured so that each
4130       // vector is interpreted as containing four 32/64 bits (e32/e64) elements (128/256 bits).
4131 
4132       // vsha2ch/vsha2cl uses EGW of 4*SEW.
4133       // SHA256 SEW = e32, EGW = 128-bits
4134       // SHA512 SEW = e64, EGW = 256-bits
4135       //
4136       // VLEN is required to be at least 128.
4137       // For the case of VLEN=128 and SHA512 we need LMUL=2 to work with 4*e64 (EGW = 256)
4138       //
4139       // m1/m2: LMUL=1 (m2, i.e. LMUL=2, for SHA-512 when VLEN=128)
4140       // ta: tail agnostic (don't care about those lanes)
4141       // ma: mask agnostic (don't care about those lanes)
4142       // x0 is not written; we know the number of vector elements.
4143 
4144       if (vset_sew == Assembler::e64 && MaxVectorSize == 16) { // SHA512 and VLEN = 128
4145         __ vsetivli(x0, 4, vset_sew, Assembler::m2, Assembler::ma, Assembler::ta);
4146       } else {
4147         __ vsetivli(x0, 4, vset_sew, Assembler::m1, Assembler::ma, Assembler::ta);
4148       }
4149 
4150       int64_t indexes = vset_sew == Assembler::e32 ? 0x00041014ul : 0x00082028ul;
4151       __ li(t0, indexes);
4152       __ vmv_v_x(vindex, t0);
4153 
4154       // Step over a,b so that state_c points to c.
4155       // const_add is the size of 4 state variables, so const_add/2 steps over the 2 variables a,b.
4156       __ addi(state_c, state, const_add/2);
4157 
4158       // Use index-load to get {f,e,b,a},{h,g,d,c}
4159       __ vluxei8_v(vState0, state, vindex);
4160       __ vluxei8_v(vState1, state_c, vindex);
4161 
4162       __ bind(multi_block_loop);
4163 
4164       // Capture the initial H values in vHash0 and vHash1 to allow for computing
4165       // the resulting H', since H' = H+{a',b',c',...,h'}.
4166       __ vmv_v_v(vHash0, vState0);
4167       __ vmv_v_v(vHash1, vState1);
4168 
4169       // Load the 512/1024-bits of the message block in vW0-vW3 and perform
4170       // an endian swap on each 4/8 bytes element.
4171       //
4172       // If Zvkb is not implemented one can use vrgather
4173       // with an index sequence to byte-swap.
4174       //  sequence = [3 2 1 0   7 6 5 4  11 10 9 8   15 14 13 12]
4175       //   <https://oeis.org/A004444> gives us "N ^ 3" as a nice formula to generate
4176       //  this sequence. 'vid' gives us the N.
4177       __ vleXX_v(vset_sew, vW0, buf);
4178       __ vrev8_v(vW0, vW0);
4179       __ addi(buf, buf, const_add);
4180       __ vleXX_v(vset_sew, vW1, buf);
4181       __ vrev8_v(vW1, vW1);
4182       __ addi(buf, buf, const_add);
4183       __ vleXX_v(vset_sew, vW2, buf);
4184       __ vrev8_v(vW2, vW2);
4185       __ addi(buf, buf, const_add);
4186       __ vleXX_v(vset_sew, vW3, buf);
4187       __ vrev8_v(vW3, vW3);
4188       __ addi(buf, buf, const_add);
4189 
4190       // Set v0 up for the vmerge that replaces the first word (idx==0)
4191       __ vid_v(v0);
4192       __ vmseq_vi(v0, v0, 0x0);  // v0.mask[i] = (i == 0 ? 1 : 0)
4193 
4194       VectorRegister rotation_regs[] = {vW0, vW1, vW2, vW3};
4195       int rot_pos = 0;
4196       // Quad-round #0 (+0, vW0->vW1->vW2->vW3) ... #11/#15 (+3, vW3->vW0->vW1->vW2) for SHA-256/512
4197       const int qr_end = vset_sew == Assembler::e32 ? 12 : 16;
4198       for (int i = 0; i < qr_end; i++) {
4199         sha2_quad_round(vset_sew,
4200                    rotation_regs[(rot_pos + 0) & 0x3],
4201                    rotation_regs[(rot_pos + 1) & 0x3],
4202                    rotation_regs[(rot_pos + 2) & 0x3],
4203                    rotation_regs[(rot_pos + 3) & 0x3],
4204                    consts,
4205                    vTmp1, vTmp0, vState0, vState1);
4206         ++rot_pos;
4207       }
4208       // Quad-round #12/#16 (+0, vW0->vW1->vW2->vW3) ... #15/#19 (+3, vW3->vW0->vW1->vW2)
4209       // Note that we stop generating new message schedule words (Wt, vW0-vW3)
4210       // as we already generated all the words we end up consuming (i.e., W[63:60] / W[79:76]).
4211       const int qr_c_end = qr_end + 4;
4212       for (int i = qr_end; i < qr_c_end; i++) {
4213         sha2_quad_round(vset_sew,
4214                    rotation_regs[(rot_pos + 0) & 0x3],
4215                    rotation_regs[(rot_pos + 1) & 0x3],
4216                    rotation_regs[(rot_pos + 2) & 0x3],
4217                    rotation_regs[(rot_pos + 3) & 0x3],
4218                    consts,
4219                    vTmp1, vTmp0, vState0, vState1, false, i < (qr_c_end-1));
4220         ++rot_pos;
4221       }
4222 
4223       //--------------------------------------------------------------------------------
4224       // Compute the updated hash value H'
4225       //   H' = H + {h',g',...,b',a'}
4226       //      = {h,g,...,b,a} + {h',g',...,b',a'}
4227       //      = {h+h',g+g',...,b+b',a+a'}
4228 
4229       // H' = H+{a',b',c',...,h'}
4230       __ vadd_vv(vState0, vHash0, vState0);
4231       __ vadd_vv(vState1, vHash1, vState1);
4232 
4233       if (multi_block) {
4234         int total_adds = vset_sew == Assembler::e32 ? 240 : 608;
4235         __ addi(consts, consts, -total_adds);
4236         __ add(ofs, ofs, vset_sew == Assembler::e32 ? 64 : 128);
4237         __ ble(ofs, limit, multi_block_loop);
4238         __ mv(c_rarg0, ofs); // return ofs
4239       }
4240 
4241       // Store H[0..8] = {a,b,c,d,e,f,g,h} from
4242       //  vState0 = {f,e,b,a}
4243       //  vState1 = {h,g,d,c}
4244       __ vsuxei8_v(vState0, state,   vindex);
4245       __ vsuxei8_v(vState1, state_c, vindex);
4246 
4247       __ leave();
4248       __ ret();
4249 
4250       return start;
4251     }
4252   };
4253 
4254 #undef __
4255 #define __ _masm->
4256 
4257   // Set of L registers that correspond to a contiguous memory area.
4258   // Each 64-bit register typically corresponds to 2 32-bit integers.
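       // For the MD5 code below, RegCache<8> (BufRegCache) caches the 16 ints of one
       // 64-byte block in 8 64-bit registers: gen_load(i, base) fills register i from
       // bytes [8*i, 8*i + 8) of base, and add_u32(dest, k) adds the k'th little-endian
       // 32-bit word of that block to dest. (Descriptive note.)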
4259   template <uint L>
4260   class RegCache {
4261   private:
4262     MacroAssembler *_masm;
4263     Register _regs[L];
4264 
4265   public:
4266     RegCache(MacroAssembler *masm, RegSet rs): _masm(masm) {
4267       assert(rs.size() == L, "%u registers are used to cache %u 4-byte data", rs.size(), 2 * L);
4268       auto it = rs.begin();
4269       for (auto &r: _regs) {
4270         r = *it;
4271         ++it;
4272       }
4273     }
4274 
4275     // generate load for the i'th register
4276     void gen_load(uint i, Register base) {
4277       assert(i < L, "invalid i: %u", i);
4278       __ ld(_regs[i], Address(base, 8 * i));
4279     }
4280 
4281     // add i'th 32-bit integer to dest
4282     void add_u32(const Register dest, uint i, const Register rtmp = t0) {
4283       assert(i < 2 * L, "invalid i: %u", i);
4284 
4285       if (is_even(i)) {
4286         // Use the bottom 32 bits. No need to mask off the top 32 bits
4287         // as addw will do the right thing.
4288         __ addw(dest, dest, _regs[i / 2]);
4289       } else {
4290         // Use the top 32 bits by right-shifting them.
4291         __ srli(rtmp, _regs[i / 2], 32);
4292         __ addw(dest, dest, rtmp);
4293       }
4294     }
4295   };
4296 
4297   typedef RegCache<8> BufRegCache;
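  // For BufRegCache (L == 8), the cache spans the sixteen little-endian 32-bit
  // words of one 64-byte block: add_u32(dest, i) reads word i from _regs[i / 2],
  // taking the low half when i is even and the high half (shifted down by 32
  // bits into rtmp) when i is odd.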
4298 
4299   // a += value + x + ac;
4300   // a = Integer.rotateLeft(a, s) + b;
4301   void m5_FF_GG_HH_II_epilogue(BufRegCache& reg_cache,
4302                                Register a, Register b, Register c, Register d,
4303                                int k, int s, int t,
4304                                Register value) {
4305     // a += ac
4306     __ addw(a, a, t, t1);
4307 
4308     // a += x;
4309     reg_cache.add_u32(a, k);
4310     // a += value;
4311     __ addw(a, a, value);
4312 
4313     // a = Integer.rotateLeft(a, s) + b;
4314     __ rolw_imm(a, a, s);
4315     __ addw(a, a, b);
4316   }
4317 
4318   // a += ((b & c) | ((~b) & d)) + x + ac;
4319   // a = Integer.rotateLeft(a, s) + b;
4320   void md5_FF(BufRegCache& reg_cache,
4321               Register a, Register b, Register c, Register d,
4322               int k, int s, int t,
4323               Register rtmp1, Register rtmp2) {
4324     // rtmp1 = b & c
4325     __ andr(rtmp1, b, c);
4326 
4327     // rtmp2 = (~b) & d
4328     __ andn(rtmp2, d, b);
4329 
4330     // rtmp1 = (b & c) | ((~b) & d)
4331     __ orr(rtmp1, rtmp1, rtmp2);
4332 
4333     m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1);
4334   }
4335 
4336   // a += ((b & d) | (c & (~d))) + x + ac;
4337   // a = Integer.rotateLeft(a, s) + b;
4338   void md5_GG(BufRegCache& reg_cache,
4339               Register a, Register b, Register c, Register d,
4340               int k, int s, int t,
4341               Register rtmp1, Register rtmp2) {
4342     // rtmp1 = b & d
4343     __ andr(rtmp1, b, d);
4344 
4345     // rtmp2 = c & (~d)
4346     __ andn(rtmp2, c, d);
4347 
4348     // rtmp1 = (b & d) | (c & (~d))
4349     __ orr(rtmp1, rtmp1, rtmp2);
4350 
4351     m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1);
4352   }
4353 
4354   // a += ((b ^ c) ^ d) + x + ac;
4355   // a = Integer.rotateLeft(a, s) + b;
4356   void md5_HH(BufRegCache& reg_cache,
4357               Register a, Register b, Register c, Register d,
4358               int k, int s, int t,
4359               Register rtmp1, Register rtmp2) {
4360     // rtmp1 = (b ^ c) ^ d
4361     __ xorr(rtmp2, b, c);
4362     __ xorr(rtmp1, rtmp2, d);
4363 
4364     m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1);
4365   }
4366 
4367   // a += (c ^ (b | (~d))) + x + ac;
4368   // a = Integer.rotateLeft(a, s) + b;
4369   void md5_II(BufRegCache& reg_cache,
4370               Register a, Register b, Register c, Register d,
4371               int k, int s, int t,
4372               Register rtmp1, Register rtmp2) {
4373     // rtmp1 = c ^ (b | (~d))
4374     __ orn(rtmp2, b, d);
4375     __ xorr(rtmp1, c, rtmp2);
4376 
4377     m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1);
4378   }
4379 
4380   // Arguments:
4381   //
4382   // Inputs:
4383   //   c_rarg0   - byte[]  source+offset
  //   c_rarg1   - int[]   MD5.state
4385   //   c_rarg2   - int     offset  (multi_block == True)
4386   //   c_rarg3   - int     limit   (multi_block == True)
4387   //
4388   // Registers:
4389   //    x0   zero  (zero)
4390   //    x1     ra  (return address)
4391   //    x2     sp  (stack pointer)
4392   //    x3     gp  (global pointer)
4393   //    x4     tp  (thread pointer)
4394   //    x5     t0  (tmp register)
4395   //    x6     t1  (tmp register)
4396   //    x7     t2  state0
  //    x8  fp/s0  (frame pointer)
4398   //    x9     s1
4399   //   x10     a0  rtmp1 / c_rarg0
4400   //   x11     a1  rtmp2 / c_rarg1
4401   //   x12     a2  a     / c_rarg2
4402   //   x13     a3  b     / c_rarg3
4403   //   x14     a4  c
4404   //   x15     a5  d
4405   //   x16     a6  buf
4406   //   x17     a7  state
4407   //   x18     s2  ofs     [saved-reg]  (multi_block == True)
4408   //   x19     s3  limit   [saved-reg]  (multi_block == True)
4409   //   x20     s4  state1  [saved-reg]
4410   //   x21     s5  state2  [saved-reg]
4411   //   x22     s6  state3  [saved-reg]
4412   //   x23     s7
4413   //   x24     s8  buf0    [saved-reg]
4414   //   x25     s9  buf1    [saved-reg]
4415   //   x26    s10  buf2    [saved-reg]
4416   //   x27    s11  buf3    [saved-reg]
4417   //   x28     t3  buf4
4418   //   x29     t4  buf5
4419   //   x30     t5  buf6
4420   //   x31     t6  buf7
4421   address generate_md5_implCompress(bool multi_block, const char *name) {
4422     __ align(CodeEntryAlignment);
4423     StubCodeMark mark(this, "StubRoutines", name);
4424     address start = __ pc();
4425 
4426     // rotation constants
4427     const int S11 = 7;
4428     const int S12 = 12;
4429     const int S13 = 17;
4430     const int S14 = 22;
4431     const int S21 = 5;
4432     const int S22 = 9;
4433     const int S23 = 14;
4434     const int S24 = 20;
4435     const int S31 = 4;
4436     const int S32 = 11;
4437     const int S33 = 16;
4438     const int S34 = 23;
4439     const int S41 = 6;
4440     const int S42 = 10;
4441     const int S43 = 15;
4442     const int S44 = 21;
4443 
4444     const int64_t mask32 = 0xffffffff;
4445 
4446     Register buf_arg   = c_rarg0; // a0
4447     Register state_arg = c_rarg1; // a1
4448     Register ofs_arg   = c_rarg2; // a2
4449     Register limit_arg = c_rarg3; // a3
4450 
4451     // we'll copy the args to these registers to free up a0-a3
4452     // to use for other values manipulated by instructions
4453     // that can be compressed
4454     Register buf       = x16; // a6
4455     Register state     = x17; // a7
4456     Register ofs       = x18; // s2
4457     Register limit     = x19; // s3
4458 
    // using x12->x15 to allow compressed instructions
4460     Register a         = x12; // a2
4461     Register b         = x13; // a3
4462     Register c         = x14; // a4
4463     Register d         = x15; // a5
4464 
4465     Register state0    =  x7; // t2
4466     Register state1    = x20; // s4
4467     Register state2    = x21; // s5
4468     Register state3    = x22; // s6
4469 
4470     // using x10->x11 to allow compressed instructions
4471     Register rtmp1     = x10; // a0
4472     Register rtmp2     = x11; // a1
4473 
4474     RegSet reg_cache_saved_regs = RegSet::of(x24, x25, x26, x27); // s8, s9, s10, s11
4475     RegSet reg_cache_regs;
4476     reg_cache_regs += reg_cache_saved_regs;
4477     reg_cache_regs += RegSet::of(x28, x29, x30, x31); // t3, t4, t5, t6
4478     BufRegCache reg_cache(_masm, reg_cache_regs);
4479 
4480     RegSet saved_regs;
4481     if (multi_block) {
4482       saved_regs += RegSet::of(ofs, limit);
4483     }
4484     saved_regs += RegSet::of(state1, state2, state3);
4485     saved_regs += reg_cache_saved_regs;
4486 
4487     __ push_reg(saved_regs, sp);
4488 
4489     __ mv(buf, buf_arg);
4490     __ mv(state, state_arg);
4491     if (multi_block) {
4492       __ mv(ofs, ofs_arg);
4493       __ mv(limit, limit_arg);
4494     }
4495 
    // To minimize the number of memory operations, read the four 4-byte
    // state values in pairs, each pair with a single ld, and split every
    // pair into two registers.
    //
    // Since the core MD5 algorithm works on 32-bit words, the code below
    // never looks at the upper 32 bits of state[x]. Based on this
    // observation, we simply ignore the upper 32 bits of state0/state2
    // rather than spend extra instructions zeroing them explicitly.
4506     __ ld(state0, Address(state));
4507     __ srli(state1, state0, 32);
4508     __ ld(state2, Address(state, 8));
4509     __ srli(state3, state2, 32);
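    // RISC-V is little-endian, so the first ld yields (state[1] << 32) | state[0];
    // the srli leaves state[1] in 'state1', while the stale upper bits left in
    // 'state0' are simply ignored, as described above.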
4510 
4511     Label md5_loop;
4512     __ BIND(md5_loop);
4513 
4514     __ mv(a, state0);
4515     __ mv(b, state1);
4516     __ mv(c, state2);
4517     __ mv(d, state3);
4518 
4519     // Round 1
4520     reg_cache.gen_load(0, buf);
4521     md5_FF(reg_cache, a, b, c, d,  0, S11, 0xd76aa478, rtmp1, rtmp2);
4522     md5_FF(reg_cache, d, a, b, c,  1, S12, 0xe8c7b756, rtmp1, rtmp2);
4523     reg_cache.gen_load(1, buf);
4524     md5_FF(reg_cache, c, d, a, b,  2, S13, 0x242070db, rtmp1, rtmp2);
4525     md5_FF(reg_cache, b, c, d, a,  3, S14, 0xc1bdceee, rtmp1, rtmp2);
4526     reg_cache.gen_load(2, buf);
4527     md5_FF(reg_cache, a, b, c, d,  4, S11, 0xf57c0faf, rtmp1, rtmp2);
4528     md5_FF(reg_cache, d, a, b, c,  5, S12, 0x4787c62a, rtmp1, rtmp2);
4529     reg_cache.gen_load(3, buf);
4530     md5_FF(reg_cache, c, d, a, b,  6, S13, 0xa8304613, rtmp1, rtmp2);
4531     md5_FF(reg_cache, b, c, d, a,  7, S14, 0xfd469501, rtmp1, rtmp2);
4532     reg_cache.gen_load(4, buf);
4533     md5_FF(reg_cache, a, b, c, d,  8, S11, 0x698098d8, rtmp1, rtmp2);
4534     md5_FF(reg_cache, d, a, b, c,  9, S12, 0x8b44f7af, rtmp1, rtmp2);
4535     reg_cache.gen_load(5, buf);
4536     md5_FF(reg_cache, c, d, a, b, 10, S13, 0xffff5bb1, rtmp1, rtmp2);
4537     md5_FF(reg_cache, b, c, d, a, 11, S14, 0x895cd7be, rtmp1, rtmp2);
4538     reg_cache.gen_load(6, buf);
4539     md5_FF(reg_cache, a, b, c, d, 12, S11, 0x6b901122, rtmp1, rtmp2);
4540     md5_FF(reg_cache, d, a, b, c, 13, S12, 0xfd987193, rtmp1, rtmp2);
4541     reg_cache.gen_load(7, buf);
4542     md5_FF(reg_cache, c, d, a, b, 14, S13, 0xa679438e, rtmp1, rtmp2);
4543     md5_FF(reg_cache, b, c, d, a, 15, S14, 0x49b40821, rtmp1, rtmp2);
4544 
4545     // Round 2
4546     md5_GG(reg_cache, a, b, c, d,  1, S21, 0xf61e2562, rtmp1, rtmp2);
4547     md5_GG(reg_cache, d, a, b, c,  6, S22, 0xc040b340, rtmp1, rtmp2);
4548     md5_GG(reg_cache, c, d, a, b, 11, S23, 0x265e5a51, rtmp1, rtmp2);
4549     md5_GG(reg_cache, b, c, d, a,  0, S24, 0xe9b6c7aa, rtmp1, rtmp2);
4550     md5_GG(reg_cache, a, b, c, d,  5, S21, 0xd62f105d, rtmp1, rtmp2);
4551     md5_GG(reg_cache, d, a, b, c, 10, S22, 0x02441453, rtmp1, rtmp2);
4552     md5_GG(reg_cache, c, d, a, b, 15, S23, 0xd8a1e681, rtmp1, rtmp2);
4553     md5_GG(reg_cache, b, c, d, a,  4, S24, 0xe7d3fbc8, rtmp1, rtmp2);
4554     md5_GG(reg_cache, a, b, c, d,  9, S21, 0x21e1cde6, rtmp1, rtmp2);
4555     md5_GG(reg_cache, d, a, b, c, 14, S22, 0xc33707d6, rtmp1, rtmp2);
4556     md5_GG(reg_cache, c, d, a, b,  3, S23, 0xf4d50d87, rtmp1, rtmp2);
4557     md5_GG(reg_cache, b, c, d, a,  8, S24, 0x455a14ed, rtmp1, rtmp2);
4558     md5_GG(reg_cache, a, b, c, d, 13, S21, 0xa9e3e905, rtmp1, rtmp2);
4559     md5_GG(reg_cache, d, a, b, c,  2, S22, 0xfcefa3f8, rtmp1, rtmp2);
4560     md5_GG(reg_cache, c, d, a, b,  7, S23, 0x676f02d9, rtmp1, rtmp2);
4561     md5_GG(reg_cache, b, c, d, a, 12, S24, 0x8d2a4c8a, rtmp1, rtmp2);
4562 
4563     // Round 3
4564     md5_HH(reg_cache, a, b, c, d,  5, S31, 0xfffa3942, rtmp1, rtmp2);
4565     md5_HH(reg_cache, d, a, b, c,  8, S32, 0x8771f681, rtmp1, rtmp2);
4566     md5_HH(reg_cache, c, d, a, b, 11, S33, 0x6d9d6122, rtmp1, rtmp2);
4567     md5_HH(reg_cache, b, c, d, a, 14, S34, 0xfde5380c, rtmp1, rtmp2);
4568     md5_HH(reg_cache, a, b, c, d,  1, S31, 0xa4beea44, rtmp1, rtmp2);
4569     md5_HH(reg_cache, d, a, b, c,  4, S32, 0x4bdecfa9, rtmp1, rtmp2);
4570     md5_HH(reg_cache, c, d, a, b,  7, S33, 0xf6bb4b60, rtmp1, rtmp2);
4571     md5_HH(reg_cache, b, c, d, a, 10, S34, 0xbebfbc70, rtmp1, rtmp2);
4572     md5_HH(reg_cache, a, b, c, d, 13, S31, 0x289b7ec6, rtmp1, rtmp2);
4573     md5_HH(reg_cache, d, a, b, c,  0, S32, 0xeaa127fa, rtmp1, rtmp2);
4574     md5_HH(reg_cache, c, d, a, b,  3, S33, 0xd4ef3085, rtmp1, rtmp2);
4575     md5_HH(reg_cache, b, c, d, a,  6, S34, 0x04881d05, rtmp1, rtmp2);
4576     md5_HH(reg_cache, a, b, c, d,  9, S31, 0xd9d4d039, rtmp1, rtmp2);
4577     md5_HH(reg_cache, d, a, b, c, 12, S32, 0xe6db99e5, rtmp1, rtmp2);
4578     md5_HH(reg_cache, c, d, a, b, 15, S33, 0x1fa27cf8, rtmp1, rtmp2);
4579     md5_HH(reg_cache, b, c, d, a,  2, S34, 0xc4ac5665, rtmp1, rtmp2);
4580 
4581     // Round 4
4582     md5_II(reg_cache, a, b, c, d,  0, S41, 0xf4292244, rtmp1, rtmp2);
4583     md5_II(reg_cache, d, a, b, c,  7, S42, 0x432aff97, rtmp1, rtmp2);
4584     md5_II(reg_cache, c, d, a, b, 14, S43, 0xab9423a7, rtmp1, rtmp2);
4585     md5_II(reg_cache, b, c, d, a,  5, S44, 0xfc93a039, rtmp1, rtmp2);
4586     md5_II(reg_cache, a, b, c, d, 12, S41, 0x655b59c3, rtmp1, rtmp2);
4587     md5_II(reg_cache, d, a, b, c,  3, S42, 0x8f0ccc92, rtmp1, rtmp2);
4588     md5_II(reg_cache, c, d, a, b, 10, S43, 0xffeff47d, rtmp1, rtmp2);
4589     md5_II(reg_cache, b, c, d, a,  1, S44, 0x85845dd1, rtmp1, rtmp2);
4590     md5_II(reg_cache, a, b, c, d,  8, S41, 0x6fa87e4f, rtmp1, rtmp2);
4591     md5_II(reg_cache, d, a, b, c, 15, S42, 0xfe2ce6e0, rtmp1, rtmp2);
4592     md5_II(reg_cache, c, d, a, b,  6, S43, 0xa3014314, rtmp1, rtmp2);
4593     md5_II(reg_cache, b, c, d, a, 13, S44, 0x4e0811a1, rtmp1, rtmp2);
4594     md5_II(reg_cache, a, b, c, d,  4, S41, 0xf7537e82, rtmp1, rtmp2);
4595     md5_II(reg_cache, d, a, b, c, 11, S42, 0xbd3af235, rtmp1, rtmp2);
4596     md5_II(reg_cache, c, d, a, b,  2, S43, 0x2ad7d2bb, rtmp1, rtmp2);
4597     md5_II(reg_cache, b, c, d, a,  9, S44, 0xeb86d391, rtmp1, rtmp2);
4598 
4599     __ addw(state0, state0, a);
4600     __ addw(state1, state1, b);
4601     __ addw(state2, state2, c);
4602     __ addw(state3, state3, d);
4603 
4604     if (multi_block) {
4605       __ addi(buf, buf, 64);
4606       __ addi(ofs, ofs, 64);
      // if (ofs <= limit) goto md5_loop
4608       __ bge(limit, ofs, md5_loop);
4609       __ mv(c_rarg0, ofs); // return ofs
4610     }
4611 
4612     // to minimize the number of memory operations:
4613     // write back the 4 state 4-byte values in pairs, with a single sd
4614     __ mv(t0, mask32);
4615     __ andr(state0, state0, t0);
4616     __ slli(state1, state1, 32);
4617     __ orr(state0, state0, state1);
4618     __ sd(state0, Address(state));
4619     __ andr(state2, state2, t0);
4620     __ slli(state3, state3, 32);
4621     __ orr(state2, state2, state3);
4622     __ sd(state2, Address(state, 8));
4623 
4624     __ pop_reg(saved_regs, sp);
4625     __ ret();
4626 
4627     return (address) start;
4628   }
4629 
4630   /**
4631    * Perform the quarter round calculations on values contained within four vector registers.
4632    *
4633    * @param aVec the SIMD register containing only the "a" values
4634    * @param bVec the SIMD register containing only the "b" values
4635    * @param cVec the SIMD register containing only the "c" values
4636    * @param dVec the SIMD register containing only the "d" values
   * @param tmp_vr temporary vector register that holds intermediate values.
4638    */
4639   void chacha20_quarter_round(VectorRegister aVec, VectorRegister bVec,
4640                           VectorRegister cVec, VectorRegister dVec, VectorRegister tmp_vr) {
4641     // a += b, d ^= a, d <<<= 16
4642     __ vadd_vv(aVec, aVec, bVec);
4643     __ vxor_vv(dVec, dVec, aVec);
4644     __ vrole32_vi(dVec, 16, tmp_vr);
4645 
4646     // c += d, b ^= c, b <<<= 12
4647     __ vadd_vv(cVec, cVec, dVec);
4648     __ vxor_vv(bVec, bVec, cVec);
4649     __ vrole32_vi(bVec, 12, tmp_vr);
4650 
4651     // a += b, d ^= a, d <<<= 8
4652     __ vadd_vv(aVec, aVec, bVec);
4653     __ vxor_vv(dVec, dVec, aVec);
4654     __ vrole32_vi(dVec, 8, tmp_vr);
4655 
4656     // c += d, b ^= c, b <<<= 7
4657     __ vadd_vv(cVec, cVec, dVec);
4658     __ vxor_vv(bVec, bVec, cVec);
4659     __ vrole32_vi(bVec, 7, tmp_vr);
4660   }
4661 
4662   /**
4663    * int com.sun.crypto.provider.ChaCha20Cipher.implChaCha20Block(int[] initState, byte[] result)
4664    *
4665    *  Input arguments:
4666    *  c_rarg0   - state, the starting state
4667    *  c_rarg1   - key_stream, the array that will hold the result of the ChaCha20 block function
4668    *
4669    *  Implementation Note:
4670    *   Parallelization is achieved by loading individual state elements into vectors for N blocks.
   *   N depends on the vector register length.
4672    */
4673   address generate_chacha20Block() {
4674     Label L_Rounds;
4675 
4676     __ align(CodeEntryAlignment);
4677     StubCodeMark mark(this, "StubRoutines", "chacha20Block");
4678     address start = __ pc();
4679     __ enter();
4680 
4681     const int states_len = 16;
4682     const int step = 4;
4683     const Register state = c_rarg0;
4684     const Register key_stream = c_rarg1;
4685     const Register tmp_addr = t0;
4686     const Register length = t1;
4687 
4688     // Organize vector registers in an array that facilitates
4689     // putting repetitive opcodes into loop structures below.
4690     const VectorRegister work_vrs[16] = {
4691       v0, v1, v2,  v3,  v4,  v5,  v6,  v7,
4692       v8, v9, v10, v11, v12, v13, v14, v15
4693     };
4694     const VectorRegister tmp_vr = v16;
4695     const VectorRegister counter_vr = v17;
4696 
4697     {
      // Put 16 here, as com.sun.crypto.provider.ChaCha20Cipher.KS_MAX_LEN is
      // 1024 at the Java level.
4700       __ vsetivli(length, 16, Assembler::e32, Assembler::m1);
4701     }
4702 
4703     // Load from source state.
4704     // Every element in source state is duplicated to all elements in the corresponding vector.
4705     __ mv(tmp_addr, state);
4706     for (int i = 0; i < states_len; i += 1) {
4707       __ vlse32_v(work_vrs[i], tmp_addr, zr);
4708       __ addi(tmp_addr, tmp_addr, step);
4709     }
4710     // Adjust counter for every individual block.
4711     __ vid_v(counter_vr);
4712     __ vadd_vv(work_vrs[12], work_vrs[12], counter_vr);
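    // vid_v fills counter_vr with the lane indices {0, 1, 2, ...}, so lane i of
    // work_vrs[12] becomes counter + i and each lane produces the keystream for
    // its own block.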
4713 
4714     // Perform 10 iterations of the 8 quarter round set
4715     {
4716       const Register loop = t2; // share t2 with other non-overlapping usages.
4717       __ mv(loop, 10);
4718       __ BIND(L_Rounds);
4719 
4720       chacha20_quarter_round(work_vrs[0], work_vrs[4], work_vrs[8],  work_vrs[12], tmp_vr);
4721       chacha20_quarter_round(work_vrs[1], work_vrs[5], work_vrs[9],  work_vrs[13], tmp_vr);
4722       chacha20_quarter_round(work_vrs[2], work_vrs[6], work_vrs[10], work_vrs[14], tmp_vr);
4723       chacha20_quarter_round(work_vrs[3], work_vrs[7], work_vrs[11], work_vrs[15], tmp_vr);
4724 
4725       chacha20_quarter_round(work_vrs[0], work_vrs[5], work_vrs[10], work_vrs[15], tmp_vr);
4726       chacha20_quarter_round(work_vrs[1], work_vrs[6], work_vrs[11], work_vrs[12], tmp_vr);
4727       chacha20_quarter_round(work_vrs[2], work_vrs[7], work_vrs[8],  work_vrs[13], tmp_vr);
4728       chacha20_quarter_round(work_vrs[3], work_vrs[4], work_vrs[9],  work_vrs[14], tmp_vr);
4729 
4730       __ sub(loop, loop, 1);
4731       __ bnez(loop, L_Rounds);
4732     }
4733 
    // Add the original state into the final working state.
    // We do this by duplicating every element of the source state array into the
    // corresponding vector, then adding it to the post-loop working state.
4737     __ mv(tmp_addr, state);
4738     for (int i = 0; i < states_len; i += 1) {
4739       __ vlse32_v(tmp_vr, tmp_addr, zr);
4740       __ addi(tmp_addr, tmp_addr, step);
4741       __ vadd_vv(work_vrs[i], work_vrs[i], tmp_vr);
4742     }
4743     // Add the counter overlay onto work_vrs[12] at the end.
4744     __ vadd_vv(work_vrs[12], work_vrs[12], counter_vr);
4745 
4746     // Store result to key stream.
4747     {
4748       const Register stride = t2; // share t2 with other non-overlapping usages.
4749       // Every block occupies 64 bytes, so we use 64 as stride of the vector store.
4750       __ mv(stride, 64);
4751       for (int i = 0; i < states_len; i += 1) {
4752         __ vsse32_v(work_vrs[i], key_stream, stride);
4753         __ addi(key_stream, key_stream, step);
4754       }
4755     }
4756 
4757     // Return length of output key_stream
4758     __ slli(c_rarg0, length, 6);
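    // Each vector lane produced one 64-byte block, so the number of key stream
    // bytes written above is vl * 64, i.e. length << 6.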
4759 
4760     __ leave();
4761     __ ret();
4762 
4763     return (address) start;
4764   }
4765 
4766 
4767   // ------------------------ SHA-1 intrinsic ------------------------
4768 
4769   // K't =
4770   //    5a827999, 0  <= t <= 19
4771   //    6ed9eba1, 20 <= t <= 39
4772   //    8f1bbcdc, 40 <= t <= 59
4773   //    ca62c1d6, 60 <= t <= 79
4774   void sha1_prepare_k(Register cur_k, int round) {
4775     assert(round >= 0 && round < 80, "must be");
4776 
4777     static const int64_t ks[] = {0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6};
4778     if ((round % 20) == 0) {
4779       __ mv(cur_k, ks[round/20]);
4780     }
4781   }
4782 
4783   // W't =
4784   //    M't,                                      0 <=  t <= 15
4785   //    ROTL'1(W't-3 ^ W't-8 ^ W't-14 ^ W't-16),  16 <= t <= 79
4786   void sha1_prepare_w(Register cur_w, Register ws[], Register buf, int round) {
4787     assert(round >= 0 && round < 80, "must be");
4788 
4789     if (round < 16) {
4790       // in the first 16 rounds, in ws[], every register contains 2 W't, e.g.
4791       //   in ws[0], high part contains W't-0, low part contains W't-1,
4792       //   in ws[1], high part contains W't-2, low part contains W't-3,
4793       //   ...
4794       //   in ws[7], high part contains W't-14, low part contains W't-15.
4795 
4796       if ((round % 2) == 0) {
4797         __ ld(ws[round/2], Address(buf, (round/2) * 8));
4798         // reverse bytes, as SHA-1 is defined in big-endian.
4799         __ revb(ws[round/2], ws[round/2]);
4800         __ srli(cur_w, ws[round/2], 32);
4801       } else {
4802         __ mv(cur_w, ws[round/2]);
4803       }
4804 
4805       return;
4806     }
4807 
4808     if ((round % 2) == 0) {
4809       int idx = 16;
4810       // W't = ROTL'1(W't-3 ^ W't-8 ^ W't-14 ^ W't-16),  16 <= t <= 79
4811       __ srli(t1, ws[(idx-8)/2], 32);
4812       __ xorr(t0, ws[(idx-3)/2], t1);
4813 
4814       __ srli(t1, ws[(idx-14)/2], 32);
4815       __ srli(cur_w, ws[(idx-16)/2], 32);
4816       __ xorr(cur_w, cur_w, t1);
4817 
4818       __ xorr(cur_w, cur_w, t0);
4819       __ rolw_imm(cur_w, cur_w, 1, t0);
4820 
4821       // copy the cur_w value to ws[8].
4822       // now, valid w't values are at:
4823       //  w0:       ws[0]'s lower 32 bits
4824       //  w1 ~ w14: ws[1] ~ ws[7]
4825       //  w15:      ws[8]'s higher 32 bits
4826       __ slli(ws[idx/2], cur_w, 32);
4827 
4828       return;
4829     }
4830 
4831     int idx = 17;
4832     // W't = ROTL'1(W't-3 ^ W't-8 ^ W't-14 ^ W't-16),  16 <= t <= 79
4833     __ srli(t1, ws[(idx-3)/2], 32);
4834     __ xorr(t0, t1, ws[(idx-8)/2]);
4835 
4836     __ xorr(cur_w, ws[(idx-16)/2], ws[(idx-14)/2]);
4837 
4838     __ xorr(cur_w, cur_w, t0);
4839     __ rolw_imm(cur_w, cur_w, 1, t0);
4840 
4841     // copy the cur_w value to ws[8]
4842     __ zero_extend(cur_w, cur_w, 32);
4843     __ orr(ws[idx/2], ws[idx/2], cur_w);
4844 
4845     // shift the w't registers, so they start from ws[0] again.
4846     // now, valid w't values are at:
4847     //  w0 ~ w15: ws[0] ~ ws[7]
4848     Register ws_0 = ws[0];
4849     for (int i = 0; i < 16/2; i++) {
4850       ws[i] = ws[i+1];
4851     }
4852     ws[8] = ws_0;
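    // After this rotation the physical register that held ws[0] becomes the
    // spare slot ws[8], every other entry shifts down by one, and the next
    // pair of rounds again sees the sixteen most recent W't values in
    // ws[0] ~ ws[7].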
4853   }
4854 
4855   // f't(x, y, z) =
4856   //    Ch(x, y, z)     = (x & y) ^ (~x & z)            , 0  <= t <= 19
4857   //    Parity(x, y, z) = x ^ y ^ z                     , 20 <= t <= 39
4858   //    Maj(x, y, z)    = (x & y) ^ (x & z) ^ (y & z)   , 40 <= t <= 59
4859   //    Parity(x, y, z) = x ^ y ^ z                     , 60 <= t <= 79
4860   void sha1_f(Register dst, Register x, Register y, Register z, int round) {
4861     assert(round >= 0 && round < 80, "must be");
4862     assert_different_registers(dst, x, y, z, t0, t1);
4863 
4864     if (round < 20) {
4865       // (x & y) ^ (~x & z)
4866       __ andr(t0, x, y);
4867       __ andn(dst, z, x);
4868       __ xorr(dst, dst, t0);
4869     } else if (round >= 40 && round < 60) {
4870       // (x & y) ^ (x & z) ^ (y & z)
4871       __ andr(t0, x, y);
4872       __ andr(t1, x, z);
4873       __ andr(dst, y, z);
4874       __ xorr(dst, dst, t0);
4875       __ xorr(dst, dst, t1);
4876     } else {
4877       // x ^ y ^ z
4878       __ xorr(dst, x, y);
4879       __ xorr(dst, dst, z);
4880     }
4881   }
4882 
4883   // T = ROTL'5(a) + f't(b, c, d) + e + K't + W't
4884   // e = d
4885   // d = c
4886   // c = ROTL'30(b)
4887   // b = a
4888   // a = T
4889   void sha1_process_round(Register a, Register b, Register c, Register d, Register e,
4890                           Register cur_k, Register cur_w, Register tmp, int round) {
4891     assert(round >= 0 && round < 80, "must be");
4892     assert_different_registers(a, b, c, d, e, cur_w, cur_k, tmp, t0);
4893 
4894     // T = ROTL'5(a) + f't(b, c, d) + e + K't + W't
4895 
4896     // cur_w will be recalculated at the beginning of each round,
4897     // so, we can reuse it as a temp register here.
4898     Register tmp2 = cur_w;
4899 
4900     // reuse e as a temporary register, as we will mv new value into it later
4901     Register tmp3 = e;
4902     __ add(tmp2, cur_k, tmp2);
4903     __ add(tmp3, tmp3, tmp2);
4904     __ rolw_imm(tmp2, a, 5, t0);
4905 
4906     sha1_f(tmp, b, c, d, round);
4907 
4908     __ add(tmp2, tmp2, tmp);
4909     __ add(tmp2, tmp2, tmp3);
4910 
4911     // e = d
4912     // d = c
4913     // c = ROTL'30(b)
4914     // b = a
4915     // a = T
4916     __ mv(e, d);
4917     __ mv(d, c);
4918 
4919     __ rolw_imm(c, b, 30);
4920     __ mv(b, a);
4921     __ mv(a, tmp2);
4922   }
4923 
4924   // H(i)0 = a + H(i-1)0
4925   // H(i)1 = b + H(i-1)1
4926   // H(i)2 = c + H(i-1)2
4927   // H(i)3 = d + H(i-1)3
4928   // H(i)4 = e + H(i-1)4
4929   void sha1_calculate_im_hash(Register a, Register b, Register c, Register d, Register e,
4930                               Register prev_ab, Register prev_cd, Register prev_e) {
4931     assert_different_registers(a, b, c, d, e, prev_ab, prev_cd, prev_e);
4932 
4933     __ add(a, a, prev_ab);
4934     __ srli(prev_ab, prev_ab, 32);
4935     __ add(b, b, prev_ab);
4936 
4937     __ add(c, c, prev_cd);
4938     __ srli(prev_cd, prev_cd, 32);
4939     __ add(d, d, prev_cd);
4940 
4941     __ add(e, e, prev_e);
4942   }
4943 
4944   void sha1_preserve_prev_abcde(Register a, Register b, Register c, Register d, Register e,
4945                                 Register prev_ab, Register prev_cd, Register prev_e) {
4946     assert_different_registers(a, b, c, d, e, prev_ab, prev_cd, prev_e, t0);
4947 
4948     __ slli(t0, b, 32);
4949     __ zero_extend(prev_ab, a, 32);
4950     __ orr(prev_ab, prev_ab, t0);
4951 
4952     __ slli(t0, d, 32);
4953     __ zero_extend(prev_cd, c, 32);
4954     __ orr(prev_cd, prev_cd, t0);
4955 
4956     __ mv(prev_e, e);
4957   }
4958 
4959   // Intrinsic for:
4960   //   void sun.security.provider.SHA.implCompress0(byte[] buf, int ofs)
4961   //   void sun.security.provider.DigestBase.implCompressMultiBlock0(byte[] b, int ofs, int limit)
4962   //
4963   // Arguments:
4964   //
4965   // Inputs:
4966   //   c_rarg0: byte[]  src array + offset
4967   //   c_rarg1: int[]   SHA.state
4968   //   - - - - - - below are only for implCompressMultiBlock0 - - - - - -
4969   //   c_rarg2: int     offset
4970   //   c_rarg3: int     limit
4971   //
4972   // Outputs:
4973   //   - - - - - - below are only for implCompressMultiBlock0 - - - - - -
4974   //   c_rarg0: int offset, when (multi_block == true)
4975   //
4976   address generate_sha1_implCompress(bool multi_block, const char *name) {
4977     __ align(CodeEntryAlignment);
4978     StubCodeMark mark(this, "StubRoutines", name);
4979 
4980     address start = __ pc();
4981     __ enter();
4982 
4983     RegSet saved_regs = RegSet::range(x18, x27);
4984     if (multi_block) {
4985       // use x9 as src below.
4986       saved_regs += RegSet::of(x9);
4987     }
4988     __ push_reg(saved_regs, sp);
4989 
4990     // c_rarg0 - c_rarg3: x10 - x13
4991     Register buf    = c_rarg0;
4992     Register state  = c_rarg1;
4993     Register offset = c_rarg2;
4994     Register limit  = c_rarg3;
4995     // use src to contain the original start point of the array.
4996     Register src    = x9;
4997 
4998     if (multi_block) {
4999       __ sub(limit, limit, offset);
5000       __ add(limit, limit, buf);
5001       __ sub(src, buf, offset);
5002     }
5003 
5004     // [args-reg]:  x14 - x17
5005     // [temp-reg]:  x28 - x31
5006     // [saved-reg]: x18 - x27
5007 
5008     // h0/1/2/3/4
5009     const Register a = x14, b = x15, c = x16, d = x17, e = x28;
5010     // w0, w1, ... w15
    // put two adjacent w's in one register:
    //    one in the high word part, the other in the low word part
    // depending on the round parity (even or odd), the w't values reside in different items of ws[].
    // w0 ~ w15 either reside in
5015     //    ws[0] ~ ws[7], where
5016     //      w0 at higher 32 bits of ws[0],
5017     //      w1 at lower 32 bits of ws[0],
5018     //      ...
5019     //      w14 at higher 32 bits of ws[7],
5020     //      w15 at lower 32 bits of ws[7].
5021     // or, reside in
5022     //    w0:       ws[0]'s lower 32 bits
5023     //    w1 ~ w14: ws[1] ~ ws[7]
5024     //    w15:      ws[8]'s higher 32 bits
5025     Register ws[9] = {x29, x30, x31, x18,
5026                       x19, x20, x21, x22,
5027                       x23}; // auxiliary register for calculating w's value
5028     // current k't's value
5029     const Register cur_k = x24;
5030     // current w't's value
5031     const Register cur_w = x25;
5032     // values of a, b, c, d, e in the previous round
5033     const Register prev_ab = x26, prev_cd = x27;
5034     const Register prev_e = offset; // reuse offset/c_rarg2
5035 
    // load the 5 state words into a, b, c, d, e.
    //
    // To minimize the number of memory operations, we apply the following
    // optimization: read the 4-byte state values (a/b/c/d) in pairs, each
    // pair with a single ld, and split every pair into two registers.
    //
    // Since the core SHA-1 algorithm works on 32-bit words, the code below
    // never looks at the upper 32 bits of a/b/c/d/e. Based on this
    // observation, we simply ignore the upper 32 bits of a/c/e rather than
    // spend extra instructions zeroing them explicitly.
5048     __ ld(a, Address(state, 0));
5049     __ srli(b, a, 32);
5050     __ ld(c, Address(state, 8));
5051     __ srli(d, c, 32);
5052     __ lw(e, Address(state, 16));
5053 
5054     Label L_sha1_loop;
5055     if (multi_block) {
5056       __ BIND(L_sha1_loop);
5057     }
5058 
5059     sha1_preserve_prev_abcde(a, b, c, d, e, prev_ab, prev_cd, prev_e);
5060 
5061     for (int round = 0; round < 80; round++) {
5062       // prepare K't value
5063       sha1_prepare_k(cur_k, round);
5064 
5065       // prepare W't value
5066       sha1_prepare_w(cur_w, ws, buf, round);
5067 
5068       // one round process
5069       sha1_process_round(a, b, c, d, e, cur_k, cur_w, t2, round);
5070     }
5071 
5072     // compute the intermediate hash value
5073     sha1_calculate_im_hash(a, b, c, d, e, prev_ab, prev_cd, prev_e);
5074 
5075     if (multi_block) {
5076       int64_t block_bytes = 16 * 4;
5077       __ addi(buf, buf, block_bytes);
5078 
5079       __ bge(limit, buf, L_sha1_loop, true);
5080     }
5081 
5082     // store back the state.
5083     __ zero_extend(a, a, 32);
5084     __ slli(b, b, 32);
5085     __ orr(a, a, b);
5086     __ sd(a, Address(state, 0));
5087     __ zero_extend(c, c, 32);
5088     __ slli(d, d, 32);
5089     __ orr(c, c, d);
5090     __ sd(c, Address(state, 8));
5091     __ sw(e, Address(state, 16));
5092 
5093     // return offset
5094     if (multi_block) {
5095       __ sub(c_rarg0, buf, src);
5096     }
5097 
5098     __ pop_reg(saved_regs, sp);
5099 
5100     __ leave();
5101     __ ret();
5102 
5103     return (address) start;
5104   }
5105 
5106   /**
5107    * vector registers:
   *   input VectorRegister's:  inputV1-V3, for m2 they could be v2, v4, v6, for m1 they could be v1, v2, v3
5109    *   index VectorRegister's:  idxV1-V4, for m2 they could be v8, v10, v12, v14, for m1 they could be v4, v5, v6, v7
5110    *   output VectorRegister's: outputV1-V4, for m2 they could be v16, v18, v20, v22, for m1 they could be v8, v9, v10, v11
5111    *
5112    * NOTE: each field will occupy a vector register group
5113    */
5114   void base64_vector_encode_round(Register src, Register dst, Register codec,
5115                     Register size, Register stepSrc, Register stepDst,
5116                     VectorRegister inputV1, VectorRegister inputV2, VectorRegister inputV3,
5117                     VectorRegister idxV1, VectorRegister idxV2, VectorRegister idxV3, VectorRegister idxV4,
5118                     VectorRegister outputV1, VectorRegister outputV2, VectorRegister outputV3, VectorRegister outputV4,
5119                     Assembler::LMUL lmul) {
5120     // set vector register type/len
5121     __ vsetvli(x0, size, Assembler::e8, lmul);
5122 
5123     // segmented load src into v registers: mem(src) => vr(3)
5124     __ vlseg3e8_v(inputV1, src);
5125 
5126     // src = src + register_group_len_bytes * 3
5127     __ add(src, src, stepSrc);
5128 
5129     // encoding
5130     //   1. compute index into lookup table: vr(3) => vr(4)
5131     __ vsrl_vi(idxV1, inputV1, 2);
5132 
5133     __ vsrl_vi(idxV2, inputV2, 2);
5134     __ vsll_vi(inputV1, inputV1, 6);
5135     __ vor_vv(idxV2, idxV2, inputV1);
5136     __ vsrl_vi(idxV2, idxV2, 2);
5137 
5138     __ vsrl_vi(idxV3, inputV3, 4);
5139     __ vsll_vi(inputV2, inputV2, 4);
5140     __ vor_vv(idxV3, inputV2, idxV3);
5141     __ vsrl_vi(idxV3, idxV3, 2);
5142 
5143     __ vsll_vi(idxV4, inputV3, 2);
5144     __ vsrl_vi(idxV4, idxV4, 2);
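    // The shift/or sequence above is equivalent to the usual scalar Base64
    // split of three source bytes (b0, b1, b2) = (inputV1, inputV2, inputV3)
    // into four 6-bit indices:
    //   idxV1 =  b0 >> 2
    //   idxV2 = ((b0 & 0x03) << 4) | (b1 >> 4)
    //   idxV3 = ((b1 & 0x0f) << 2) | (b2 >> 6)
    //   idxV4 =   b2 & 0x3f
    // expressed with 8-bit vector shifts and ors only (the left shifts discard
    // the bits that overflow each byte).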
5145 
5146     //   2. indexed load: vr(4) => vr(4)
5147     __ vluxei8_v(outputV1, codec, idxV1);
5148     __ vluxei8_v(outputV2, codec, idxV2);
5149     __ vluxei8_v(outputV3, codec, idxV3);
5150     __ vluxei8_v(outputV4, codec, idxV4);
5151 
5152     // segmented store encoded data in v registers back to dst: vr(4) => mem(dst)
5153     __ vsseg4e8_v(outputV1, dst);
5154 
5155     // dst = dst + register_group_len_bytes * 4
5156     __ add(dst, dst, stepDst);
5157   }
5158 
5159   /**
5160    *  void j.u.Base64.Encoder.encodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL)
5161    *
5162    *  Input arguments:
5163    *  c_rarg0   - src, source array
5164    *  c_rarg1   - sp, src start offset
5165    *  c_rarg2   - sl, src end offset
5166    *  c_rarg3   - dst, dest array
5167    *  c_rarg4   - dp, dst start offset
5168    *  c_rarg5   - isURL, Base64 or URL character set
5169    */
5170   address generate_base64_encodeBlock() {
5171     alignas(64) static const char toBase64[64] = {
5172       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
5173       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
5174       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
5175       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
5176       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
5177     };
5178 
5179     alignas(64) static const char toBase64URL[64] = {
5180       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
5181       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
5182       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
5183       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
5184       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
5185     };
5186 
5187     __ align(CodeEntryAlignment);
5188     StubCodeMark mark(this, "StubRoutines", "encodeBlock");
5189     address start = __ pc();
5190     __ enter();
5191 
5192     Register src    = c_rarg0;
5193     Register soff   = c_rarg1;
5194     Register send   = c_rarg2;
5195     Register dst    = c_rarg3;
5196     Register doff   = c_rarg4;
5197     Register isURL  = c_rarg5;
5198 
5199     Register codec  = c_rarg6;
5200     Register length = c_rarg7; // total length of src data in bytes
5201 
5202     Label ProcessData, Exit;
5203 
    // length should be a multiple of 3
5205     __ sub(length, send, soff);
5206     // real src/dst to process data
5207     __ add(src, src, soff);
5208     __ add(dst, dst, doff);
5209 
5210     // load the codec base address
5211     __ la(codec, ExternalAddress((address) toBase64));
5212     __ beqz(isURL, ProcessData);
5213     __ la(codec, ExternalAddress((address) toBase64URL));
5214     __ BIND(ProcessData);
5215 
5216     // vector version
5217     if (UseRVV) {
5218       Label ProcessM2, ProcessM1, ProcessScalar;
5219 
5220       Register size      = soff;
5221       Register stepSrcM1 = send;
5222       Register stepSrcM2 = doff;
5223       Register stepDst   = isURL;
5224 
5225       __ mv(size, MaxVectorSize * 2);
5226       __ mv(stepSrcM1, MaxVectorSize * 3);
5227       __ slli(stepSrcM2, stepSrcM1, 1);
5228       __ mv(stepDst, MaxVectorSize * 2 * 4);
5229 
5230       __ blt(length, stepSrcM2, ProcessM1);
5231 
5232       __ BIND(ProcessM2);
5233       base64_vector_encode_round(src, dst, codec,
5234                     size, stepSrcM2, stepDst,
5235                     v2, v4, v6,         // inputs
5236                     v8, v10, v12, v14,  // indexes
5237                     v16, v18, v20, v22, // outputs
5238                     Assembler::m2);
5239 
5240       __ sub(length, length, stepSrcM2);
5241       __ bge(length, stepSrcM2, ProcessM2);
5242 
5243       __ BIND(ProcessM1);
5244       __ blt(length, stepSrcM1, ProcessScalar);
5245 
5246       __ srli(size, size, 1);
5247       __ srli(stepDst, stepDst, 1);
5248       base64_vector_encode_round(src, dst, codec,
5249                     size, stepSrcM1, stepDst,
5250                     v1, v2, v3,         // inputs
5251                     v4, v5, v6, v7,     // indexes
5252                     v8, v9, v10, v11,   // outputs
5253                     Assembler::m1);
5254       __ sub(length, length, stepSrcM1);
5255 
5256       __ BIND(ProcessScalar);
5257     }
5258 
5259     // scalar version
5260     {
5261       Register byte1 = soff, byte0 = send, byte2 = doff;
5262       Register combined24Bits = isURL;
5263 
5264       __ beqz(length, Exit);
5265 
5266       Label ScalarLoop;
5267       __ BIND(ScalarLoop);
5268       {
5269         // plain:   [byte0[7:0] : byte1[7:0] : byte2[7:0]] =>
5270         // encoded: [byte0[7:2] : byte0[1:0]+byte1[7:4] : byte1[3:0]+byte2[7:6] : byte2[5:0]]
5271 
5272         // load 3 bytes src data
5273         __ lbu(byte0, Address(src, 0));
5274         __ lbu(byte1, Address(src, 1));
5275         __ lbu(byte2, Address(src, 2));
5276         __ addi(src, src, 3);
5277 
5278         // construct 24 bits from 3 bytes
5279         __ slliw(byte0, byte0, 16);
5280         __ slliw(byte1, byte1, 8);
5281         __ orr(combined24Bits, byte0, byte1);
5282         __ orr(combined24Bits, combined24Bits, byte2);
5283 
        // get codec index and encode (i.e. load from codec by index)
5285         __ slliw(byte0, combined24Bits, 8);
5286         __ srliw(byte0, byte0, 26);
5287         __ add(byte0, codec, byte0);
5288         __ lbu(byte0, byte0);
5289 
5290         __ slliw(byte1, combined24Bits, 14);
5291         __ srliw(byte1, byte1, 26);
5292         __ add(byte1, codec, byte1);
5293         __ lbu(byte1, byte1);
5294 
5295         __ slliw(byte2, combined24Bits, 20);
5296         __ srliw(byte2, byte2, 26);
5297         __ add(byte2, codec, byte2);
5298         __ lbu(byte2, byte2);
5299 
5300         __ andi(combined24Bits, combined24Bits, 0x3f);
5301         __ add(combined24Bits, codec, combined24Bits);
5302         __ lbu(combined24Bits, combined24Bits);
5303 
5304         // store 4 bytes encoded data
5305         __ sb(byte0, Address(dst, 0));
5306         __ sb(byte1, Address(dst, 1));
5307         __ sb(byte2, Address(dst, 2));
5308         __ sb(combined24Bits, Address(dst, 3));
5309 
5310         __ sub(length, length, 3);
5311         __ addi(dst, dst, 4);
5312         // loop back
5313         __ bnez(length, ScalarLoop);
5314       }
5315     }
5316 
5317     __ BIND(Exit);
5318 
5319     __ leave();
5320     __ ret();
5321 
5322     return (address) start;
5323   }
5324 
5325   /**
5326    * vector registers:
   * input VectorRegister's:  inputV1-V4, for m2 they could be v2, v4, v6, v8, for m1 they could be v1, v2, v3, v4
   * index VectorRegister's:  idxV1-V4, for m2 they could be v10, v12, v14, v16, for m1 they could be v5, v6, v7, v8
   * output VectorRegister's: outputV1-V3, for m2 they could be v18, v20, v22, for m1 they could be v9, v10, v11
5330    *
5331    * NOTE: each field will occupy a single vector register group
5332    */
5333   void base64_vector_decode_round(Register src, Register dst, Register codec,
5334                     Register size, Register stepSrc, Register stepDst, Register failedIdx, Register minusOne,
5335                     VectorRegister inputV1, VectorRegister inputV2, VectorRegister inputV3, VectorRegister inputV4,
5336                     VectorRegister idxV1, VectorRegister idxV2, VectorRegister idxV3, VectorRegister idxV4,
5337                     VectorRegister outputV1, VectorRegister outputV2, VectorRegister outputV3,
5338                     Assembler::LMUL lmul) {
5339     // set vector register type/len
5340     __ vsetvli(x0, size, Assembler::e8, lmul, Assembler::ma, Assembler::ta);
5341 
5342     // segmented load src into v registers: mem(src) => vr(4)
5343     __ vlseg4e8_v(inputV1, src);
5344 
5345     // src = src + register_group_len_bytes * 4
5346     __ add(src, src, stepSrc);
5347 
5348     // decoding
5349     //   1. indexed load: vr(4) => vr(4)
5350     __ vluxei8_v(idxV1, codec, inputV1);
5351     __ vluxei8_v(idxV2, codec, inputV2);
5352     __ vluxei8_v(idxV3, codec, inputV3);
5353     __ vluxei8_v(idxV4, codec, inputV4);
5354 
5355     //   2. check wrong data
5356     __ vor_vv(outputV1, idxV1, idxV2);
5357     __ vor_vv(outputV2, idxV3, idxV4);
5358     __ vor_vv(outputV1, outputV1, outputV2);
5359     __ vmseq_vi(v0, outputV1, -1);
5360     __ vfirst_m(failedIdx, v0);
5361     Label NoFailure;
5362     __ beq(failedIdx, minusOne, NoFailure);
5363     __ vsetvli(x0, failedIdx, Assembler::e8, lmul, Assembler::mu, Assembler::tu);
5364     __ slli(stepDst, failedIdx, 1);
5365     __ add(stepDst, failedIdx, stepDst);
5366     __ BIND(NoFailure);
5367 
5368     //   3. compute the decoded data: vr(4) => vr(3)
5369     __ vsll_vi(idxV1, idxV1, 2);
5370     __ vsrl_vi(outputV1, idxV2, 4);
5371     __ vor_vv(outputV1, outputV1, idxV1);
5372 
5373     __ vsll_vi(idxV2, idxV2, 4);
5374     __ vsrl_vi(outputV2, idxV3, 2);
5375     __ vor_vv(outputV2, outputV2, idxV2);
5376 
5377     __ vsll_vi(idxV3, idxV3, 6);
5378     __ vor_vv(outputV3, idxV4, idxV3);
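    // The shift/or sequence above reassembles three output bytes from the four
    // decoded 6-bit values (d0..d3) = (idxV1..idxV4), i.e. the scalar formula:
    //   outputV1 = (d0 << 2) | (d1 >> 4)
    //   outputV2 = (d1 << 4) | (d2 >> 2)
    //   outputV3 = (d2 << 6) |  d3
    // where the 8-bit vector left shifts discard the bits that overflow each byte.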
5379 
5380     // segmented store encoded data in v registers back to dst: vr(3) => mem(dst)
5381     __ vsseg3e8_v(outputV1, dst);
5382 
5383     // dst = dst + register_group_len_bytes * 3
5384     __ add(dst, dst, stepDst);
5385   }
5386 
5387   /**
5388    * int j.u.Base64.Decoder.decodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL, boolean isMIME)
5389    *
5390    *  Input arguments:
5391    *  c_rarg0   - src, source array
5392    *  c_rarg1   - sp, src start offset
5393    *  c_rarg2   - sl, src end offset
5394    *  c_rarg3   - dst, dest array
5395    *  c_rarg4   - dp, dst start offset
5396    *  c_rarg5   - isURL, Base64 or URL character set
5397    *  c_rarg6   - isMIME, Decoding MIME block
5398    */
5399   address generate_base64_decodeBlock() {
5400 
5401     static const uint8_t fromBase64[256] = {
5402         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5403         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5404         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
5405         52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
5406         255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
5407         15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u, 255u,
5408         255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
5409         41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
5410         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5411         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5412         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5413         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5414         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5415         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5416         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5417         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5418     };
5419 
5420     static const uint8_t fromBase64URL[256] = {
5421         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5422         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5423         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
5424         52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
5425         255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
5426         15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,  63u,
5427         255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
5428         41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
5429         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5430         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5431         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5432         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5433         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5434         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5435         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5436         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5437     };
5438 
5439     __ align(CodeEntryAlignment);
5440     StubCodeMark mark(this, "StubRoutines", "decodeBlock");
5441     address start = __ pc();
5442     __ enter();
5443 
5444     Register src    = c_rarg0;
5445     Register soff   = c_rarg1;
5446     Register send   = c_rarg2;
5447     Register dst    = c_rarg3;
5448     Register doff   = c_rarg4;
5449     Register isURL  = c_rarg5;
5450     Register isMIME = c_rarg6;
5451 
5452     Register codec     = c_rarg7;
5453     Register dstBackup = x31;
5454     Register length    = x28;     // t3, total length of src data in bytes
5455 
5456     Label ProcessData, Exit;
5457     Label ProcessScalar, ScalarLoop;
5458 
    // The passed-in length (send - soff) is guaranteed to be > 4, and this
    // intrinsic only processes data whose length is a multiple of 4. The Java
    // level does not guarantee that, so round the length down explicitly.
5462     __ sub(length, send, soff);
5463     __ andi(length, length, -4);
5464     // real src/dst to process data
5465     __ add(src, src, soff);
5466     __ add(dst, dst, doff);
5467     // backup of dst, used to calculate the return value at exit
5468     __ mv(dstBackup, dst);
5469 
5470     // load the codec base address
5471     __ la(codec, ExternalAddress((address) fromBase64));
5472     __ beqz(isURL, ProcessData);
5473     __ la(codec, ExternalAddress((address) fromBase64URL));
5474     __ BIND(ProcessData);
5475 
5476     // vector version
5477     if (UseRVV) {
      // The MIME case has a default line-length limit of 76, which can be
      // different from (and smaller than) (send - soff), so for MIME we go
      // through the scalar code path directly.
5481       __ bnez(isMIME, ScalarLoop);
5482 
5483       Label ProcessM1, ProcessM2;
5484 
5485       Register failedIdx = soff;
5486       Register stepSrcM1 = send;
5487       Register stepSrcM2 = doff;
5488       Register stepDst   = isURL;
5489       Register size      = x29;   // t4
5490       Register minusOne  = x30;   // t5
5491 
5492       __ mv(minusOne, -1);
5493       __ mv(size, MaxVectorSize * 2);
5494       __ mv(stepSrcM1, MaxVectorSize * 4);
5495       __ slli(stepSrcM2, stepSrcM1, 1);
5496       __ mv(stepDst, MaxVectorSize * 2 * 3);
5497 
5498       __ blt(length, stepSrcM2, ProcessM1);
5499 
5500 
5501       // Assembler::m2
5502       __ BIND(ProcessM2);
5503       base64_vector_decode_round(src, dst, codec,
5504                     size, stepSrcM2, stepDst, failedIdx, minusOne,
5505                     v2, v4, v6, v8,      // inputs
5506                     v10, v12, v14, v16,  // indexes
5507                     v18, v20, v22,       // outputs
5508                     Assembler::m2);
5509       __ sub(length, length, stepSrcM2);
5510 
5511       // error check
5512       __ bne(failedIdx, minusOne, Exit);
5513 
5514       __ bge(length, stepSrcM2, ProcessM2);
5515 
5516 
5517       // Assembler::m1
5518       __ BIND(ProcessM1);
5519       __ blt(length, stepSrcM1, ProcessScalar);
5520 
5521       __ srli(size, size, 1);
5522       __ srli(stepDst, stepDst, 1);
5523       base64_vector_decode_round(src, dst, codec,
5524                     size, stepSrcM1, stepDst, failedIdx, minusOne,
5525                     v1, v2, v3, v4,      // inputs
5526                     v5, v6, v7, v8,      // indexes
5527                     v9, v10, v11,        // outputs
5528                     Assembler::m1);
5529       __ sub(length, length, stepSrcM1);
5530 
5531       // error check
5532       __ bne(failedIdx, minusOne, Exit);
5533 
5534       __ BIND(ProcessScalar);
5535       __ beqz(length, Exit);
5536     }
5537 
5538     // scalar version
5539     {
5540       Register byte0 = soff, byte1 = send, byte2 = doff, byte3 = isURL;
      Register combined32Bits = x29; // t4
5542 
      // encoded: [byte0[5:0] : byte1[5:0] : byte2[5:0] : byte3[5:0]] =>
      // plain:   [byte0[5:0]+byte1[5:4] : byte1[3:0]+byte2[5:2] : byte2[1:0]+byte3[5:0]]
5545       __ BIND(ScalarLoop);
5546 
5547       // load 4 bytes encoded src data
5548       __ lbu(byte0, Address(src, 0));
5549       __ lbu(byte1, Address(src, 1));
5550       __ lbu(byte2, Address(src, 2));
5551       __ lbu(byte3, Address(src, 3));
5552       __ addi(src, src, 4);
5553 
      // get codec index and decode (i.e. load from codec by index)
5555       __ add(byte0, codec, byte0);
5556       __ add(byte1, codec, byte1);
5557       __ lb(byte0, Address(byte0, 0));
5558       __ lb(byte1, Address(byte1, 0));
5559       __ add(byte2, codec, byte2);
5560       __ add(byte3, codec, byte3);
5561       __ lb(byte2, Address(byte2, 0));
5562       __ lb(byte3, Address(byte3, 0));
5563       __ slliw(byte0, byte0, 18);
5564       __ slliw(byte1, byte1, 12);
5565       __ orr(byte0, byte0, byte1);
5566       __ orr(byte0, byte0, byte3);
5567       __ slliw(byte2, byte2, 6);
      // For performance, `combined32Bits` is constructed for two purposes at once:
      //  1. the error check below
      //  2. the decode below
5571       __ orr(combined32Bits, byte0, byte2);
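      // Invalid characters map to 255 in the codec table; since lb sign-extends,
      // any invalid byte makes its contribution negative, which forces the sign
      // bit of combined32Bits and is caught by the bltz below.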
5572 
5573       // error check
5574       __ bltz(combined32Bits, Exit);
5575 
5576       // store 3 bytes decoded data
5577       __ sraiw(byte0, combined32Bits, 16);
5578       __ sraiw(byte1, combined32Bits, 8);
5579       __ sb(byte0, Address(dst, 0));
5580       __ sb(byte1, Address(dst, 1));
5581       __ sb(combined32Bits, Address(dst, 2));
5582 
5583       __ sub(length, length, 4);
5584       __ addi(dst, dst, 3);
5585       // loop back
5586       __ bnez(length, ScalarLoop);
5587     }
5588 
5589     __ BIND(Exit);
5590     __ sub(c_rarg0, dst, dstBackup);
5591 
5592     __ leave();
5593     __ ret();
5594 
5595     return (address) start;
5596   }
5597 
5598   void adler32_process_bytes(Register buff, Register s1, Register s2, VectorRegister vtable,
5599     VectorRegister vzero, VectorRegister vbytes, VectorRegister vs1acc, VectorRegister vs2acc,
5600     Register temp0, Register temp1, Register temp2,  Register temp3,
5601     VectorRegister vtemp1, VectorRegister vtemp2, int step, Assembler::LMUL lmul) {
5602 
5603     assert((lmul == Assembler::m4 && step == 64) ||
5604            (lmul == Assembler::m2 && step == 32) ||
5605            (lmul == Assembler::m1 && step == 16),
5606            "LMUL should be aligned with step: m4 and 64, m2 and 32 or m1 and 16");
    // Below is the function for calculating the Adler32 checksum with a 64-, 32- or 16-byte step; LMUL=m4, m2 or m1 is used accordingly.
    // The results are in v12, v13, ..., v22, v23. The example below is for the 64-byte step case.
5609     // We use b1, b2, ..., b64 to denote the 64 bytes loaded in each iteration.
5610     // In non-vectorized code, we update s1 and s2 as:
5611     //   s1 <- s1 + b1
5612     //   s2 <- s2 + s1
5613     //   s1 <- s1 + b2
    //   s2 <- s2 + s1
5615     //   ...
5616     //   s1 <- s1 + b64
5617     //   s2 <- s2 + s1
5618     // Putting above assignments together, we have:
5619     //   s1_new = s1 + b1 + b2 + ... + b64
5620     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b64) =
5621     //          = s2 + s1 * 64 + (b1 * 64 + b2 * 63 + ... + b64 * 1) =
5622     //          = s2 + s1 * 64 + (b1, b2, ... b64) dot (64, 63, ... 1)
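    // For example, with a 4-byte step the same identity reads:
    //   s1_new = s1 + b1 + b2 + b3 + b4
    //   s2_new = s2 + 4 * s1 + 4 * b1 + 3 * b2 + 2 * b3 + 1 * b4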
5623 
5624     __ mv(temp3, step);
5625     // Load data
5626     __ vsetvli(temp0, temp3, Assembler::e8, lmul);
5627     __ vle8_v(vbytes, buff);
5628     __ addi(buff, buff, step);
5629 
5630     // Upper bound reduction sum for s1_new:
5631     // 0xFF * 64 = 0x3FC0, so:
5632     // 1. Need to do vector-widening reduction sum
5633     // 2. It is safe to perform sign-extension during vmv.x.s with 16-bits elements
5634     __ vwredsumu_vs(vs1acc, vbytes, vzero);
5635     // Multiplication for s2_new
5636     __ vwmulu_vv(vs2acc, vtable, vbytes);
5637 
    // s2 = s2 + s1 * step (implemented as a left shift by log2(step))
5639     __ slli(temp1, s1, exact_log2(step));
5640     __ add(s2, s2, temp1);
5641 
5642     // Summing up calculated results for s2_new
5643     if (MaxVectorSize > 16) {
5644       __ vsetvli(temp0, temp3, Assembler::e16, lmul);
5645     } else {
      // Half of the vector-widening multiplication result lands in the successor
      // of the vs2acc group when vlen == 16, in which case we need to double the
      // vector register group width in order to reduction-sum all of it.
5649       Assembler::LMUL lmulx2 = (lmul == Assembler::m1) ? Assembler::m2 :
5650                                (lmul == Assembler::m2) ? Assembler::m4 : Assembler::m8;
5651       __ vsetvli(temp0, temp3, Assembler::e16, lmulx2);
5652     }
    // Upper bound for the reduction sum:
    // 0xFF * (64 + 63 + ... + 2 + 1) = 0x817E0 max for the whole register group, so:
    // 1. We need a vector-widening reduction sum
    // 2. It is safe to perform sign-extension during vmv.x.s with 32-bit elements
5657     __ vwredsumu_vs(vtemp1, vs2acc, vzero);
5658 
5659     // Extracting results for:
5660     // s1_new
5661     __ vmv_x_s(temp0, vs1acc);
5662     __ add(s1, s1, temp0);
5663     // s2_new
5664     __ vsetvli(temp0, temp3, Assembler::e32, Assembler::m1);
5665     __ vmv_x_s(temp1, vtemp1);
5666     __ add(s2, s2, temp1);
5667   }
5668 
  /**
5670    *  int java.util.zip.Adler32.updateBytes(int adler, byte[] b, int off, int len)
5671    *
5672    *  Arguments:
5673    *
5674    *  Inputs:
5675    *   c_rarg0   - int   adler
5676    *   c_rarg1   - byte* buff (b + off)
5677    *   c_rarg2   - int   len
5678    *
5679    *  Output:
5680    *   c_rarg0   - int adler result
5681    */
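  // For reference (RFC 1950, illustrative only): the Adler-32 state is
  // adler = (s2 << 16) | s1, where
  //   s1 = 1 + D1 + D2 + ... + Dn                          (mod 65521)
  //   s2 = (1 + D1) + (1 + D1 + D2) + ... + (1 + ... + Dn) (mod 65521)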
5682   address generate_updateBytesAdler32() {
5683     __ align(CodeEntryAlignment);
5684     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
5685     address start = __ pc();
5686 
5687     Label L_nmax, L_nmax_loop, L_nmax_loop_entry, L_by16, L_by16_loop,
5688       L_by16_loop_unroll, L_by1_loop, L_do_mod, L_combine, L_by1;
5689 
5690     // Aliases
5691     Register adler  = c_rarg0;
5692     Register s1     = c_rarg0;
5693     Register s2     = c_rarg3;
5694     Register buff   = c_rarg1;
5695     Register len    = c_rarg2;
5696     Register nmax  = c_rarg4;
5697     Register base  = c_rarg5;
5698     Register count = c_rarg6;
5699     Register temp0 = x28; // t3
5700     Register temp1 = x29; // t4
5701     Register temp2 = x30; // t5
5702     Register temp3 = x31; // t6
5703 
5704     VectorRegister vzero = v31;
5705     VectorRegister vbytes = v8; // group: v8, v9, v10, v11
5706     VectorRegister vs1acc = v12; // group: v12, v13, v14, v15
5707     VectorRegister vs2acc = v16; // group: v16, v17, v18, v19, v20, v21, v22, v23
5708     VectorRegister vtable_64 = v24; // group: v24, v25, v26, v27
5709     VectorRegister vtable_32 = v4; // group: v4, v5
5710     VectorRegister vtable_16 = v30;
5711     VectorRegister vtemp1 = v28;
5712     VectorRegister vtemp2 = v29;
5713 
5714     // Max number of bytes we can process before having to take the mod
5715     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
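    // Worked check for n = 5552: 255*5552*5553/2 + 5553*(BASE-1)
    //   = 3,930,857,640 + 363,832,560 = 4,294,690,200 <= 2^32-1,
    // while n = 5553 already exceeds 2^32-1, so 5552 is the largest
    // safe block length between modulo reductions.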
5716     const uint64_t BASE = 0xfff1;
5717     const uint64_t NMAX = 0x15B0;
5718 
    // Loop steps
5720     int step_64 = 64;
5721     int step_32 = 32;
5722     int step_16 = 16;
5723     int step_1  = 1;
5724 
5725     __ enter(); // Required for proper stackwalking of RuntimeStub frame
5726     __ mv(temp1, 64);
5727     __ vsetvli(temp0, temp1, Assembler::e8, Assembler::m4);
5728 
5729     // Generating accumulation coefficients for further calculations
5730     // vtable_64:
5731     __ vid_v(vtemp1);
5732     __ vrsub_vx(vtable_64, vtemp1, temp1);
5733     // vtable_64 group now contains { 0x40, 0x3f, 0x3e, ..., 0x3, 0x2, 0x1 }
5734 
5735     // vtable_32:
5736     __ mv(temp1, 32);
5737     __ vsetvli(temp0, temp1, Assembler::e8, Assembler::m2);
5738     __ vid_v(vtemp1);
5739     __ vrsub_vx(vtable_32, vtemp1, temp1);
5740     // vtable_32 group now contains { 0x20, 0x1f, 0x1e, ..., 0x3, 0x2, 0x1 }
5741 
5742     __ vsetivli(temp0, 16, Assembler::e8, Assembler::m1);
5743     // vtable_16:
5744     __ mv(temp1, 16);
5745     __ vid_v(vtemp1);
5746     __ vrsub_vx(vtable_16, vtemp1, temp1);
5747     // vtable_16 now contains { 0x10, 0xf, 0xe, ..., 0x3, 0x2, 0x1 }
5748 
5749     __ vmv_v_i(vzero, 0);
5750 
5751     __ mv(base, BASE);
5752     __ mv(nmax, NMAX);
5753 
5754     // s1 is initialized to the lower 16 bits of adler
5755     // s2 is initialized to the upper 16 bits of adler
5756     __ srliw(s2, adler, 16); // s2 = ((adler >> 16) & 0xffff)
5757     __ zero_extend(s1, adler, 16); // s1 = (adler & 0xffff)
5758 
    // The vectorized loop needs at least 16 bytes for one iteration.
    // It checks this itself, but it is more efficient to branch straight
    // to the cleanup loop when fewer than 16 bytes remain.
5761     __ mv(temp0, step_16);
5762     __ bgeu(len, temp0, L_nmax);
5763     __ beqz(len, L_combine);
5764 
5765     // Jumping to L_by1_loop
5766     __ sub(len, len, step_1);
5767     __ j(L_by1_loop);
5768 
5769   __ bind(L_nmax);
5770     __ sub(len, len, nmax);
5771     __ sub(count, nmax, 16);
5772     __ bltz(len, L_by16);
5773 
5774   // Align L_nmax loop by 64
5775   __ bind(L_nmax_loop_entry);
5776     __ sub(count, count, 32);
5777 
5778   __ bind(L_nmax_loop);
5779     adler32_process_bytes(buff, s1, s2, vtable_64, vzero,
5780       vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
5781       vtemp1, vtemp2, step_64, Assembler::m4);
5782     __ sub(count, count, step_64);
5783     __ bgtz(count, L_nmax_loop);
5784 
    // 48 bytes of this nmax chunk remain (three 16-byte iterations);
    // process them as one 32-byte step and one 16-byte step
5786     adler32_process_bytes(buff, s1, s2, vtable_32, vzero,
5787       vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
5788       vtemp1, vtemp2, step_32, Assembler::m2);
5789     adler32_process_bytes(buff, s1, s2, vtable_16, vzero,
5790       vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
5791       vtemp1, vtemp2, step_16, Assembler::m1);
5792 
5793     // s1 = s1 % BASE
5794     __ remuw(s1, s1, base);
5795     // s2 = s2 % BASE
5796     __ remuw(s2, s2, base);
5797 
5798     __ sub(len, len, nmax);
5799     __ sub(count, nmax, 16);
5800     __ bgez(len, L_nmax_loop_entry);
5801 
5802   __ bind(L_by16);
5803     __ add(len, len, count);
5804     __ bltz(len, L_by1);
    // Use the 64-byte unrolled loop while at least 64 bytes remain
5806     __ mv(temp3, step_64);
5807     __ blt(len, temp3, L_by16_loop);
5808 
5809   __ bind(L_by16_loop_unroll);
5810     adler32_process_bytes(buff, s1, s2, vtable_64, vzero,
5811       vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
5812       vtemp1, vtemp2, step_64, Assembler::m4);
5813     __ sub(len, len, step_64);
    // temp3 still holds 64 here (adler32_process_bytes set it to step)
5815     __ bge(len, temp3, L_by16_loop_unroll);
5816 
5817   __ bind(L_by16_loop);
5818     adler32_process_bytes(buff, s1, s2, vtable_16, vzero,
5819       vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
5820       vtemp1, vtemp2, step_16, Assembler::m1);
5821     __ sub(len, len, step_16);
5822     __ bgez(len, L_by16_loop);
5823 
5824   __ bind(L_by1);
5825     __ add(len, len, 15);
5826     __ bltz(len, L_do_mod);
5827 
5828   __ bind(L_by1_loop);
5829     __ lbu(temp0, Address(buff, 0));
5830     __ addi(buff, buff, step_1);
5831     __ add(s1, temp0, s1);
5832     __ add(s2, s2, s1);
5833     __ sub(len, len, step_1);
5834     __ bgez(len, L_by1_loop);
5835 
5836   __ bind(L_do_mod);
5837     // s1 = s1 % BASE
5838     __ remuw(s1, s1, base);
5839     // s2 = s2 % BASE
5840     __ remuw(s2, s2, base);
5841 
5842     // Combine lower bits and higher bits
5843     // adler = s1 | (s2 << 16)
5844   __ bind(L_combine);
5845     __ slli(s2, s2, 16);
5846     __ orr(s1, s1, s2);
5847 
5848     __ leave(); // Required for proper stackwalking of RuntimeStub frame
5849     __ ret();
5850 
5851     return start;
5852   }
5853 
5854 #endif // COMPILER2_OR_JVMCI
5855 
5856 #ifdef COMPILER2
5857 
  static const int64_t right_2_bits = right_n_bits(2);
  static const int64_t right_3_bits = right_n_bits(3);
5860 
5861   // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
5862   // are represented as long[5], with BITS_PER_LIMB = 26.
5863   // Pack five 26-bit limbs into three 64-bit registers.
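  // Equivalently, in scalar C (illustrative sketch, not the generated code):
  //   dest0 = limbs[0] | (limbs[1] << 26) | (limbs[2] << 52);          // bits   0..63
  //   dest1 = (limbs[2] >> 12) | (limbs[3] << 14) | (limbs[4] << 40);  // bits  64..127
  //   dest2 = limbs[4] >> 24;                                          // bits 128..129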
5864   void poly1305_pack_26(Register dest0, Register dest1, Register dest2, Register src, Register tmp1, Register tmp2) {
5865     assert_different_registers(dest0, dest1, dest2, src, tmp1, tmp2);
5866 
    // The goal is to have the full 130-bit value in dest2:dest1:dest0
5868     __ ld(dest0, Address(src, 0));    // 26 bits in dest0
5869 
5870     __ ld(tmp1, Address(src, sizeof(jlong)));
5871     __ slli(tmp1, tmp1, 26);
5872     __ add(dest0, dest0, tmp1);       // 52 bits in dest0
5873 
5874     __ ld(tmp2, Address(src, 2 * sizeof(jlong)));
5875     __ slli(tmp1, tmp2, 52);
5876     __ add(dest0, dest0, tmp1);       // dest0 is full
5877 
    __ srli(dest1, tmp2, 12);         // 14 bits in dest1
5879 
5880     __ ld(tmp1, Address(src, 3 * sizeof(jlong)));
5881     __ slli(tmp1, tmp1, 14);
    __ add(dest1, dest1, tmp1);       // 40 bits in dest1
5883 
5884     __ ld(tmp1, Address(src, 4 * sizeof(jlong)));
5885     __ slli(tmp2, tmp1, 40);
5886     __ add(dest1, dest1, tmp2);       // dest1 is full
5887 
5888     if (dest2->is_valid()) {
5889       __ srli(tmp1, tmp1, 24);
5890       __ mv(dest2, tmp1);               // 2 bits in dest2
5891     } else {
5892 #ifdef ASSERT
5893       Label OK;
5894       __ srli(tmp1, tmp1, 24);
      __ beq(zr, tmp1, OK);           // the top 2 bits must be zero
5896       __ stop("high bits of Poly1305 integer should be zero");
5897       __ should_not_reach_here();
5898       __ bind(OK);
5899 #endif
5900     }
5901   }
5902 
5903   // As above, but return only a 128-bit integer, packed into two
5904   // 64-bit registers.
5905   void poly1305_pack_26(Register dest0, Register dest1, Register src, Register tmp1, Register tmp2) {
5906     poly1305_pack_26(dest0, dest1, noreg, src, tmp1, tmp2);
5907   }
5908 
  // U_2:U_1:U_0 += (U_2 >> 2) * 5
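  // Since 2^130 == 5 (mod 2^130 - 5), the bits of the accumulator at and above
  // 2^130 (i.e. U_2 >> 2) can be folded back in as (U_2 >> 2) * 5. The code
  // below does this in two passes: first add (U_2 >> 2), then (U_2 >> 2) << 2.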
5910   void poly1305_reduce(Register U_2, Register U_1, Register U_0, Register tmp1, Register tmp2) {
5911     assert_different_registers(U_2, U_1, U_0, tmp1, tmp2);
5912 
5913     // First, U_2:U_1:U_0 += (U_2 >> 2)
5914     __ srli(tmp1, U_2, 2);
5915     __ cad(U_0, U_0, tmp1, tmp2); // Add tmp1 to U_0 with carry output to tmp2
5916     __ andi(U_2, U_2, right_2_bits); // Clear U_2 except for the lowest two bits
5917     __ cad(U_1, U_1, tmp2, tmp2); // Add carry to U_1 with carry output to tmp2
5918     __ add(U_2, U_2, tmp2);
5919 
5920     // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2
5921     __ slli(tmp1, tmp1, 2);
5922     __ cad(U_0, U_0, tmp1, tmp2); // Add tmp1 to U_0 with carry output to tmp2
5923     __ cad(U_1, U_1, tmp2, tmp2); // Add carry to U_1 with carry output to tmp2
5924     __ add(U_2, U_2, tmp2);
5925   }
5926 
5927   // Poly1305, RFC 7539
5928   // void com.sun.crypto.provider.Poly1305.processMultipleBlocks(byte[] input, int offset, int length, long[] aLimbs, long[] rLimbs)
5929 
5930   // Arguments:
5931   //    c_rarg0:   input_start -- where the input is stored
5932   //    c_rarg1:   length
5933   //    c_rarg2:   acc_start -- where the output will be stored
5934   //    c_rarg3:   r_start -- where the randomly generated 128-bit key is stored
5935 
5936   // See https://loup-vaillant.fr/tutorials/poly1305-design for a
5937   // description of the tricks used to simplify and accelerate this
5938   // computation.
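  // For reference, the per-block update implemented below is (RFC 7539;
  // illustrative sketch only, not the generated code):
  //
  //   p = 2^130 - 5
  //   for each 16-byte block m:
  //     n   = le_bytes_to_int(m) + 2^128        // the appended 1 byte
  //     acc = ((acc + n) * r) mod p
  //
  // Here r is kept packed in R_1:R_0 and the accumulator in U_2:U_1:U_0,
  // and only a partial reduction is done per block.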
5939 
5940   address generate_poly1305_processBlocks() {
5941     __ align(CodeEntryAlignment);
5942     StubCodeMark mark(this, "StubRoutines", "poly1305_processBlocks");
5943     address start = __ pc();
5944     __ enter();
5945     Label here;
5946 
5947     RegSet saved_regs = RegSet::range(x18, x21);
5948     RegSetIterator<Register> regs = (RegSet::range(x14, x31) - RegSet::range(x22, x27)).begin();
5949     __ push_reg(saved_regs, sp);
5950 
5951     // Arguments
5952     const Register input_start = c_rarg0, length = c_rarg1, acc_start = c_rarg2, r_start = c_rarg3;
5953 
5954     // R_n is the 128-bit randomly-generated key, packed into two
5955     // registers. The caller passes this key to us as long[5], with
5956     // BITS_PER_LIMB = 26.
5957     const Register R_0 = *regs, R_1 = *++regs;
5958     poly1305_pack_26(R_0, R_1, r_start, t1, t2);
5959 
5960     // RR_n is (R_n >> 2) * 5
5961     const Register RR_0 = *++regs, RR_1 = *++regs;
5962     __ srli(t1, R_0, 2);
5963     __ shadd(RR_0, t1, t1, t2, 2);
5964     __ srli(t1, R_1, 2);
5965     __ shadd(RR_1, t1, t1, t2, 2);
5966 
5967     // U_n is the current checksum
5968     const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
5969     poly1305_pack_26(U_0, U_1, U_2, acc_start, t1, t2);
5970 
5971     static constexpr int BLOCK_LENGTH = 16;
5972     Label DONE, LOOP;
5973 
5974     __ mv(t1, BLOCK_LENGTH);
5975     __ blt(length, t1, DONE); {
5976       __ bind(LOOP);
5977 
5978       // S_n is to be the sum of U_n and the next block of data
5979       const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
5980       __ ld(S_0, Address(input_start, 0));
5981       __ ld(S_1, Address(input_start, wordSize));
5982 
5983       __ cad(S_0, S_0, U_0, t1); // Add U_0 to S_0 with carry output to t1
5984       __ cadc(S_1, S_1, U_1, t1); // Add U_1 with carry to S_1 with carry output to t1
5985       __ add(S_2, U_2, t1);
5986 
      __ addi(S_2, S_2, 1); // add 2^128: every full 16-byte block is padded with a 1 byte (RFC 7539)
5988 
5989       const Register U_0HI = *++regs, U_1HI = *++regs;
5990 
5991       // NB: this logic depends on some of the special properties of
5992       // Poly1305 keys. In particular, because we know that the top
5993       // four bits of R_0 and R_1 are zero, we can add together
5994       // partial products without any risk of needing to propagate a
5995       // carry out.
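      // Rough bound (sketch): R_0, R_1 < 2^60 (top four bits zero) and
      // RR_0, RR_1 = (R_n >> 2) * 5 < 5 * 2^58, while S_0, S_1 < 2^64 and
      // S_2 is only a few bits wide, so each partial product is below ~2^125
      // and the three-term sum stays under 2^128, fitting in U_nHI:U_n.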
5996       __ wide_mul(U_0, U_0HI, S_0, R_0);
5997       __ wide_madd(U_0, U_0HI, S_1, RR_1, t1, t2);
5998       __ wide_madd(U_0, U_0HI, S_2, RR_0, t1, t2);
5999 
6000       __ wide_mul(U_1, U_1HI, S_0, R_1);
6001       __ wide_madd(U_1, U_1HI, S_1, R_0, t1, t2);
6002       __ wide_madd(U_1, U_1HI, S_2, RR_1, t1, t2);
6003 
6004       __ andi(U_2, R_0, right_2_bits);
6005       __ mul(U_2, S_2, U_2);
6006 
6007       // Partial reduction mod 2**130 - 5
6008       __ cad(U_1, U_1, U_0HI, t1); // Add U_0HI to U_1 with carry output to t1
6009       __ adc(U_2, U_2, U_1HI, t1);
6010       // Sum is now in U_2:U_1:U_0.
6011 
      // U_2:U_1:U_0 += (U_2 >> 2) * 5
6013       poly1305_reduce(U_2, U_1, U_0, t1, t2);
6014 
6015       __ sub(length, length, BLOCK_LENGTH);
6016       __ addi(input_start, input_start, BLOCK_LENGTH);
6017       __ mv(t1, BLOCK_LENGTH);
6018       __ bge(length, t1, LOOP);
6019     }
6020 
6021     // Further reduce modulo 2^130 - 5
6022     poly1305_reduce(U_2, U_1, U_0, t1, t2);
6023 
6024     // Unpack the sum into five 26-bit limbs and write to memory.
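    // Equivalently (illustrative sketch, not the generated code):
    //   limbs[i] = (U >> (26 * i)) & ((1ULL << 26) - 1), i = 0..4,
    // where U is the 130-bit value U_2:U_1:U_0.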
    // The first 26 bits form the first limb
6026     __ slli(t1, U_0, 38); // Take lowest 26 bits
6027     __ srli(t1, t1, 38);
6028     __ sd(t1, Address(acc_start)); // First 26-bit limb
6029 
    // Bits 27-52 of U_0 form the second limb
    __ slli(t1, U_0, 12); // Take the next 26 bits (27-52)
6032     __ srli(t1, t1, 38);
6033     __ sd(t1, Address(acc_start, sizeof (jlong))); // Second 26-bit limb
6034 
    // Combine bits 53-64 of U_0 and bits 1-14 of U_1 in one register
6036     __ srli(t1, U_0, 52);
6037     __ slli(t2, U_1, 50);
6038     __ srli(t2, t2, 38);
6039     __ add(t1, t1, t2);
6040     __ sd(t1, Address(acc_start, 2 * sizeof (jlong))); // Third 26-bit limb
6041 
    // Store bits 15-40 of U_1
6043     __ slli(t1, U_1, 24); // Already used up 14 bits
6044     __ srli(t1, t1, 38); // Clear all other bits from t1
6045     __ sd(t1, Address(acc_start, 3 * sizeof (jlong))); // Fourth 26-bit limb
6046 
    // Store bits 41-64 of U_1 and the low three bits of U_2 in one register
6048     __ srli(t1, U_1, 40);
6049     __ andi(t2, U_2, right_3_bits);
6050     __ slli(t2, t2, 24);
6051     __ add(t1, t1, t2);
6052     __ sd(t1, Address(acc_start, 4 * sizeof (jlong))); // Fifth 26-bit limb
6053 
6054     __ bind(DONE);
6055     __ pop_reg(saved_regs, sp);
6056     __ leave(); // Required for proper stackwalking
6057     __ ret();
6058 
6059     return start;
6060   }
6061 
6062 #endif // COMPILER2
6063 
6064   /**
6065    *  Arguments:
6066    *
6067    * Inputs:
6068    *   c_rarg0   - int crc
6069    *   c_rarg1   - byte* buf
6070    *   c_rarg2   - int length
6071    *
6072    * Output:
6073    *   c_rarg0   - int crc result
6074    */
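  // For reference (illustrative note): this computes the same CRC-32 as
  // java.util.zip.CRC32 (reflected polynomial 0xEDB88320). The actual loop is
  // implemented in MacroAssembler::kernel_crc32 and uses the pre-computed
  // lookup tables installed as StubRoutines::_crc_table_adr (see
  // generate_initial_stubs).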
6075   address generate_updateBytesCRC32() {
6076     assert(UseCRC32Intrinsics, "what are we doing here?");
6077 
6078     __ align(CodeEntryAlignment);
6079     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
6080 
6081     address start = __ pc();
6082 
6083     const Register crc    = c_rarg0;  // crc
6084     const Register buf    = c_rarg1;  // source java byte array address
6085     const Register len    = c_rarg2;  // length
6086     const Register table0 = c_rarg3;  // crc_table address
6087     const Register table1 = c_rarg4;
6088     const Register table2 = c_rarg5;
6089     const Register table3 = c_rarg6;
6090 
6091     const Register tmp1 = c_rarg7;
6092     const Register tmp2 = t2;
6093     const Register tmp3 = x28; // t3
6094     const Register tmp4 = x29; // t4
6095     const Register tmp5 = x30; // t5
6096     const Register tmp6 = x31; // t6
6097 
6098     BLOCK_COMMENT("Entry:");
6099     __ enter(); // required for proper stackwalking of RuntimeStub frame
6100 
6101     __ kernel_crc32(crc, buf, len, table0, table1, table2,
6102                     table3, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
6103 
6104     __ leave(); // required for proper stackwalking of RuntimeStub frame
6105     __ ret();
6106 
6107     return start;
6108   }
6109 
6110   // exception handler for upcall stubs
6111   address generate_upcall_stub_exception_handler() {
6112     StubCodeMark mark(this, "StubRoutines", "upcall stub exception handler");
6113     address start = __ pc();
6114 
6115     // Native caller has no idea how to handle exceptions,
6116     // so we just crash here. Up to callee to catch exceptions.
    __ verify_oop(x10); // the exception oop is returned in x10 (a0)
6118     __ rt_call(CAST_FROM_FN_PTR(address, UpcallLinker::handle_uncaught_exception));
6119     __ should_not_reach_here();
6120 
6121     return start;
6122   }
6123 
6124 #undef __
6125 
6126   // Initialization
6127   void generate_initial_stubs() {
    // Generates the initial stubs and initializes the entry points
6129 
    // Entry points that exist on all platforms. Note: this is code
    // that could be shared among different platforms - however the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure. See also the comment
    // in stubRoutines.hpp.
6135 
6136     StubRoutines::_forward_exception_entry = generate_forward_exception();
6137 
6138     if (UnsafeMemoryAccess::_table == nullptr) {
6139       UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
6140     }
6141 
6142     StubRoutines::_call_stub_entry =
6143       generate_call_stub(StubRoutines::_call_stub_return_address);
6144 
    // Referenced by megamorphic calls
6146     StubRoutines::_catch_exception_entry = generate_catch_exception();
6147 
6148     if (UseCRC32Intrinsics) {
      // Set the table address before generating the stub that uses it
6150       StubRoutines::_crc_table_adr = (address)StubRoutines::riscv::_crc_table;
6151       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
6152     }
6153   }
6154 
6155   void generate_continuation_stubs() {
6156     // Continuation stubs:
6157     StubRoutines::_cont_thaw             = generate_cont_thaw();
6158     StubRoutines::_cont_returnBarrier    = generate_cont_returnBarrier();
6159     StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
6160   }
6161 
6162   void generate_final_stubs() {
6163     // support for verify_oop (must happen after universe_init)
6164     if (VerifyOops) {
6165       StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
6166     }
6167 
6168     // arraycopy stubs used by compilers
6169     generate_arraycopy_stubs();
6170 
6171     BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
6172     if (bs_nm != nullptr) {
6173       StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
6174     }
6175 
6176 #ifdef COMPILER2
6177     if (UseSecondarySupersTable) {
6178       StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
6179       if (!InlineSecondarySupersTest) {
6180         for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
6181           StubRoutines::_lookup_secondary_supers_table_stubs[slot]
6182             = generate_lookup_secondary_supers_table_stub(slot);
6183         }
6184       }
6185     }
6186 #endif // COMPILER2
6187 
6188     StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
6189 
6190     StubRoutines::riscv::set_completed();
6191   }
6192 
6193   void generate_compiler_stubs() {
6194 #ifdef COMPILER2
6195     if (UseMulAddIntrinsic) {
6196       StubRoutines::_mulAdd = generate_mulAdd();
6197     }
6198 
6199     if (UseMultiplyToLenIntrinsic) {
6200       StubRoutines::_multiplyToLen = generate_multiplyToLen();
6201     }
6202 
6203     if (UseSquareToLenIntrinsic) {
6204       StubRoutines::_squareToLen = generate_squareToLen();
6205     }
6206 
6207     if (UseMontgomeryMultiplyIntrinsic) {
6208       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
6209       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
6210       StubRoutines::_montgomeryMultiply = g.generate_multiply();
6211     }
6212 
6213     if (UseMontgomerySquareIntrinsic) {
6214       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
6215       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
6216       StubRoutines::_montgomerySquare = g.generate_square();
6217     }
6218 
6219     if (UsePoly1305Intrinsics) {
6220       StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
6221     }
6222 
6223     if (UseRVVForBigIntegerShiftIntrinsics) {
6224       StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift();
6225       StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
6226     }
6227 
6228     if (UseSHA256Intrinsics) {
6229       Sha2Generator sha2(_masm, this);
6230       StubRoutines::_sha256_implCompress   = sha2.generate_sha256_implCompress(false);
6231       StubRoutines::_sha256_implCompressMB = sha2.generate_sha256_implCompress(true);
6232     }
6233 
6234     if (UseSHA512Intrinsics) {
6235       Sha2Generator sha2(_masm, this);
6236       StubRoutines::_sha512_implCompress   = sha2.generate_sha512_implCompress(false);
6237       StubRoutines::_sha512_implCompressMB = sha2.generate_sha512_implCompress(true);
6238     }
6239 
6240     if (UseMD5Intrinsics) {
6241       StubRoutines::_md5_implCompress   = generate_md5_implCompress(false, "md5_implCompress");
6242       StubRoutines::_md5_implCompressMB = generate_md5_implCompress(true,  "md5_implCompressMB");
6243     }
6244 
6245     if (UseChaCha20Intrinsics) {
6246       StubRoutines::_chacha20Block = generate_chacha20Block();
6247     }
6248 
6249     if (UseSHA1Intrinsics) {
6250       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false, "sha1_implCompress");
6251       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true, "sha1_implCompressMB");
6252     }
6253 
6254     if (UseBASE64Intrinsics) {
6255       StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
6256       StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
6257     }
6258 
6259     if (UseAdler32Intrinsics) {
6260       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
6261     }
6262 
6263     generate_compare_long_strings();
6264 
6265     generate_string_indexof_stubs();
6266 
6267 #endif // COMPILER2
6268   }
6269 
6270  public:
6271   StubGenerator(CodeBuffer* code, StubsKind kind) : StubCodeGenerator(code) {
6272     switch(kind) {
6273     case Initial_stubs:
6274       generate_initial_stubs();
6275       break;
    case Continuation_stubs:
6277       generate_continuation_stubs();
6278       break;
6279     case Compiler_stubs:
6280       generate_compiler_stubs();
6281       break;
6282     case Final_stubs:
6283       generate_final_stubs();
6284       break;
6285     default:
6286       fatal("unexpected stubs kind: %d", kind);
6287       break;
6288     };
6289   }
6290 }; // end class declaration
6291 
6292 void StubGenerator_generate(CodeBuffer* code, StubCodeGenerator::StubsKind kind) {
6293   StubGenerator g(code, kind);
6294 }