1 /*
   2  * Copyright (c) 2003, 2024, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2020, Red Hat Inc. All rights reserved.
   4  * Copyright (c) 2020, 2023, Huawei Technologies Co., Ltd. All rights reserved.
   5  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   6  *
   7  * This code is free software; you can redistribute it and/or modify it
   8  * under the terms of the GNU General Public License version 2 only, as
   9  * published by the Free Software Foundation.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  *
  25  */
  26 
  27 #include "precompiled.hpp"
  28 #include "asm/macroAssembler.hpp"
  29 #include "asm/macroAssembler.inline.hpp"
  30 #include "compiler/oopMap.hpp"
  31 #include "gc/shared/barrierSet.hpp"
  32 #include "gc/shared/barrierSetAssembler.hpp"
  33 #include "interpreter/interpreter.hpp"
  34 #include "memory/universe.hpp"
  35 #include "nativeInst_riscv.hpp"
  36 #include "oops/instanceOop.hpp"
  37 #include "oops/method.hpp"
  38 #include "oops/objArrayKlass.hpp"
  39 #include "oops/oop.inline.hpp"
  40 #include "prims/methodHandles.hpp"
  41 #include "prims/upcallLinker.hpp"
  42 #include "runtime/continuation.hpp"
  43 #include "runtime/continuationEntry.inline.hpp"
  44 #include "runtime/frame.inline.hpp"
  45 #include "runtime/handles.inline.hpp"
  46 #include "runtime/javaThread.hpp"
  47 #include "runtime/sharedRuntime.hpp"
  48 #include "runtime/stubCodeGenerator.hpp"
  49 #include "runtime/stubRoutines.hpp"
  50 #include "utilities/align.hpp"
  51 #include "utilities/powerOfTwo.hpp"
  52 #ifdef COMPILER2
  53 #include "opto/runtime.hpp"
  54 #endif
  55 
  56 // Declaration and definition of StubGenerator (no .hpp file).
  57 // For a more detailed description of the stub routine structure
  58 // see the comment in stubRoutines.hpp
  59 
  60 #undef __
  61 #define __ _masm->
  62 
  63 #ifdef PRODUCT
  64 #define BLOCK_COMMENT(str) /* nothing */
  65 #else
  66 #define BLOCK_COMMENT(str) __ block_comment(str)
  67 #endif
  68 
  69 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  70 
  71 // Stub Code definitions
  72 
  73 class StubGenerator: public StubCodeGenerator {
  74  private:
  75 
  76 #ifdef PRODUCT
  77 #define inc_counter_np(counter) ((void)0)
  78 #else
  79   void inc_counter_np_(uint& counter) {
  80     __ incrementw(ExternalAddress((address)&counter));
  81   }
  82 #define inc_counter_np(counter) \
  83   BLOCK_COMMENT("inc_counter " #counter); \
  84   inc_counter_np_(counter);
  85 #endif
  86 
  87   // Call stubs are used to call Java from C
  88   //
  89   // Arguments:
  90   //    c_rarg0:   call wrapper address                   address
  91   //    c_rarg1:   result                                 address
  92   //    c_rarg2:   result type                            BasicType
  93   //    c_rarg3:   method                                 Method*
  94   //    c_rarg4:   (interpreter) entry point              address
  95   //    c_rarg5:   parameters                             intptr_t*
  96   //    c_rarg6:   parameter size (in words)              int
  97   //    c_rarg7:   thread                                 Thread*
  98   //
  99   // There is no return from the stub itself as any Java result
 100   // is written to result
 101   //
 102   // we save x1 (ra) as the return PC at the base of the frame and
  // link x8 (fp) below it as the frame pointer, installing sp (x2)
 104   // into fp.
 105   //
  // we save x10-x17, which accounts for all the C arguments.
 107   //
 108   // TODO: strictly do we need to save them all? they are treated as
 109   // volatile by C so could we omit saving the ones we are going to
 110   // place in global registers (thread? method?) or those we only use
 111   // during setup of the Java call?
 112   //
 113   // we don't need to save x5 which C uses as an indirect result location
 114   // return register.
 115   //
 116   // we don't need to save x6-x7 and x28-x31 which both C and Java treat as
 117   // volatile
 118   //
 119   // we save x9, x18-x27, f8-f9, and f18-f27 which Java uses as temporary
 120   // registers and C expects to be callee-save
 121   //
 122   // so the stub frame looks like this when we enter Java code
 123   //
 124   //     [ return_from_Java     ] <--- sp
 125   //     [ argument word n      ]
 126   //      ...
 127   // -35 [ argument word 1      ]
 128   // -34 [ saved FRM in Floating-point Control and Status Register ] <--- sp_after_call
 129   // -33 [ saved f27            ]
 130   // -32 [ saved f26            ]
 131   // -31 [ saved f25            ]
 132   // -30 [ saved f24            ]
 133   // -29 [ saved f23            ]
 134   // -28 [ saved f22            ]
 135   // -27 [ saved f21            ]
 136   // -26 [ saved f20            ]
 137   // -25 [ saved f19            ]
 138   // -24 [ saved f18            ]
 139   // -23 [ saved f9             ]
 140   // -22 [ saved f8             ]
 141   // -21 [ saved x27            ]
 142   // -20 [ saved x26            ]
 143   // -19 [ saved x25            ]
 144   // -18 [ saved x24            ]
 145   // -17 [ saved x23            ]
 146   // -16 [ saved x22            ]
 147   // -15 [ saved x21            ]
 148   // -14 [ saved x20            ]
 149   // -13 [ saved x19            ]
 150   // -12 [ saved x18            ]
 151   // -11 [ saved x9             ]
 152   // -10 [ call wrapper   (x10) ]
 153   //  -9 [ result         (x11) ]
 154   //  -8 [ result type    (x12) ]
 155   //  -7 [ method         (x13) ]
 156   //  -6 [ entry point    (x14) ]
 157   //  -5 [ parameters     (x15) ]
 158   //  -4 [ parameter size (x16) ]
 159   //  -3 [ thread         (x17) ]
 160   //  -2 [ saved fp       (x8)  ]
 161   //  -1 [ saved ra       (x1)  ]
 162   //   0 [                      ] <--- fp == saved sp (x2)
 163 
 164   // Call stub stack layout word offsets from fp
 165   enum call_stub_layout {
 166     sp_after_call_off  = -34,
 167 
 168     frm_off            = sp_after_call_off,
 169     f27_off            = -33,
 170     f26_off            = -32,
 171     f25_off            = -31,
 172     f24_off            = -30,
 173     f23_off            = -29,
 174     f22_off            = -28,
 175     f21_off            = -27,
 176     f20_off            = -26,
 177     f19_off            = -25,
 178     f18_off            = -24,
 179     f9_off             = -23,
 180     f8_off             = -22,
 181 
 182     x27_off            = -21,
 183     x26_off            = -20,
 184     x25_off            = -19,
 185     x24_off            = -18,
 186     x23_off            = -17,
 187     x22_off            = -16,
 188     x21_off            = -15,
 189     x20_off            = -14,
 190     x19_off            = -13,
 191     x18_off            = -12,
 192     x9_off             = -11,
 193 
 194     call_wrapper_off   = -10,
 195     result_off         = -9,
 196     result_type_off    = -8,
 197     method_off         = -7,
 198     entry_point_off    = -6,
 199     parameters_off     = -5,
 200     parameter_size_off = -4,
 201     thread_off         = -3,
 202     fp_f               = -2,
 203     retaddr_off        = -1,
 204   };
 205 
 206   address generate_call_stub(address& return_address) {
 207     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 208            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 209            "adjust this code");
 210 
 211     StubCodeMark mark(this, "StubRoutines", "call_stub");
 212     address start = __ pc();
 213 
 214     const Address sp_after_call (fp, sp_after_call_off  * wordSize);
 215 
    const Address frm_save      (fp, frm_off            * wordSize);
 217     const Address call_wrapper  (fp, call_wrapper_off   * wordSize);
 218     const Address result        (fp, result_off         * wordSize);
 219     const Address result_type   (fp, result_type_off    * wordSize);
 220     const Address method        (fp, method_off         * wordSize);
 221     const Address entry_point   (fp, entry_point_off    * wordSize);
 222     const Address parameters    (fp, parameters_off     * wordSize);
 223     const Address parameter_size(fp, parameter_size_off * wordSize);
 224 
 225     const Address thread        (fp, thread_off         * wordSize);
 226 
 227     const Address f27_save      (fp, f27_off            * wordSize);
 228     const Address f26_save      (fp, f26_off            * wordSize);
 229     const Address f25_save      (fp, f25_off            * wordSize);
 230     const Address f24_save      (fp, f24_off            * wordSize);
 231     const Address f23_save      (fp, f23_off            * wordSize);
 232     const Address f22_save      (fp, f22_off            * wordSize);
 233     const Address f21_save      (fp, f21_off            * wordSize);
 234     const Address f20_save      (fp, f20_off            * wordSize);
 235     const Address f19_save      (fp, f19_off            * wordSize);
 236     const Address f18_save      (fp, f18_off            * wordSize);
 237     const Address f9_save       (fp, f9_off             * wordSize);
 238     const Address f8_save       (fp, f8_off             * wordSize);
 239 
 240     const Address x27_save      (fp, x27_off            * wordSize);
 241     const Address x26_save      (fp, x26_off            * wordSize);
 242     const Address x25_save      (fp, x25_off            * wordSize);
 243     const Address x24_save      (fp, x24_off            * wordSize);
 244     const Address x23_save      (fp, x23_off            * wordSize);
 245     const Address x22_save      (fp, x22_off            * wordSize);
 246     const Address x21_save      (fp, x21_off            * wordSize);
 247     const Address x20_save      (fp, x20_off            * wordSize);
 248     const Address x19_save      (fp, x19_off            * wordSize);
 249     const Address x18_save      (fp, x18_off            * wordSize);
 250 
 251     const Address x9_save       (fp, x9_off             * wordSize);
 252 
 253     // stub code
 254 
 255     address riscv_entry = __ pc();
 256 
 257     // set up frame and move sp to end of save area
 258     __ enter();
 259     __ addi(sp, fp, sp_after_call_off * wordSize);
 260 
 261     // save register parameters and Java temporary/global registers
 262     // n.b. we save thread even though it gets installed in
 263     // xthread because we want to sanity check tp later
 264     __ sd(c_rarg7, thread);
 265     __ sw(c_rarg6, parameter_size);
 266     __ sd(c_rarg5, parameters);
 267     __ sd(c_rarg4, entry_point);
 268     __ sd(c_rarg3, method);
 269     __ sd(c_rarg2, result_type);
 270     __ sd(c_rarg1, result);
 271     __ sd(c_rarg0, call_wrapper);
 272 
 273     __ sd(x9, x9_save);
 274 
 275     __ sd(x18, x18_save);
 276     __ sd(x19, x19_save);
 277     __ sd(x20, x20_save);
 278     __ sd(x21, x21_save);
 279     __ sd(x22, x22_save);
 280     __ sd(x23, x23_save);
 281     __ sd(x24, x24_save);
 282     __ sd(x25, x25_save);
 283     __ sd(x26, x26_save);
 284     __ sd(x27, x27_save);
 285 
 286     __ fsd(f8,  f8_save);
 287     __ fsd(f9,  f9_save);
 288     __ fsd(f18, f18_save);
 289     __ fsd(f19, f19_save);
 290     __ fsd(f20, f20_save);
 291     __ fsd(f21, f21_save);
 292     __ fsd(f22, f22_save);
 293     __ fsd(f23, f23_save);
 294     __ fsd(f24, f24_save);
 295     __ fsd(f25, f25_save);
 296     __ fsd(f26, f26_save);
 297     __ fsd(f27, f27_save);
 298 
 299     __ frrm(t0);
 300     __ sd(t0, frm_save);
 301     // Set frm to the state we need. We do want Round to Nearest. We
 302     // don't want non-IEEE rounding modes.
 303     Label skip_fsrmi;
 304     guarantee(__ RoundingMode::rne == 0, "must be");
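    // frm == 0 already encodes round-to-nearest-even (see the guarantee
    // above), so skip the write in that case.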
 305     __ beqz(t0, skip_fsrmi);
 306     __ fsrmi(__ RoundingMode::rne);
 307     __ bind(skip_fsrmi);
 308 
 309     // install Java thread in global register now we have saved
 310     // whatever value it held
 311     __ mv(xthread, c_rarg7);
 312 
 313     // And method
 314     __ mv(xmethod, c_rarg3);
 315 
 316     // set up the heapbase register
 317     __ reinit_heapbase();
 318 
 319 #ifdef ASSERT
 320     // make sure we have no pending exceptions
 321     {
 322       Label L;
 323       __ ld(t0, Address(xthread, in_bytes(Thread::pending_exception_offset())));
 324       __ beqz(t0, L);
 325       __ stop("StubRoutines::call_stub: entered with pending exception");
 326       __ BIND(L);
 327     }
 328 #endif
 329     // pass parameters if any
 330     __ mv(esp, sp);
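    // Move sp below the space the outgoing Java parameters will occupy,
    // rounding it down so sp stays 16-byte aligned.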
 331     __ slli(t0, c_rarg6, LogBytesPerWord);
 332     __ sub(t0, sp, t0); // Move SP out of the way
 333     __ andi(sp, t0, -2 * wordSize);
 334 
 335     BLOCK_COMMENT("pass parameters if any");
 336     Label parameters_done;
 337     // parameter count is still in c_rarg6
 338     // and parameter pointer identifying param 1 is in c_rarg5
 339     __ beqz(c_rarg6, parameters_done);
 340 
 341     address loop = __ pc();
 342     __ ld(t0, Address(c_rarg5, 0));
 343     __ addi(c_rarg5, c_rarg5, wordSize);
 344     __ addi(c_rarg6, c_rarg6, -1);
 345     __ push_reg(t0);
 346     __ bgtz(c_rarg6, loop);
 347 
 348     __ BIND(parameters_done);
 349 
    // call Java entry -- passing Method* and current sp
 351     //      xmethod: Method*
 352     //      x19_sender_sp: sender sp
 353     BLOCK_COMMENT("call Java function");
 354     __ mv(x19_sender_sp, sp);
 355     __ jalr(c_rarg4);
 356 
 357     // save current address for use by exception handling code
 358 
 359     return_address = __ pc();
 360 
 361     // store result depending on type (everything that is not
 362     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 363     // n.b. this assumes Java returns an integral result in x10
 364     // and a floating result in j_farg0
 365     __ ld(j_rarg2, result);
 366     Label is_long, is_float, is_double, exit;
 367     __ ld(j_rarg1, result_type);
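    // a T_OBJECT result comes back in x10 as a full 64-bit oop, so it
    // shares the T_LONG store path below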
 368     __ mv(t0, (u1)T_OBJECT);
 369     __ beq(j_rarg1, t0, is_long);
 370     __ mv(t0, (u1)T_LONG);
 371     __ beq(j_rarg1, t0, is_long);
 372     __ mv(t0, (u1)T_FLOAT);
 373     __ beq(j_rarg1, t0, is_float);
 374     __ mv(t0, (u1)T_DOUBLE);
 375     __ beq(j_rarg1, t0, is_double);
 376 
 377     // handle T_INT case
 378     __ sw(x10, Address(j_rarg2));
 379 
 380     __ BIND(exit);
 381 
 382     // pop parameters
 383     __ addi(esp, fp, sp_after_call_off * wordSize);
 384 
 385 #ifdef ASSERT
 386     // verify that threads correspond
 387     {
 388       Label L, S;
 389       __ ld(t0, thread);
 390       __ bne(xthread, t0, S);
 391       __ get_thread(t0);
 392       __ beq(xthread, t0, L);
 393       __ BIND(S);
 394       __ stop("StubRoutines::call_stub: threads must correspond");
 395       __ BIND(L);
 396     }
 397 #endif
 398 
 399     __ pop_cont_fastpath(xthread);
 400 
 401     // restore callee-save registers
 402     __ fld(f27, f27_save);
 403     __ fld(f26, f26_save);
 404     __ fld(f25, f25_save);
 405     __ fld(f24, f24_save);
 406     __ fld(f23, f23_save);
 407     __ fld(f22, f22_save);
 408     __ fld(f21, f21_save);
 409     __ fld(f20, f20_save);
 410     __ fld(f19, f19_save);
 411     __ fld(f18, f18_save);
 412     __ fld(f9,  f9_save);
 413     __ fld(f8,  f8_save);
 414 
 415     __ ld(x27, x27_save);
 416     __ ld(x26, x26_save);
 417     __ ld(x25, x25_save);
 418     __ ld(x24, x24_save);
 419     __ ld(x23, x23_save);
 420     __ ld(x22, x22_save);
 421     __ ld(x21, x21_save);
 422     __ ld(x20, x20_save);
 423     __ ld(x19, x19_save);
 424     __ ld(x18, x18_save);
 425 
 426     __ ld(x9, x9_save);
 427 
 428     // restore frm
 429     Label skip_fsrm;
 430     __ ld(t0, frm_save);
 431     __ frrm(t1);
 432     __ beq(t0, t1, skip_fsrm);
 433     __ fsrm(t0);
 434     __ bind(skip_fsrm);
 435 
 436     __ ld(c_rarg0, call_wrapper);
 437     __ ld(c_rarg1, result);
 438     __ ld(c_rarg2, result_type);
 439     __ ld(c_rarg3, method);
 440     __ ld(c_rarg4, entry_point);
 441     __ ld(c_rarg5, parameters);
 442     __ ld(c_rarg6, parameter_size);
 443     __ ld(c_rarg7, thread);
 444 
 445     // leave frame and return to caller
 446     __ leave();
 447     __ ret();
 448 
 449     // handle return types different from T_INT
 450 
 451     __ BIND(is_long);
 452     __ sd(x10, Address(j_rarg2, 0));
 453     __ j(exit);
 454 
 455     __ BIND(is_float);
 456     __ fsw(j_farg0, Address(j_rarg2, 0), t0);
 457     __ j(exit);
 458 
 459     __ BIND(is_double);
 460     __ fsd(j_farg0, Address(j_rarg2, 0), t0);
 461     __ j(exit);
 462 
 463     return start;
 464   }
 465 
 466   // Return point for a Java call if there's an exception thrown in
 467   // Java code.  The exception is caught and transformed into a
 468   // pending exception stored in JavaThread that can be tested from
 469   // within the VM.
 470   //
 471   // Note: Usually the parameters are removed by the callee. In case
 472   // of an exception crossing an activation frame boundary, that is
 473   // not the case if the callee is compiled code => need to setup the
 474   // sp.
 475   //
 476   // x10: exception oop
 477 
 478   address generate_catch_exception() {
 479     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 480     address start = __ pc();
 481 
 482     // same as in generate_call_stub():
 483     const Address thread(fp, thread_off * wordSize);
 484 
 485 #ifdef ASSERT
 486     // verify that threads correspond
 487     {
 488       Label L, S;
 489       __ ld(t0, thread);
 490       __ bne(xthread, t0, S);
 491       __ get_thread(t0);
 492       __ beq(xthread, t0, L);
 493       __ bind(S);
 494       __ stop("StubRoutines::catch_exception: threads must correspond");
 495       __ bind(L);
 496     }
 497 #endif
 498 
 499     // set pending exception
 500     __ verify_oop(x10);
 501 
 502     __ sd(x10, Address(xthread, Thread::pending_exception_offset()));
 503     __ mv(t0, (address)__FILE__);
 504     __ sd(t0, Address(xthread, Thread::exception_file_offset()));
 505     __ mv(t0, (int)__LINE__);
 506     __ sw(t0, Address(xthread, Thread::exception_line_offset()));
 507 
 508     // complete return to VM
 509     assert(StubRoutines::_call_stub_return_address != nullptr,
 510            "_call_stub_return_address must have been generated before");
 511     __ j(StubRoutines::_call_stub_return_address);
 512 
 513     return start;
 514   }
 515 
 516   // Continuation point for runtime calls returning with a pending
 517   // exception.  The pending exception check happened in the runtime
 518   // or native call stub.  The pending exception in Thread is
 519   // converted into a Java-level exception.
 520   //
 521   // Contract with Java-level exception handlers:
 522   // x10: exception
 523   // x13: throwing pc
 524   //
 525   // NOTE: At entry of this stub, exception-pc must be in RA !!
 526 
 527   // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no prolog
 529 
 530   address generate_forward_exception() {
 531     StubCodeMark mark(this, "StubRoutines", "forward exception");
 532     address start = __ pc();
 533 
 534     // Upon entry, RA points to the return address returning into
 535     // Java (interpreted or compiled) code; i.e., the return address
 536     // becomes the throwing pc.
 537     //
 538     // Arguments pushed before the runtime call are still on the stack
 539     // but the exception handler will reset the stack pointer ->
 540     // ignore them.  A potential result in registers can be ignored as
 541     // well.
 542 
 543 #ifdef ASSERT
 544     // make sure this code is only executed if there is a pending exception
 545     {
 546       Label L;
 547       __ ld(t0, Address(xthread, Thread::pending_exception_offset()));
 548       __ bnez(t0, L);
 549       __ stop("StubRoutines::forward exception: no pending exception (1)");
 550       __ bind(L);
 551     }
 552 #endif
 553 
 554     // compute exception handler into x9
 555 
 556     // call the VM to find the handler address associated with the
 557     // caller address. pass thread in x10 and caller pc (ret address)
 558     // in x11. n.b. the caller pc is in ra, unlike x86 where it is on
 559     // the stack.
 560     __ mv(c_rarg1, ra);
 561     // ra will be trashed by the VM call so we move it to x9
 562     // (callee-saved) because we also need to pass it to the handler
 563     // returned by this call.
 564     __ mv(x9, ra);
 565     BLOCK_COMMENT("call exception_handler_for_return_address");
 566     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 567                          SharedRuntime::exception_handler_for_return_address),
 568                     xthread, c_rarg1);
 569     // we should not really care that ra is no longer the callee
 570     // address. we saved the value the handler needs in x9 so we can
 571     // just copy it to x13. however, the C2 handler will push its own
    // frame and then call into the VM, and the VM code asserts that
 573     // the PC for the frame above the handler belongs to a compiled
 574     // Java method. So, we restore ra here to satisfy that assert.
 575     __ mv(ra, x9);
 576     // setup x10 & x13 & clear pending exception
 577     __ mv(x13, x9);
 578     __ mv(x9, x10);
 579     __ ld(x10, Address(xthread, Thread::pending_exception_offset()));
 580     __ sd(zr, Address(xthread, Thread::pending_exception_offset()));
 581 
 582 #ifdef ASSERT
 583     // make sure exception is set
 584     {
 585       Label L;
 586       __ bnez(x10, L);
 587       __ stop("StubRoutines::forward exception: no pending exception (2)");
 588       __ bind(L);
 589     }
 590 #endif
 591 
 592     // continue at exception handler
 593     // x10: exception
 594     // x13: throwing pc
 595     // x9: exception handler
 596     __ verify_oop(x10);
 597     __ jr(x9);
 598 
 599     return start;
 600   }
 601 
 602   // Non-destructive plausibility checks for oops
 603   //
 604   // Arguments:
 605   //    x10: oop to verify
 606   //    t0: error message
 607   //
 608   // Stack after saving c_rarg3:
 609   //    [tos + 0]: saved c_rarg3
 610   //    [tos + 1]: saved c_rarg2
 611   //    [tos + 2]: saved ra
 612   //    [tos + 3]: saved t1
 613   //    [tos + 4]: saved x10
 614   //    [tos + 5]: saved t0
 615   address generate_verify_oop() {
 616 
 617     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 618     address start = __ pc();
 619 
 620     Label exit, error;
 621 
 622     __ push_reg(RegSet::of(c_rarg2, c_rarg3), sp); // save c_rarg2 and c_rarg3
 623 
 624     __ la(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 625     __ ld(c_rarg3, Address(c_rarg2));
 626     __ add(c_rarg3, c_rarg3, 1);
 627     __ sd(c_rarg3, Address(c_rarg2));
 628 
 629     // object is in x10
 630     // make sure object is 'reasonable'
 631     __ beqz(x10, exit); // if obj is null it is OK
 632 
 633     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
 634     bs_asm->check_oop(_masm, x10, c_rarg2, c_rarg3, error);
 635 
 636     // return if everything seems ok
 637     __ bind(exit);
 638 
 639     __ pop_reg(RegSet::of(c_rarg2, c_rarg3), sp);  // pop c_rarg2 and c_rarg3
 640     __ ret();
 641 
 642     // handle errors
 643     __ bind(error);
 644     __ pop_reg(RegSet::of(c_rarg2, c_rarg3), sp); // pop c_rarg2 and c_rarg3
 645 
 646     __ push_reg(RegSet::range(x0, x31), sp);
 647     // debug(char* msg, int64_t pc, int64_t regs[])
 648     __ mv(c_rarg0, t0);             // pass address of error message
 649     __ mv(c_rarg1, ra);             // pass return address
 650     __ mv(c_rarg2, sp);             // pass address of regs on stack
 651 #ifndef PRODUCT
 652     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 653 #endif
 654     BLOCK_COMMENT("call MacroAssembler::debug");
 655     __ rt_call(CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 656     __ ebreak();
 657 
 658     return start;
 659   }
 660 
 661   // The inner part of zero_words().
 662   //
 663   // Inputs:
 664   // x28: the HeapWord-aligned base address of an array to zero.
 665   // x29: the count in HeapWords, x29 > 0.
 666   //
 667   // Returns x28 and x29, adjusted for the caller to clear.
 668   // x28: the base address of the tail of words left to clear.
 669   // x29: the number of words in the tail.
 670   //      x29 < MacroAssembler::zero_words_block_size.
 671 
 672   address generate_zero_blocks() {
 673     Label done;
 674 
 675     const Register base = x28, cnt = x29, tmp1 = x30, tmp2 = x31;
 676 
 677     __ align(CodeEntryAlignment);
 678     StubCodeMark mark(this, "StubRoutines", "zero_blocks");
 679     address start = __ pc();
 680 
 681     if (UseBlockZeroing) {
 682       // Ensure count >= 2*CacheLineSize so that it still deserves a cbo.zero
 683       // after alignment.
 684       Label small;
 685       int low_limit = MAX2(2 * CacheLineSize, BlockZeroingLowLimit) / wordSize;
 686       __ mv(tmp1, low_limit);
 687       __ blt(cnt, tmp1, small);
 688       __ zero_dcache_blocks(base, cnt, tmp1, tmp2);
 689       __ bind(small);
 690     }
 691 
 692     {
 693       // Clear the remaining blocks.
 694       Label loop;
 695       __ mv(tmp1, MacroAssembler::zero_words_block_size);
 696       __ blt(cnt, tmp1, done);
 697       __ bind(loop);
 698       for (int i = 0; i < MacroAssembler::zero_words_block_size; i++) {
 699         __ sd(zr, Address(base, i * wordSize));
 700       }
 701       __ add(base, base, MacroAssembler::zero_words_block_size * wordSize);
 702       __ sub(cnt, cnt, MacroAssembler::zero_words_block_size);
 703       __ bge(cnt, tmp1, loop);
 704       __ bind(done);
 705     }
 706 
 707     __ ret();
 708 
 709     return start;
 710   }
 711 
 712   typedef enum {
 713     copy_forwards = 1,
 714     copy_backwards = -1
 715   } copy_direction;
 716 
 717   // Bulk copy of blocks of 8 words.
 718   //
 719   // count is a count of words.
 720   //
 721   // Precondition: count >= 8
 722   //
 723   // Postconditions:
 724   //
 725   // The least significant bit of count contains the remaining count
 726   // of words to copy.  The rest of count is trash.
 727   //
 728   // s and d are adjusted to point to the remaining words to copy
 729   //
 730   void generate_copy_longs(Label &start, Register s, Register d, Register count,
 731                            copy_direction direction) {
 732     int unit = wordSize * direction;
 733     int bias = wordSize;
 734 
 735     const Register tmp_reg0 = x13, tmp_reg1 = x14, tmp_reg2 = x15, tmp_reg3 = x16,
 736       tmp_reg4 = x17, tmp_reg5 = x7, tmp_reg6 = x28, tmp_reg7 = x29;
 737 
 738     const Register stride = x30;
 739 
 740     assert_different_registers(t0, tmp_reg0, tmp_reg1, tmp_reg2, tmp_reg3,
 741       tmp_reg4, tmp_reg5, tmp_reg6, tmp_reg7);
 742     assert_different_registers(s, d, count, t0);
 743 
 744     Label again, drain;
 745     const char* stub_name = nullptr;
 746     if (direction == copy_forwards) {
 747       stub_name = "forward_copy_longs";
 748     } else {
 749       stub_name = "backward_copy_longs";
 750     }
 751     StubCodeMark mark(this, "StubRoutines", stub_name);
 752     __ align(CodeEntryAlignment);
 753     __ bind(start);
 754 
 755     if (direction == copy_forwards) {
 756       __ sub(s, s, bias);
 757       __ sub(d, d, bias);
 758     }
 759 
 760 #ifdef ASSERT
 761     // Make sure we are never given < 8 words
 762     {
 763       Label L;
 764 
 765       __ mv(t0, 8);
 766       __ bge(count, t0, L);
      __ stop("generate_copy_longs called with < 8 words");
 768       __ bind(L);
 769     }
 770 #endif
 771 
 772     __ ld(tmp_reg0, Address(s, 1 * unit));
 773     __ ld(tmp_reg1, Address(s, 2 * unit));
 774     __ ld(tmp_reg2, Address(s, 3 * unit));
 775     __ ld(tmp_reg3, Address(s, 4 * unit));
 776     __ ld(tmp_reg4, Address(s, 5 * unit));
 777     __ ld(tmp_reg5, Address(s, 6 * unit));
 778     __ ld(tmp_reg6, Address(s, 7 * unit));
 779     __ ld(tmp_reg7, Address(s, 8 * unit));
 780     __ addi(s, s, 8 * unit);
 781 
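    // One block of 8 words is already in flight (software pipelining).
    // Reserve another 8 up front: if fewer than 16 words were left, the
    // main loop cannot run even once, so branch straight to the drain code.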
 782     __ sub(count, count, 16);
 783     __ bltz(count, drain);
 784 
 785     __ bind(again);
 786 
 787     __ sd(tmp_reg0, Address(d, 1 * unit));
 788     __ sd(tmp_reg1, Address(d, 2 * unit));
 789     __ sd(tmp_reg2, Address(d, 3 * unit));
 790     __ sd(tmp_reg3, Address(d, 4 * unit));
 791     __ sd(tmp_reg4, Address(d, 5 * unit));
 792     __ sd(tmp_reg5, Address(d, 6 * unit));
 793     __ sd(tmp_reg6, Address(d, 7 * unit));
 794     __ sd(tmp_reg7, Address(d, 8 * unit));
 795 
 796     __ ld(tmp_reg0, Address(s, 1 * unit));
 797     __ ld(tmp_reg1, Address(s, 2 * unit));
 798     __ ld(tmp_reg2, Address(s, 3 * unit));
 799     __ ld(tmp_reg3, Address(s, 4 * unit));
 800     __ ld(tmp_reg4, Address(s, 5 * unit));
 801     __ ld(tmp_reg5, Address(s, 6 * unit));
 802     __ ld(tmp_reg6, Address(s, 7 * unit));
 803     __ ld(tmp_reg7, Address(s, 8 * unit));
 804 
 805     __ addi(s, s, 8 * unit);
 806     __ addi(d, d, 8 * unit);
 807 
 808     __ sub(count, count, 8);
 809     __ bgez(count, again);
 810 
 811     // Drain
 812     __ bind(drain);
 813 
 814     __ sd(tmp_reg0, Address(d, 1 * unit));
 815     __ sd(tmp_reg1, Address(d, 2 * unit));
 816     __ sd(tmp_reg2, Address(d, 3 * unit));
 817     __ sd(tmp_reg3, Address(d, 4 * unit));
 818     __ sd(tmp_reg4, Address(d, 5 * unit));
 819     __ sd(tmp_reg5, Address(d, 6 * unit));
 820     __ sd(tmp_reg6, Address(d, 7 * unit));
 821     __ sd(tmp_reg7, Address(d, 8 * unit));
 822     __ addi(d, d, 8 * unit);
 823 
 824     {
 825       Label L1, L2;
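      // The low three bits of count give the 0..7 words still to copy:
      // bit 2 selects a 4-word copy, bit 1 a 2-word copy, and bit 0 (one
      // final word, if set) is left to the caller as described above.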
 826       __ test_bit(t0, count, 2);
 827       __ beqz(t0, L1);
 828 
 829       __ ld(tmp_reg0, Address(s, 1 * unit));
 830       __ ld(tmp_reg1, Address(s, 2 * unit));
 831       __ ld(tmp_reg2, Address(s, 3 * unit));
 832       __ ld(tmp_reg3, Address(s, 4 * unit));
 833       __ addi(s, s, 4 * unit);
 834 
 835       __ sd(tmp_reg0, Address(d, 1 * unit));
 836       __ sd(tmp_reg1, Address(d, 2 * unit));
 837       __ sd(tmp_reg2, Address(d, 3 * unit));
 838       __ sd(tmp_reg3, Address(d, 4 * unit));
 839       __ addi(d, d, 4 * unit);
 840 
 841       __ bind(L1);
 842 
 843       if (direction == copy_forwards) {
 844         __ addi(s, s, bias);
 845         __ addi(d, d, bias);
 846       }
 847 
 848       __ test_bit(t0, count, 1);
 849       __ beqz(t0, L2);
 850       if (direction == copy_backwards) {
 851         __ addi(s, s, 2 * unit);
 852         __ ld(tmp_reg0, Address(s));
 853         __ ld(tmp_reg1, Address(s, wordSize));
 854         __ addi(d, d, 2 * unit);
 855         __ sd(tmp_reg0, Address(d));
 856         __ sd(tmp_reg1, Address(d, wordSize));
 857       } else {
 858         __ ld(tmp_reg0, Address(s));
 859         __ ld(tmp_reg1, Address(s, wordSize));
 860         __ addi(s, s, 2 * unit);
 861         __ sd(tmp_reg0, Address(d));
 862         __ sd(tmp_reg1, Address(d, wordSize));
 863         __ addi(d, d, 2 * unit);
 864       }
 865       __ bind(L2);
 866     }
 867 
 868     __ ret();
 869   }
 870 
 871   Label copy_f, copy_b;
 872 
 873   typedef void (MacroAssembler::*copy_insn)(Register Rd, const Address &adr, Register temp);
 874 
 875   void copy_memory_v(Register s, Register d, Register count, int step) {
 876     bool is_backward = step < 0;
 877     int granularity = uabs(step);
 878 
 879     const Register src = x30, dst = x31, vl = x14, cnt = x15, tmp1 = x16, tmp2 = x17;
 880     assert_different_registers(s, d, cnt, vl, tmp1, tmp2);
 881     Assembler::SEW sew = Assembler::elembytes_to_sew(granularity);
 882     Label loop_forward, loop_backward, done;
 883 
 884     __ mv(dst, d);
 885     __ mv(src, s);
 886     __ mv(cnt, count);
 887 
 888     __ bind(loop_forward);
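    // vsetvli returns in vl the number of elements this pass will handle:
    // min(cnt, VLMAX for this SEW at LMUL = 8).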
 889     __ vsetvli(vl, cnt, sew, Assembler::m8);
 890     if (is_backward) {
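      // If the whole remaining copy fits in one vector pass (vl == cnt),
      // all loads happen before any store, so overlap cannot matter and the
      // forward path is safe; otherwise copy from the tail.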
 891       __ bne(vl, cnt, loop_backward);
 892     }
 893 
 894     __ vlex_v(v0, src, sew);
 895     __ sub(cnt, cnt, vl);
 896     if (sew != Assembler::e8) {
 897       // when sew == e8 (e.g., elem size is 1 byte), slli R, R, 0 is a nop and unnecessary
 898       __ slli(vl, vl, sew);
 899     }
 900     __ add(src, src, vl);
 901 
 902     __ vsex_v(v0, dst, sew);
 903     __ add(dst, dst, vl);
 904     __ bnez(cnt, loop_forward);
 905 
 906     if (is_backward) {
 907       __ j(done);
 908 
 909       __ bind(loop_backward);
 910       __ sub(t0, cnt, vl);
 911       if (sew != Assembler::e8) {
 912         // when sew == e8 (e.g., elem size is 1 byte), slli R, R, 0 is a nop and unnecessary
 913         __ slli(t0, t0, sew);
 914       }
 915       __ add(tmp1, s, t0);
 916       __ vlex_v(v0, tmp1, sew);
 917       __ add(tmp2, d, t0);
 918       __ vsex_v(v0, tmp2, sew);
 919       __ sub(cnt, cnt, vl);
 920       __ bnez(cnt, loop_forward);
 921       __ bind(done);
 922     }
 923   }
 924 
 925   // All-singing all-dancing memory copy.
 926   //
 927   // Copy count units of memory from s to d.  The size of a unit is
 928   // step, which can be positive or negative depending on the direction
 929   // of copy.
 930   //
 931   void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
 932                    Register s, Register d, Register count, int step) {
 933     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
 934     if (UseRVV && (!is_reference_type(type) || bs_asm->supports_rvv_arraycopy())) {
 935       return copy_memory_v(s, d, count, step);
 936     }
 937 
 938     bool is_backwards = step < 0;
 939     int granularity = uabs(step);
 940 
 941     const Register src = x30, dst = x31, cnt = x15, tmp3 = x16, tmp4 = x17, tmp5 = x14, tmp6 = x13;
 942     const Register gct1 = x28, gct2 = x29, gct3 = t2;
 943 
 944     Label same_aligned;
 945     Label copy_big, copy32_loop, copy8_loop, copy_small, done;
 946 
 947     // The size of copy32_loop body increases significantly with ZGC GC barriers.
 948     // Need conditional far branches to reach a point beyond the loop in this case.
 949     bool is_far = UseZGC && ZGenerational;
 950 
 951     __ beqz(count, done, is_far);
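    // work in bytes from here on: cnt = count << log2(element size)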
 952     __ slli(cnt, count, exact_log2(granularity));
 953     if (is_backwards) {
 954       __ add(src, s, cnt);
 955       __ add(dst, d, cnt);
 956     } else {
 957       __ mv(src, s);
 958       __ mv(dst, d);
 959     }
 960 
 961     if (is_aligned) {
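      // Both pointers are already 8-byte aligned: pick the widest loop the
      // byte count allows (32-byte blocks, then single words, then the tail).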
 962       __ addi(t0, cnt, -32);
 963       __ bgez(t0, copy32_loop);
 964       __ addi(t0, cnt, -8);
 965       __ bgez(t0, copy8_loop, is_far);
 966       __ j(copy_small);
 967     } else {
 968       __ mv(t0, 16);
 969       __ blt(cnt, t0, copy_small, is_far);
 970 
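      // src and dst can only reach a common 8-byte alignment if they are
      // congruent modulo 8; otherwise copy element by element.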
 971       __ xorr(t0, src, dst);
 972       __ andi(t0, t0, 0b111);
 973       __ bnez(t0, copy_small, is_far);
 974 
 975       __ bind(same_aligned);
 976       __ andi(t0, src, 0b111);
 977       __ beqz(t0, copy_big);
 978       if (is_backwards) {
 979         __ addi(src, src, step);
 980         __ addi(dst, dst, step);
 981       }
 982       bs_asm->copy_load_at(_masm, decorators, type, granularity, tmp3, Address(src), gct1);
 983       bs_asm->copy_store_at(_masm, decorators, type, granularity, Address(dst), tmp3, gct1, gct2, gct3);
 984       if (!is_backwards) {
 985         __ addi(src, src, step);
 986         __ addi(dst, dst, step);
 987       }
 988       __ addi(cnt, cnt, -granularity);
 989       __ beqz(cnt, done, is_far);
 990       __ j(same_aligned);
 991 
 992       __ bind(copy_big);
 993       __ mv(t0, 32);
 994       __ blt(cnt, t0, copy8_loop, is_far);
 995     }
 996 
 997     __ bind(copy32_loop);
 998     if (is_backwards) {
 999       __ addi(src, src, -wordSize * 4);
1000       __ addi(dst, dst, -wordSize * 4);
1001     }
1002     // we first load 32 bytes, then write it, so the direction here doesn't matter
1003     bs_asm->copy_load_at(_masm, decorators, type, 8, tmp3, Address(src),     gct1);
1004     bs_asm->copy_load_at(_masm, decorators, type, 8, tmp4, Address(src, 8),  gct1);
1005     bs_asm->copy_load_at(_masm, decorators, type, 8, tmp5, Address(src, 16), gct1);
1006     bs_asm->copy_load_at(_masm, decorators, type, 8, tmp6, Address(src, 24), gct1);
1007 
1008     bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst),     tmp3, gct1, gct2, gct3);
1009     bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst, 8),  tmp4, gct1, gct2, gct3);
1010     bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst, 16), tmp5, gct1, gct2, gct3);
1011     bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst, 24), tmp6, gct1, gct2, gct3);
1012 
1013     if (!is_backwards) {
1014       __ addi(src, src, wordSize * 4);
1015       __ addi(dst, dst, wordSize * 4);
1016     }
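    // t0 = cnt - 64: non-negative iff at least 32 bytes remain after the
    // 32 consumed by this iteration.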
1017     __ addi(t0, cnt, -(32 + wordSize * 4));
1018     __ addi(cnt, cnt, -wordSize * 4);
1019     __ bgez(t0, copy32_loop); // cnt >= 32, do next loop
1020 
1021     __ beqz(cnt, done); // if that's all - done
1022 
    __ addi(t0, cnt, -8); // if not - copy the remainder
1024     __ bltz(t0, copy_small); // cnt < 8, go to copy_small, else fall through to copy8_loop
1025 
1026     __ bind(copy8_loop);
1027     if (is_backwards) {
1028       __ addi(src, src, -wordSize);
1029       __ addi(dst, dst, -wordSize);
1030     }
1031     bs_asm->copy_load_at(_masm, decorators, type, 8, tmp3, Address(src), gct1);
1032     bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst), tmp3, gct1, gct2, gct3);
1033 
1034     if (!is_backwards) {
1035       __ addi(src, src, wordSize);
1036       __ addi(dst, dst, wordSize);
1037     }
1038     __ addi(t0, cnt, -(8 + wordSize));
1039     __ addi(cnt, cnt, -wordSize);
1040     __ bgez(t0, copy8_loop); // cnt >= 8, do next loop
1041 
1042     __ beqz(cnt, done); // if that's all - done
1043 
1044     __ bind(copy_small);
1045     if (is_backwards) {
1046       __ addi(src, src, step);
1047       __ addi(dst, dst, step);
1048     }
1049 
1050     bs_asm->copy_load_at(_masm, decorators, type, granularity, tmp3, Address(src), gct1);
1051     bs_asm->copy_store_at(_masm, decorators, type, granularity, Address(dst), tmp3, gct1, gct2, gct3);
1052 
1053     if (!is_backwards) {
1054       __ addi(src, src, step);
1055       __ addi(dst, dst, step);
1056     }
1057     __ addi(cnt, cnt, -granularity);
1058     __ bgtz(cnt, copy_small);
1059 
1060     __ bind(done);
1061   }
1062 
1063   // Scan over array at a for count oops, verifying each one.
  // Preserves a and count; clobbers temp, t0 and t1.
1065   void verify_oop_array(size_t size, Register a, Register count, Register temp) {
1066     Label loop, end;
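    // t1 walks a byte offset over the array; t0 holds the total length in bytes.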
1067     __ mv(t1, zr);
1068     __ slli(t0, count, exact_log2(size));
1069     __ bind(loop);
1070     __ bgeu(t1, t0, end);
1071 
1072     __ add(temp, a, t1);
1073     if (size == (size_t)wordSize) {
1074       __ ld(temp, Address(temp, 0));
1075       __ verify_oop(temp);
1076     } else {
1077       __ lwu(temp, Address(temp, 0));
1078       __ decode_heap_oop(temp); // calls verify_oop
1079     }
1080     __ add(t1, t1, size);
1081     __ j(loop);
1082     __ bind(end);
1083   }
1084 
1085   // Arguments:
1086   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1087   //             ignored
1088   //   is_oop  - true => oop array, so generate store check code
1089   //   name    - stub name string
1090   //
1091   // Inputs:
1092   //   c_rarg0   - source array address
1093   //   c_rarg1   - destination array address
1094   //   c_rarg2   - element count, treated as ssize_t, can be zero
1095   //
1096   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1097   // the hardware handle it.  The two dwords within qwords that span
1098   // cache line boundaries will still be loaded and stored atomically.
1099   //
1100   // Side Effects:
1101   //   disjoint_int_copy_entry is set to the no-overlap entry point
1102   //   used by generate_conjoint_int_oop_copy().
1103   //
1104   address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address* entry,
1105                                  const char* name, bool dest_uninitialized = false) {
1106     const Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1107     RegSet saved_reg = RegSet::of(s, d, count);
1108     __ align(CodeEntryAlignment);
1109     StubCodeMark mark(this, "StubRoutines", name);
1110     address start = __ pc();
1111     __ enter();
1112 
1113     if (entry != nullptr) {
1114       *entry = __ pc();
1115       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1116       BLOCK_COMMENT("Entry:");
1117     }
1118 
1119     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1120     if (dest_uninitialized) {
1121       decorators |= IS_DEST_UNINITIALIZED;
1122     }
1123     if (aligned) {
1124       decorators |= ARRAYCOPY_ALIGNED;
1125     }
1126 
1127     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1128     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
1129 
1130     if (is_oop) {
1131       // save regs before copy_memory
1132       __ push_reg(RegSet::of(d, count), sp);
1133     }
1134 
1135     {
1136       // UnsafeMemoryAccess page error: continue after unsafe access
1137       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1138       UnsafeMemoryAccessMark umam(this, add_entry, true);
1139       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
1140     }
1141 
1142     if (is_oop) {
1143       __ pop_reg(RegSet::of(d, count), sp);
1144       if (VerifyOops) {
1145         verify_oop_array(size, d, count, t2);
1146       }
1147     }
1148 
1149     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, t0, RegSet());
1150 
1151     __ leave();
1152     __ mv(x10, zr); // return 0
1153     __ ret();
1154     return start;
1155   }
1156 
1157   // Arguments:
1158   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1159   //             ignored
1160   //   is_oop  - true => oop array, so generate store check code
1161   //   name    - stub name string
1162   //
1163   // Inputs:
1164   //   c_rarg0   - source array address
1165   //   c_rarg1   - destination array address
1166   //   c_rarg2   - element count, treated as ssize_t, can be zero
1167   //
1168   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1169   // the hardware handle it.  The two dwords within qwords that span
1170   // cache line boundaries will still be loaded and stored atomically.
1171   //
1172   address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
1173                                  address* entry, const char* name,
1174                                  bool dest_uninitialized = false) {
1175     const Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1176     RegSet saved_regs = RegSet::of(s, d, count);
1177     StubCodeMark mark(this, "StubRoutines", name);
1178     address start = __ pc();
1179     __ enter();
1180 
1181     if (entry != nullptr) {
1182       *entry = __ pc();
1183       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1184       BLOCK_COMMENT("Entry:");
1185     }
1186 
1187     // use fwd copy when (d-s) above_equal (count*size)
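    // (the unsigned compare also routes the d < s case to the forward copy,
    // since d - s then wraps to a large unsigned value)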
1188     __ sub(t0, d, s);
1189     __ slli(t1, count, exact_log2(size));
1190     Label L_continue;
1191     __ bltu(t0, t1, L_continue);
1192     __ j(nooverlap_target);
1193     __ bind(L_continue);
1194 
1195     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1196     if (dest_uninitialized) {
1197       decorators |= IS_DEST_UNINITIALIZED;
1198     }
1199     if (aligned) {
1200       decorators |= ARRAYCOPY_ALIGNED;
1201     }
1202 
1203     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1204     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
1205 
1206     if (is_oop) {
1207       // save regs before copy_memory
1208       __ push_reg(RegSet::of(d, count), sp);
1209     }
1210 
1211     {
1212       // UnsafeMemoryAccess page error: continue after unsafe access
1213       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1214       UnsafeMemoryAccessMark umam(this, add_entry, true);
1215       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
1216     }
1217 
1218     if (is_oop) {
1219       __ pop_reg(RegSet::of(d, count), sp);
1220       if (VerifyOops) {
1221         verify_oop_array(size, d, count, t2);
1222       }
1223     }
1224     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, t0, RegSet());
1225     __ leave();
1226     __ mv(x10, zr); // return 0
1227     __ ret();
1228     return start;
1229   }
1230 
1231   // Arguments:
1232   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1233   //             ignored
1234   //   name    - stub name string
1235   //
1236   // Inputs:
1237   //   c_rarg0   - source array address
1238   //   c_rarg1   - destination array address
1239   //   c_rarg2   - element count, treated as ssize_t, can be zero
1240   //
1241   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1242   // we let the hardware handle it.  The one to eight bytes within words,
1243   // dwords or qwords that span cache line boundaries will still be loaded
1244   // and stored atomically.
1245   //
1253   // Side Effects:
1254   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1255   //   used by generate_conjoint_byte_copy().
1256   //
1257   address generate_disjoint_byte_copy(bool aligned, address* entry, const char* name) {
1258     const bool not_oop = false;
1259     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1260   }
1261 
1262   // Arguments:
1263   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1264   //             ignored
1265   //   name    - stub name string
1266   //
1267   // Inputs:
1268   //   c_rarg0   - source array address
1269   //   c_rarg1   - destination array address
1270   //   c_rarg2   - element count, treated as ssize_t, can be zero
1271   //
1272   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1273   // we let the hardware handle it.  The one to eight bytes within words,
1274   // dwords or qwords that span cache line boundaries will still be loaded
1275   // and stored atomically.
1276   //
1277   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1278                                       address* entry, const char* name) {
1279     const bool not_oop = false;
1280     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1281   }
1282 
1283   // Arguments:
1284   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1285   //             ignored
1286   //   name    - stub name string
1287   //
1288   // Inputs:
1289   //   c_rarg0   - source array address
1290   //   c_rarg1   - destination array address
1291   //   c_rarg2   - element count, treated as ssize_t, can be zero
1292   //
1293   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1294   // let the hardware handle it.  The two or four words within dwords
1295   // or qwords that span cache line boundaries will still be loaded
1296   // and stored atomically.
1297   //
1298   // Side Effects:
1299   //   disjoint_short_copy_entry is set to the no-overlap entry point
1300   //   used by generate_conjoint_short_copy().
1301   //
1302   address generate_disjoint_short_copy(bool aligned,
1303                                        address* entry, const char* name) {
1304     const bool not_oop = false;
1305     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1306   }
1307 
1308   // Arguments:
1309   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1310   //             ignored
1311   //   name    - stub name string
1312   //
1313   // Inputs:
1314   //   c_rarg0   - source array address
1315   //   c_rarg1   - destination array address
1316   //   c_rarg2   - element count, treated as ssize_t, can be zero
1317   //
1318   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1319   // let the hardware handle it.  The two or four words within dwords
1320   // or qwords that span cache line boundaries will still be loaded
1321   // and stored atomically.
1322   //
1323   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1324                                        address* entry, const char* name) {
1325     const bool not_oop = false;
1326     return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
1327   }
1328 
1329   // Arguments:
1330   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1331   //             ignored
1332   //   name    - stub name string
1333   //
1334   // Inputs:
1335   //   c_rarg0   - source array address
1336   //   c_rarg1   - destination array address
1337   //   c_rarg2   - element count, treated as ssize_t, can be zero
1338   //
1339   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1340   // the hardware handle it.  The two dwords within qwords that span
1341   // cache line boundaries will still be loaded and stored atomically.
1342   //
1343   // Side Effects:
1344   //   disjoint_int_copy_entry is set to the no-overlap entry point
1345   //   used by generate_conjoint_int_oop_copy().
1346   //
1347   address generate_disjoint_int_copy(bool aligned, address* entry,
1348                                      const char* name, bool dest_uninitialized = false) {
1349     const bool not_oop = false;
1350     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1351   }
1352 
1353   // Arguments:
1354   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1355   //             ignored
1356   //   name    - stub name string
1357   //
1358   // Inputs:
1359   //   c_rarg0   - source array address
1360   //   c_rarg1   - destination array address
1361   //   c_rarg2   - element count, treated as ssize_t, can be zero
1362   //
1363   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1364   // the hardware handle it.  The two dwords within qwords that span
1365   // cache line boundaries will still be loaded and stored atomically.
1366   //
1367   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1368                                      address* entry, const char* name,
1369                                      bool dest_uninitialized = false) {
1370     const bool not_oop = false;
1371     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1372   }
1373 
1374 
1375   // Arguments:
1376   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1377   //             ignored
1378   //   name    - stub name string
1379   //
1380   // Inputs:
1381   //   c_rarg0   - source array address
1382   //   c_rarg1   - destination array address
1383   //   c_rarg2   - element count, treated as size_t, can be zero
1384   //
1385   // Side Effects:
1386   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1387   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1388   //
1389   address generate_disjoint_long_copy(bool aligned, address* entry,
1390                                       const char* name, bool dest_uninitialized = false) {
1391     const bool not_oop = false;
1392     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1393   }
1394 
1395   // Arguments:
1396   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1397   //             ignored
1398   //   name    - stub name string
1399   //
1400   // Inputs:
1401   //   c_rarg0   - source array address
1402   //   c_rarg1   - destination array address
1403   //   c_rarg2   - element count, treated as size_t, can be zero
1404   //
1405   address generate_conjoint_long_copy(bool aligned,
1406                                       address nooverlap_target, address* entry,
1407                                       const char* name, bool dest_uninitialized = false) {
1408     const bool not_oop = false;
1409     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1410   }
1411 
1412   // Arguments:
1413   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1414   //             ignored
1415   //   name    - stub name string
1416   //
1417   // Inputs:
1418   //   c_rarg0   - source array address
1419   //   c_rarg1   - destination array address
1420   //   c_rarg2   - element count, treated as size_t, can be zero
1421   //
1422   // Side Effects:
1423   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1424   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1425   //
1426   address generate_disjoint_oop_copy(bool aligned, address* entry,
1427                                      const char* name, bool dest_uninitialized) {
1428     const bool is_oop = true;
1429     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1430     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1431   }
1432 
1433   // Arguments:
1434   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1435   //             ignored
1436   //   name    - stub name string
1437   //
1438   // Inputs:
1439   //   c_rarg0   - source array address
1440   //   c_rarg1   - destination array address
1441   //   c_rarg2   - element count, treated as size_t, can be zero
1442   //
1443   address generate_conjoint_oop_copy(bool aligned,
1444                                      address nooverlap_target, address* entry,
1445                                      const char* name, bool dest_uninitialized) {
1446     const bool is_oop = true;
1447     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1448     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1449                                   name, dest_uninitialized);
1450   }
1451 
1452   // Helper for generating a dynamic type check.
1453   // Smashes t0, t1.
1454   void generate_type_check(Register sub_klass,
1455                            Register super_check_offset,
1456                            Register super_klass,
1457                            Label& L_success) {
1458     assert_different_registers(sub_klass, super_check_offset, super_klass);
1459 
1460     BLOCK_COMMENT("type_check:");
1461 
1462     Label L_miss;
1463 
1464     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, nullptr, super_check_offset);
1465     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, nullptr);
1466 
1467     // Fall through on failure!
1468     __ BIND(L_miss);
1469   }
1470 
1471   //
1472   //  Generate checkcasting array copy stub
1473   //
1474   //  Input:
1475   //    c_rarg0   - source array address
1476   //    c_rarg1   - destination array address
1477   //    c_rarg2   - element count, treated as ssize_t, can be zero
1478   //    c_rarg3   - size_t ckoff (super_check_offset)
1479   //    c_rarg4   - oop ckval (super_klass)
1480   //
1481   //  Output:
1482   //    x10 ==  0  -  success
1483   //    x10 == -1^K - failure, where K is partial transfer count
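  //  For example (illustrative values): if 10 elements are requested and the
  //  type check fails on the 4th element, then K == 3 elements were transferred
  //  and the stub returns -1 ^ 3 == -4; the caller can recover K as ~x10.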
1484   //
1485   address generate_checkcast_copy(const char* name, address* entry,
1486                                   bool dest_uninitialized = false) {
1487     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1488 
1489     // Input registers (after setup_arg_regs)
1490     const Register from        = c_rarg0;   // source array address
1491     const Register to          = c_rarg1;   // destination array address
1492     const Register count       = c_rarg2;   // elements count
1493     const Register ckoff       = c_rarg3;   // super_check_offset
1494     const Register ckval       = c_rarg4;   // super_klass
1495 
1496     RegSet wb_pre_saved_regs   = RegSet::range(c_rarg0, c_rarg4);
1497     RegSet wb_post_saved_regs  = RegSet::of(count);
1498 
1499     // Registers used as temps (x7, x9, x18 are save-on-entry)
1500     const Register count_save  = x19;       // orig elements count
1501     const Register start_to    = x18;       // destination array start address
1502     const Register copied_oop  = x7;        // actual oop copied
1503     const Register r9_klass    = x9;        // oop._klass
1504 
1505     // Registers used as gc temps (x15, x16, x17 are save-on-call)
1506     const Register gct1 = x15, gct2 = x16, gct3 = x17;
1507 
1508     //---------------------------------------------------------------
1509     // Assembler stub will be used for this call to arraycopy
1510     // if the two arrays are subtypes of Object[] but the
1511     // destination array type is not equal to or a supertype
1512     // of the source type.  Each element must be separately
1513     // checked.
1514 
1515     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1516                                copied_oop, r9_klass, count_save);
1517 
1518     __ align(CodeEntryAlignment);
1519     StubCodeMark mark(this, "StubRoutines", name);
1520     address start = __ pc();
1521 
1522     __ enter(); // required for proper stackwalking of RuntimeStub frame
1523 
1524     // Caller of this entry point must set up the argument registers.
1525     if (entry != nullptr) {
1526       *entry = __ pc();
1527       BLOCK_COMMENT("Entry:");
1528     }
1529 
1530     // Empty array:  Nothing to do
1531     __ beqz(count, L_done);
1532 
1533     __ push_reg(RegSet::of(x7, x9, x18, x19), sp);
1534 
1535 #ifdef ASSERT
1536     BLOCK_COMMENT("assert consistent ckoff/ckval");
1537     // The ckoff and ckval must be mutually consistent,
1538     // even though caller generates both.
1539     { Label L;
1540       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1541       __ lwu(start_to, Address(ckval, sco_offset));
1542       __ beq(ckoff, start_to, L);
1543       __ stop("super_check_offset inconsistent");
1544       __ bind(L);
1545     }
1546 #endif //ASSERT
1547 
1548     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1549     if (dest_uninitialized) {
1550       decorators |= IS_DEST_UNINITIALIZED;
1551     }
1552 
1553     bool is_oop = true;
1554     int element_size = UseCompressedOops ? 4 : 8;
1555 
1556     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1557     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
1558 
1559     // save the original count
1560     __ mv(count_save, count);
1561 
1562     // Copy from low to high addresses
1563     __ mv(start_to, to);              // Save destination array start address
1564     __ j(L_load_element);
1565 
1566     // ======== begin loop ========
1567     // (Loop is rotated; its entry is L_load_element.)
1568     // Loop control:
1569     //   for each of the count elements do
1570     //     copied_oop = load_heap_oop(from++)
1571     //     ... generate_type_check ...
1572     //     store_heap_oop(to++, copied_oop)
1573     //   end
1574 
1575     __ align(OptoLoopAlignment);
1576 
1577     __ BIND(L_store_element);
1578     bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
1579                       Address(to, 0), copied_oop,
1580                       gct1, gct2, gct3);
1581     __ add(to, to, UseCompressedOops ? 4 : 8);
1582     __ sub(count, count, 1);
1583     __ beqz(count, L_do_card_marks);
1584 
1585     // ======== loop entry is here ========
1586     __ BIND(L_load_element);
1587     bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
1588                      copied_oop, Address(from, 0),
1589                      gct1);
1590     __ add(from, from, UseCompressedOops ? 4 : 8);
1591     __ beqz(copied_oop, L_store_element);
1592 
1593     __ load_klass(r9_klass, copied_oop); // query the object klass
1594     generate_type_check(r9_klass, ckoff, ckval, L_store_element);
1595     // ======== end loop ========
1596 
1597     // It was a real error; we must depend on the caller to finish the job.
1598     // Register count = remaining oops, count_save = total oops.
1599     // Emit GC store barriers for the oops we have copied and report
1600     // their number to the caller.
1601 
1602     __ sub(count, count_save, count);     // K = partially copied oop count
1603     __ xori(count, count, -1);                   // report (-1^K) to caller
1604     __ beqz(count, L_done_pop);
1605 
1606     __ BIND(L_do_card_marks);
1607     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, t0, wb_post_saved_regs);
1608 
1609     __ bind(L_done_pop);
1610     __ pop_reg(RegSet::of(x7, x9, x18, x19), sp);
1611     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1612 
1613     __ bind(L_done);
1614     __ mv(x10, count);
1615     __ leave();
1616     __ ret();
1617 
1618     return start;
1619   }
1620 
1621   // Perform range checks on the proposed arraycopy.
1622   // Kills temp and t0, but nothing else.
1623   // Also, clean the sign bits of src_pos and dst_pos.
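  // For example (illustrative values): with src_pos == 2, length == 5 and
  // src->length() == 6, temp == 7 exceeds the array length and the code
  // branches to L_failed.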
1624   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1625                               Register src_pos, // source position (c_rarg1)
1626                               Register dst,     // destination array oop (c_rarg2)
1627                               Register dst_pos, // destination position (c_rarg3)
1628                               Register length,
1629                               Register temp,
1630                               Label& L_failed) {
1631     BLOCK_COMMENT("arraycopy_range_checks:");
1632 
1633     assert_different_registers(t0, temp);
1634 
1635     // if [src_pos + length > arrayOop(src)->length()] then FAIL
1636     __ lwu(t0, Address(src, arrayOopDesc::length_offset_in_bytes()));
1637     __ addw(temp, length, src_pos);
1638     __ bgtu(temp, t0, L_failed);
1639 
1640     // if [dst_pos + length > arrayOop(dst)->length()] then FAIL
1641     __ lwu(t0, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1642     __ addw(temp, length, dst_pos);
1643     __ bgtu(temp, t0, L_failed);
1644 
1645     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1646     __ zero_extend(src_pos, src_pos, 32);
1647     __ zero_extend(dst_pos, dst_pos, 32);
1648 
1649     BLOCK_COMMENT("arraycopy_range_checks done");
1650   }
1651 
1652   //
1653   //  Generate 'unsafe' array copy stub
1654   //  Though just as safe as the other stubs, it takes an unscaled
1655   //  size_t argument instead of an element count.
1656   //
1657   //  Input:
1658   //    c_rarg0   - source array address
1659   //    c_rarg1   - destination array address
1660   //    c_rarg2   - byte count, treated as ssize_t, can be zero
1661   //
1662   // Examines the alignment of the operands and dispatches
1663   // to a long, int, short, or byte copy loop.
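  //  For example (illustrative addresses): with src == 0x1000, dst == 0x2008 and
  //  a byte count of 24, the OR of the three operands has its low three bits
  //  clear, so the long copy entry is taken with 24 >> 3 == 3 elements; if any
  //  operand had bit 0 set, the byte copy entry would be used instead.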
1664   //
1665   address generate_unsafe_copy(const char* name,
1666                                address byte_copy_entry,
1667                                address short_copy_entry,
1668                                address int_copy_entry,
1669                                address long_copy_entry) {
1670     assert_cond(byte_copy_entry != nullptr && short_copy_entry != nullptr &&
1671                 int_copy_entry != nullptr && long_copy_entry != nullptr);
1672     Label L_long_aligned, L_int_aligned, L_short_aligned;
1673     const Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1674 
1675     __ align(CodeEntryAlignment);
1676     StubCodeMark mark(this, "StubRoutines", name);
1677     address start = __ pc();
1678     __ enter(); // required for proper stackwalking of RuntimeStub frame
1679 
1680     // bump this on entry, not on exit:
1681     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1682 
1683     __ orr(t0, s, d);
1684     __ orr(t0, t0, count);
1685 
1686     __ andi(t0, t0, BytesPerLong - 1);
1687     __ beqz(t0, L_long_aligned);
1688     __ andi(t0, t0, BytesPerInt - 1);
1689     __ beqz(t0, L_int_aligned);
1690     __ test_bit(t0, t0, 0);
1691     __ beqz(t0, L_short_aligned);
1692     __ j(RuntimeAddress(byte_copy_entry));
1693 
1694     __ BIND(L_short_aligned);
1695     __ srli(count, count, LogBytesPerShort);  // size => short_count
1696     __ j(RuntimeAddress(short_copy_entry));
1697     __ BIND(L_int_aligned);
1698     __ srli(count, count, LogBytesPerInt);    // size => int_count
1699     __ j(RuntimeAddress(int_copy_entry));
1700     __ BIND(L_long_aligned);
1701     __ srli(count, count, LogBytesPerLong);   // size => long_count
1702     __ j(RuntimeAddress(long_copy_entry));
1703 
1704     return start;
1705   }
1706 
1707   //
1708   //  Generate generic array copy stubs
1709   //
1710   //  Input:
1711   //    c_rarg0    -  src oop
1712   //    c_rarg1    -  src_pos (32-bits)
1713   //    c_rarg2    -  dst oop
1714   //    c_rarg3    -  dst_pos (32-bits)
1715   //    c_rarg4    -  element count (32-bits)
1716   //
1717   //  Output:
1718   //    x10 ==  0  -  success
1719   //    x10 == -1^K - failure, where K is partial transfer count
1720   //
1721   address generate_generic_copy(const char* name,
1722                                 address byte_copy_entry, address short_copy_entry,
1723                                 address int_copy_entry, address oop_copy_entry,
1724                                 address long_copy_entry, address checkcast_copy_entry) {
1725     assert_cond(byte_copy_entry != nullptr && short_copy_entry != nullptr &&
1726                 int_copy_entry != nullptr && oop_copy_entry != nullptr &&
1727                 long_copy_entry != nullptr && checkcast_copy_entry != nullptr);
1728     Label L_failed, L_failed_0, L_objArray;
1729     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
1730 
1731     // Input registers
1732     const Register src        = c_rarg0;  // source array oop
1733     const Register src_pos    = c_rarg1;  // source position
1734     const Register dst        = c_rarg2;  // destination array oop
1735     const Register dst_pos    = c_rarg3;  // destination position
1736     const Register length     = c_rarg4;
1737 
1738     // Registers used as temps
1739     const Register dst_klass = c_rarg5;
1740 
1741     __ align(CodeEntryAlignment);
1742 
1743     StubCodeMark mark(this, "StubRoutines", name);
1744 
1745     address start = __ pc();
1746 
1747     __ enter(); // required for proper stackwalking of RuntimeStub frame
1748 
1749     // bump this on entry, not on exit:
1750     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
1751 
1752     //-----------------------------------------------------------------------
1753     // Assembler stub will be used for this call to arraycopy
1754     // if the following conditions are met:
1755     //
1756     // (1) src and dst must not be null.
1757     // (2) src_pos must not be negative.
1758     // (3) dst_pos must not be negative.
1759     // (4) length  must not be negative.
1760     // (5) src klass and dst klass should be the same and not null.
1761     // (6) src and dst should be arrays.
1762     // (7) src_pos + length must not exceed length of src.
1763     // (8) dst_pos + length must not exceed length of dst.
1764     //
1765 
1766     // if src is null then return -1
1767     __ beqz(src, L_failed);
1768 
1769     // if [src_pos < 0] then return -1
1770     __ sign_extend(t0, src_pos, 32);
1771     __ bltz(t0, L_failed);
1772 
1773     // if dst is null then return -1
1774     __ beqz(dst, L_failed);
1775 
1776     // if [dst_pos < 0] then return -1
1777     __ sign_extend(t0, dst_pos, 32);
1778     __ bltz(t0, L_failed);
1779 
1780     // registers used as temp
1781     const Register scratch_length    = x28; // elements count to copy
1782     const Register scratch_src_klass = x29; // array klass
1783     const Register lh                = x30; // layout helper
1784 
1785     // if [length < 0] then return -1
1786     __ sign_extend(scratch_length, length, 32);    // length (elements count, 32-bits value)
1787     __ bltz(scratch_length, L_failed);
1788 
1789     __ load_klass(scratch_src_klass, src);
1790 #ifdef ASSERT
1791     {
1792       BLOCK_COMMENT("assert klasses not null {");
1793       Label L1, L2;
1794       __ bnez(scratch_src_klass, L2);   // it is broken if klass is null
1795       __ bind(L1);
1796       __ stop("broken null klass");
1797       __ bind(L2);
1798       __ load_klass(t0, dst, t1);
1799       __ beqz(t0, L1);     // this would be broken also
1800       BLOCK_COMMENT("} assert klasses not null done");
1801     }
1802 #endif
1803 
1804     // Load layout helper (32-bits)
1805     //
1806     //  |array_tag|     | header_size | element_type |     |log2_element_size|
1807     // 32        30    24            16              8     2                 0
1808     //
1809     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
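    //   For example (illustrative field values): a type array with a 16-byte
    //   header and 4-byte elements such as int[] would have
    //   lh == (0x3 << 30) | (16 << 16) | (T_INT << 8) | 2, which is negative;
    //   a non-array klass has lh >= 0, which is what the test below relies on.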
1810     //
1811 
1812     const int lh_offset = in_bytes(Klass::layout_helper_offset());
1813 
1814     // Handle objArrays completely differently...
1815     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
1816     __ lw(lh, Address(scratch_src_klass, lh_offset));
1817     __ mv(t0, objArray_lh);
1818     __ beq(lh, t0, L_objArray);
1819 
1820     // if [src->klass() != dst->klass()] then return -1
1821     __ load_klass(t1, dst);
1822     __ bne(t1, scratch_src_klass, L_failed);
1823 
1824     // if !src->is_Array() then return -1
1825     // i.e. (lh >= 0) means src is not an array
1826     __ bgez(lh, L_failed);
1827 
1828     // At this point, it is known to be a typeArray (array_tag 0x3).
1829 #ifdef ASSERT
1830     {
1831       BLOCK_COMMENT("assert primitive array {");
1832       Label L;
1833       __ mv(t1, (int32_t)(Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift));
1834       __ bge(lh, t1, L);
1835       __ stop("must be a primitive array");
1836       __ bind(L);
1837       BLOCK_COMMENT("} assert primitive array done");
1838     }
1839 #endif
1840 
1841     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
1842                            t1, L_failed);
1843 
1844     // TypeArrayKlass
1845     //
1846     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize)
1847     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize)
1848     //
1849 
1850     const Register t0_offset = t0;    // array offset
1851     const Register x30_elsize = lh;   // element size
1852 
1853     // Get array_header_in_bytes()
1854     int lh_header_size_width = exact_log2(Klass::_lh_header_size_mask + 1);
1855     int lh_header_size_msb = Klass::_lh_header_size_shift + lh_header_size_width;
1856     __ slli(t0_offset, lh, XLEN - lh_header_size_msb);          // shift out the bits above the header_size field
1857     __ srli(t0_offset, t0_offset, XLEN - lh_header_size_width); // extract array header size in bytes
1858 
1859     __ add(src, src, t0_offset);           // src array offset
1860     __ add(dst, dst, t0_offset);           // dst array offset
1861     BLOCK_COMMENT("choose copy loop based on element size");
1862 
1863     // The following registers must be set before the jump to the corresponding stub
1864     const Register from     = c_rarg0;  // source array address
1865     const Register to       = c_rarg1;  // destination array address
1866     const Register count    = c_rarg2;  // elements count
1867 
1868     // 'from', 'to' and 'count' must be assigned in this order, because they
1869     // alias 'src', 'src_pos' and 'dst' respectively.
1870 
1871     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
1872 
1873     // The possible values of elsize are 0-3, i.e. exact_log2(element
1874     // size in bytes).  We do a simple bitwise binary search.
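    // Summary of the dispatch below (elsize == exact_log2(element size)):
    //   bit 1 clear, bit 0 clear -> byte copy   (elsize == 0)
    //   bit 1 clear, bit 0 set   -> short copy  (elsize == 1)
    //   bit 1 set,   bit 0 clear -> int copy    (elsize == 2)
    //   bit 1 set,   bit 0 set   -> long copy   (elsize == 3)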
1875   __ BIND(L_copy_bytes);
1876     __ test_bit(t0, x30_elsize, 1);
1877     __ bnez(t0, L_copy_ints);
1878     __ test_bit(t0, x30_elsize, 0);
1879     __ bnez(t0, L_copy_shorts);
1880     __ add(from, src, src_pos); // src_addr
1881     __ add(to, dst, dst_pos); // dst_addr
1882     __ sign_extend(count, scratch_length, 32); // length
1883     __ j(RuntimeAddress(byte_copy_entry));
1884 
1885   __ BIND(L_copy_shorts);
1886     __ shadd(from, src_pos, src, t0, 1); // src_addr
1887     __ shadd(to, dst_pos, dst, t0, 1); // dst_addr
1888     __ sign_extend(count, scratch_length, 32); // length
1889     __ j(RuntimeAddress(short_copy_entry));
1890 
1891   __ BIND(L_copy_ints);
1892     __ test_bit(t0, x30_elsize, 0);
1893     __ bnez(t0, L_copy_longs);
1894     __ shadd(from, src_pos, src, t0, 2); // src_addr
1895     __ shadd(to, dst_pos, dst, t0, 2); // dst_addr
1896     __ sign_extend(count, scratch_length, 32); // length
1897     __ j(RuntimeAddress(int_copy_entry));
1898 
1899   __ BIND(L_copy_longs);
1900 #ifdef ASSERT
1901     {
1902       BLOCK_COMMENT("assert long copy {");
1903       Label L;
1904       __ andi(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> x30_elsize
1905       __ sign_extend(lh, lh, 32);
1906       __ mv(t0, LogBytesPerLong);
1907       __ beq(x30_elsize, t0, L);
1908       __ stop("must be long copy, but elsize is wrong");
1909       __ bind(L);
1910       BLOCK_COMMENT("} assert long copy done");
1911     }
1912 #endif
1913     __ shadd(from, src_pos, src, t0, 3); // src_addr
1914     __ shadd(to, dst_pos, dst, t0, 3); // dst_addr
1915     __ sign_extend(count, scratch_length, 32); // length
1916     __ j(RuntimeAddress(long_copy_entry));
1917 
1918     // ObjArrayKlass
1919   __ BIND(L_objArray);
1920     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
1921 
1922     Label L_plain_copy, L_checkcast_copy;
1923     // test array classes for subtyping
1924     __ load_klass(t2, dst);
1925     __ bne(scratch_src_klass, t2, L_checkcast_copy); // usual case is exact equality
1926 
1927     // Identically typed arrays can be copied without element-wise checks.
1928     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
1929                            t1, L_failed);
1930 
1931     __ shadd(from, src_pos, src, t0, LogBytesPerHeapOop);
1932     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
1933     __ shadd(to, dst_pos, dst, t0, LogBytesPerHeapOop);
1934     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
1935     __ sign_extend(count, scratch_length, 32); // length
1936   __ BIND(L_plain_copy);
1937     __ j(RuntimeAddress(oop_copy_entry));
1938 
1939   __ BIND(L_checkcast_copy);
1940     // live at this point:  scratch_src_klass, scratch_length, t2 (dst_klass)
1941     {
1942       // Before looking at dst.length, make sure dst is also an objArray.
1943       __ lwu(t0, Address(t2, lh_offset));
1944       __ mv(t1, objArray_lh);
1945       __ bne(t0, t1, L_failed);
1946 
1947       // It is safe to examine both src.length and dst.length.
1948       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
1949                              t2, L_failed);
1950 
1951       __ load_klass(dst_klass, dst); // reload
1952 
1953       // Marshal the base address arguments now, freeing registers.
1954       __ shadd(from, src_pos, src, t0, LogBytesPerHeapOop);
1955       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
1956       __ shadd(to, dst_pos, dst, t0, LogBytesPerHeapOop);
1957       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
1958       __ sign_extend(count, length, 32);      // length (reloaded)
1959       const Register sco_temp = c_rarg3;      // this register is free now
1960       assert_different_registers(from, to, count, sco_temp,
1961                                  dst_klass, scratch_src_klass);
1962 
1963       // Generate the type check.
1964       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
1965       __ lwu(sco_temp, Address(dst_klass, sco_offset));
1966 
1967       // Smashes t0, t1
1968       generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy);
1969 
1970       // Fetch destination element klass from the ObjArrayKlass header.
1971       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
1972       __ ld(dst_klass, Address(dst_klass, ek_offset));
1973       __ lwu(sco_temp, Address(dst_klass, sco_offset));
1974 
1975       // the checkcast_copy loop needs two extra arguments:
1976       assert(c_rarg3 == sco_temp, "#3 already in place");
1977       // Set up arguments for checkcast_copy_entry.
1978       __ mv(c_rarg4, dst_klass);  // dst.klass.element_klass
1979       __ j(RuntimeAddress(checkcast_copy_entry));
1980     }
1981 
1982   __ BIND(L_failed);
1983     __ mv(x10, -1);
1984     __ leave();   // required for proper stackwalking of RuntimeStub frame
1985     __ ret();
1986 
1987     return start;
1988   }
1989 
1990   //
1991   // Generate stub for array fill. If "aligned" is true, the
1992   // "to" address is assumed to be heapword aligned.
1993   //
1994   // Arguments for generated stub:
1995   //   to:    c_rarg0
1996   //   value: c_rarg1
1997   //   count: c_rarg2 treated as signed
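  //
  //   The stub widens 'value' into a 64-bit pattern and fills the bulk of the
  //   array with 8-byte stores (fill_words); fills of fewer than 8 bytes are
  //   handled element by element under L_fill_elements.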
1998   //
1999   address generate_fill(BasicType t, bool aligned, const char* name) {
2000     __ align(CodeEntryAlignment);
2001     StubCodeMark mark(this, "StubRoutines", name);
2002     address start = __ pc();
2003 
2004     BLOCK_COMMENT("Entry:");
2005 
2006     const Register to        = c_rarg0;  // destination array address
2007     const Register value     = c_rarg1;  // value
2008     const Register count     = c_rarg2;  // elements count
2009 
2010     const Register bz_base   = x28;      // base for block_zero routine
2011     const Register cnt_words = x29;      // temp register
2012     const Register tmp_reg   = t1;
2013 
2014     __ enter();
2015 
2016     Label L_fill_elements, L_exit1;
2017 
2018     int shift = -1;
2019     switch (t) {
2020       case T_BYTE:
2021         shift = 0;
2022 
2023         // Zero extend value
2024         // 8 bit -> 16 bit
2025         __ andi(value, value, 0xff);
2026         __ mv(tmp_reg, value);
2027         __ slli(tmp_reg, tmp_reg, 8);
2028         __ orr(value, value, tmp_reg);
2029 
2030         // 16 bit -> 32 bit
2031         __ mv(tmp_reg, value);
2032         __ slli(tmp_reg, tmp_reg, 16);
2033         __ orr(value, value, tmp_reg);
2034 
2035         __ mv(tmp_reg, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2036         __ bltu(count, tmp_reg, L_fill_elements);
2037         break;
2038       case T_SHORT:
2039         shift = 1;
2040         // Zero extend value
2041         // 16 bit -> 32 bit
2042         __ andi(value, value, 0xffff);
2043         __ mv(tmp_reg, value);
2044         __ slli(tmp_reg, tmp_reg, 16);
2045         __ orr(value, value, tmp_reg);
2046 
2047         // Short arrays (< 8 bytes) fill by element
2048         __ mv(tmp_reg, 8 >> shift);
2049         __ bltu(count, tmp_reg, L_fill_elements);
2050         break;
2051       case T_INT:
2052         shift = 2;
2053 
2054         // Short arrays (< 8 bytes) fill by element
2055         __ mv(tmp_reg, 8 >> shift);
2056         __ bltu(count, tmp_reg, L_fill_elements);
2057         break;
2058       default: ShouldNotReachHere();
2059     }
2060 
2061     // Align the destination address to an 8-byte boundary.
2062     Label L_skip_align1, L_skip_align2, L_skip_align4;
2063     if (!aligned) {
2064       switch (t) {
2065         case T_BYTE:
2066           // One byte misalignment happens only for byte arrays.
2067           __ test_bit(t0, to, 0);
2068           __ beqz(t0, L_skip_align1);
2069           __ sb(value, Address(to, 0));
2070           __ addi(to, to, 1);
2071           __ addiw(count, count, -1);
2072           __ bind(L_skip_align1);
2073           // Fallthrough
2074         case T_SHORT:
2075           // Two bytes misalignment happens only for byte and short (char) arrays.
2076           __ test_bit(t0, to, 1);
2077           __ beqz(t0, L_skip_align2);
2078           __ sh(value, Address(to, 0));
2079           __ addi(to, to, 2);
2080           __ addiw(count, count, -(2 >> shift));
2081           __ bind(L_skip_align2);
2082           // Fallthrough
2083         case T_INT:
2084           // Align to 8 bytes, we know we are 4 byte aligned to start.
2085           __ test_bit(t0, to, 2);
2086           __ beqz(t0, L_skip_align4);
2087           __ sw(value, Address(to, 0));
2088           __ addi(to, to, 4);
2089           __ addiw(count, count, -(4 >> shift));
2090           __ bind(L_skip_align4);
2091           break;
2092         default: ShouldNotReachHere();
2093       }
2094     }
2095 
2096     //
2097     //  Fill large chunks
2098     //
2099     __ srliw(cnt_words, count, 3 - shift); // number of words
2100 
2101     // 32 bit -> 64 bit
2102     __ andi(value, value, 0xffffffff);
2103     __ mv(tmp_reg, value);
2104     __ slli(tmp_reg, tmp_reg, 32);
2105     __ orr(value, value, tmp_reg);
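    // 'value' now holds the fill pattern replicated across all 64 bits, e.g.
    // (illustrative) a byte value of 0xAB has become 0xABABABABABABABAB.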
2106 
2107     __ slli(tmp_reg, cnt_words, 3 - shift);
2108     __ subw(count, count, tmp_reg);
2109     {
2110       __ fill_words(to, cnt_words, value);
2111     }
2112 
2113     // The remaining count is less than 8 bytes. Fill it with a single store.
2114     // Note that the total length is no less than 8 bytes.
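    // For example (illustrative): a byte fill with count == 13 fills one 8-byte
    // word via fill_words, leaving count == 5; the single store below is placed
    // so that it ends exactly at the end of the array, rewriting three
    // already-filled bytes together with the five remaining ones.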
2115     if (t == T_BYTE || t == T_SHORT) {
2116       __ beqz(count, L_exit1);
2117       __ shadd(to, count, to, tmp_reg, shift); // points to the end
2118       __ sd(value, Address(to, -8)); // overwrite some elements
2119       __ bind(L_exit1);
2120       __ leave();
2121       __ ret();
2122     }
2123 
2124     // Handle fills of less than 8 bytes.
2125     Label L_fill_2, L_fill_4, L_exit2;
2126     __ bind(L_fill_elements);
2127     switch (t) {
2128       case T_BYTE:
2129         __ test_bit(t0, count, 0);
2130         __ beqz(t0, L_fill_2);
2131         __ sb(value, Address(to, 0));
2132         __ addi(to, to, 1);
2133         __ bind(L_fill_2);
2134         __ test_bit(t0, count, 1);
2135         __ beqz(t0, L_fill_4);
2136         __ sh(value, Address(to, 0));
2137         __ addi(to, to, 2);
2138         __ bind(L_fill_4);
2139         __ test_bit(t0, count, 2);
2140         __ beqz(t0, L_exit2);
2141         __ sw(value, Address(to, 0));
2142         break;
2143       case T_SHORT:
2144         __ test_bit(t0, count, 0);
2145         __ beqz(t0, L_fill_4);
2146         __ sh(value, Address(to, 0));
2147         __ addi(to, to, 2);
2148         __ bind(L_fill_4);
2149         __ test_bit(t0, count, 1);
2150         __ beqz(t0, L_exit2);
2151         __ sw(value, Address(to, 0));
2152         break;
2153       case T_INT:
2154         __ beqz(count, L_exit2);
2155         __ sw(value, Address(to, 0));
2156         break;
2157       default: ShouldNotReachHere();
2158     }
2159     __ bind(L_exit2);
2160     __ leave();
2161     __ ret();
2162     return start;
2163   }
2164 
2165   void generate_arraycopy_stubs() {
2166     address entry                     = nullptr;
2167     address entry_jbyte_arraycopy     = nullptr;
2168     address entry_jshort_arraycopy    = nullptr;
2169     address entry_jint_arraycopy      = nullptr;
2170     address entry_oop_arraycopy       = nullptr;
2171     address entry_jlong_arraycopy     = nullptr;
2172     address entry_checkcast_arraycopy = nullptr;
2173 
2174     generate_copy_longs(copy_f, c_rarg0, c_rarg1, t1, copy_forwards);
2175     generate_copy_longs(copy_b, c_rarg0, c_rarg1, t1, copy_backwards);
2176 
2177     StubRoutines::riscv::_zero_blocks = generate_zero_blocks();
2178 
2179     //*** jbyte
2180     // Always need aligned and unaligned versions
2181     StubRoutines::_jbyte_disjoint_arraycopy          = generate_disjoint_byte_copy(false, &entry,
2182                                                                                    "jbyte_disjoint_arraycopy");
2183     StubRoutines::_jbyte_arraycopy                   = generate_conjoint_byte_copy(false, entry,
2184                                                                                    &entry_jbyte_arraycopy,
2185                                                                                    "jbyte_arraycopy");
2186     StubRoutines::_arrayof_jbyte_disjoint_arraycopy  = generate_disjoint_byte_copy(true, &entry,
2187                                                                                    "arrayof_jbyte_disjoint_arraycopy");
2188     StubRoutines::_arrayof_jbyte_arraycopy           = generate_conjoint_byte_copy(true, entry, nullptr,
2189                                                                                    "arrayof_jbyte_arraycopy");
2190 
2191     //*** jshort
2192     // Always need aligned and unaligned versions
2193     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2194                                                                                     "jshort_disjoint_arraycopy");
2195     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2196                                                                                     &entry_jshort_arraycopy,
2197                                                                                     "jshort_arraycopy");
2198     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2199                                                                                     "arrayof_jshort_disjoint_arraycopy");
2200     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, nullptr,
2201                                                                                     "arrayof_jshort_arraycopy");
2202 
2203     //*** jint
2204     // Aligned versions
2205     StubRoutines::_arrayof_jint_disjoint_arraycopy   = generate_disjoint_int_copy(true, &entry,
2206                                                                                   "arrayof_jint_disjoint_arraycopy");
2207     StubRoutines::_arrayof_jint_arraycopy            = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2208                                                                                   "arrayof_jint_arraycopy");
2209     // In 64-bit mode we need both aligned and unaligned versions of jint arraycopy.
2210     // entry_jint_arraycopy always points to the unaligned version
2211     StubRoutines::_jint_disjoint_arraycopy           = generate_disjoint_int_copy(false, &entry,
2212                                                                                   "jint_disjoint_arraycopy");
2213     StubRoutines::_jint_arraycopy                    = generate_conjoint_int_copy(false, entry,
2214                                                                                   &entry_jint_arraycopy,
2215                                                                                   "jint_arraycopy");
2216 
2217     //*** jlong
2218     // It is always aligned
2219     StubRoutines::_arrayof_jlong_disjoint_arraycopy  = generate_disjoint_long_copy(true, &entry,
2220                                                                                    "arrayof_jlong_disjoint_arraycopy");
2221     StubRoutines::_arrayof_jlong_arraycopy           = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2222                                                                                    "arrayof_jlong_arraycopy");
2223     StubRoutines::_jlong_disjoint_arraycopy          = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2224     StubRoutines::_jlong_arraycopy                   = StubRoutines::_arrayof_jlong_arraycopy;
2225 
2226     //*** oops
2227     {
2228       // With compressed oops we need unaligned versions; notice that
2229       // we overwrite entry_oop_arraycopy.
2230       bool aligned = !UseCompressedOops;
2231 
2232       StubRoutines::_arrayof_oop_disjoint_arraycopy
2233         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2234                                      /*dest_uninitialized*/false);
2235       StubRoutines::_arrayof_oop_arraycopy
2236         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2237                                      /*dest_uninitialized*/false);
2238       // Aligned versions without pre-barriers
2239       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2240         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2241                                      /*dest_uninitialized*/true);
2242       StubRoutines::_arrayof_oop_arraycopy_uninit
2243         = generate_conjoint_oop_copy(aligned, entry, nullptr, "arrayof_oop_arraycopy_uninit",
2244                                      /*dest_uninitialized*/true);
2245     }
2246 
2247     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2248     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2249     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2250     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2251 
2252     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2253     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", nullptr,
2254                                                                         /*dest_uninitialized*/true);
2255 
2256 
2257     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2258                                                               entry_jbyte_arraycopy,
2259                                                               entry_jshort_arraycopy,
2260                                                               entry_jint_arraycopy,
2261                                                               entry_jlong_arraycopy);
2262 
2263     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2264                                                                entry_jbyte_arraycopy,
2265                                                                entry_jshort_arraycopy,
2266                                                                entry_jint_arraycopy,
2267                                                                entry_oop_arraycopy,
2268                                                                entry_jlong_arraycopy,
2269                                                                entry_checkcast_arraycopy);
2270 
2271     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2272     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2273     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2274     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2275     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2276     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2277   }
2278 
2279   // Code for comparing 16 bytes of two strings with the same encoding.
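  // On entry tmp1 and tmp2 already hold the current 8 bytes of str1 and str2.
  // Those are compared while the next 8 bytes of each string are loaded into
  // tmp5 and cnt1, which are compared in turn; the following 8 bytes are left
  // in tmp1/tmp2 for the next iteration. Branches to DIFF1 if tmp1/tmp2 differ
  // and to DIFF2 if tmp5/cnt1 differ.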
2280   void compare_string_16_bytes_same(Label &DIFF1, Label &DIFF2) {
2281     const Register result = x10, str1 = x11, cnt1 = x12, str2 = x13, tmp1 = x28, tmp2 = x29, tmp4 = x7, tmp5 = x31;
2282     __ ld(tmp5, Address(str1));
2283     __ addi(str1, str1, 8);
2284     __ xorr(tmp4, tmp1, tmp2);
2285     __ ld(cnt1, Address(str2));
2286     __ addi(str2, str2, 8);
2287     __ bnez(tmp4, DIFF1);
2288     __ ld(tmp1, Address(str1));
2289     __ addi(str1, str1, 8);
2290     __ xorr(tmp4, tmp5, cnt1);
2291     __ ld(tmp2, Address(str2));
2292     __ addi(str2, str2, 8);
2293     __ bnez(tmp4, DIFF2);
2294   }
2295 
2296   // Code for comparing 8 characters of two strings, one Latin1 and one UTF-16 encoded.
2297   void compare_string_8_x_LU(Register tmpL, Register tmpU, Register strL, Register strU, Label& DIFF) {
2298     const Register tmp = x30, tmpLval = x12;
2299     __ ld(tmpLval, Address(strL));
2300     __ addi(strL, strL, wordSize);
2301     __ ld(tmpU, Address(strU));
2302     __ addi(strU, strU, wordSize);
2303     __ inflate_lo32(tmpL, tmpLval);
2304     __ xorr(tmp, tmpU, tmpL);
2305     __ bnez(tmp, DIFF);
2306 
2307     __ ld(tmpU, Address(strU));
2308     __ addi(strU, strU, wordSize);
2309     __ inflate_hi32(tmpL, tmpLval);
2310     __ xorr(tmp, tmpU, tmpL);
2311     __ bnez(tmp, DIFF);
2312   }
2313 
2314   // x10  = result
2315   // x11  = str1
2316   // x12  = cnt1
2317   // x13  = str2
2318   // x14  = cnt2
2319   // x28  = tmp1
2320   // x29  = tmp2
2321   // x30  = tmp3
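  //
  // isLU == true means str1 is Latin1 and str2 is UTF-16; isLU == false is the
  // opposite pairing. On entry tmp1 and tmp2 already hold the first chunk of
  // str1 and str2 loaded by the caller; the first 4 characters are compared
  // before the main loop is entered.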
2322   address generate_compare_long_string_different_encoding(bool isLU) {
2323     __ align(CodeEntryAlignment);
2324     StubCodeMark mark(this, "StubRoutines", isLU ? "compare_long_string_different_encoding LU" : "compare_long_string_different_encoding UL");
2325     address entry = __ pc();
2326     Label SMALL_LOOP, TAIL, LOAD_LAST, DONE, CALCULATE_DIFFERENCE;
2327     const Register result = x10, str1 = x11, str2 = x13, cnt2 = x14,
2328                    tmp1 = x28, tmp2 = x29, tmp3 = x30, tmp4 = x12;
2329 
2330     // cnt2 == number of characters left to compare
2331     // Check the first 4 characters, which have already been loaded
2332     __ inflate_lo32(tmp3, isLU ? tmp1 : tmp2);
2333     __ mv(isLU ? tmp1 : tmp2, tmp3);
2334     __ addi(str1, str1, isLU ? wordSize / 2 : wordSize);
2335     __ addi(str2, str2, isLU ? wordSize : wordSize / 2);
2336     __ sub(cnt2, cnt2, wordSize / 2); // 4 characters already loaded
2337 
2338     __ xorr(tmp3, tmp1, tmp2);
2339     __ bnez(tmp3, CALCULATE_DIFFERENCE);
2340 
2341     Register strU = isLU ? str2 : str1,
2342              strL = isLU ? str1 : str2,
2343              tmpU = isLU ? tmp2 : tmp1, // where to keep U for comparison
2344              tmpL = isLU ? tmp1 : tmp2; // where to keep L for comparison
2345 
2346     // To make the main loop 8-byte aligned on strL, load another 4 bytes from strL.
2347     // cnt2 is >= 68 here, so there is no need to check it for >= 0.
2348     __ lwu(tmpL, Address(strL));
2349     __ addi(strL, strL, wordSize / 2);
2350     __ ld(tmpU, Address(strU));
2351     __ addi(strU, strU, wordSize);
2352     __ inflate_lo32(tmp3, tmpL);
2353     __ mv(tmpL, tmp3);
2354     __ xorr(tmp3, tmpU, tmpL);
2355     __ bnez(tmp3, CALCULATE_DIFFERENCE);
2356     __ addi(cnt2, cnt2, -wordSize / 2);
2357 
2358     // strL is now 8-byte aligned
2359     __ sub(cnt2, cnt2, wordSize * 2);
2360     __ bltz(cnt2, TAIL);
2361     __ bind(SMALL_LOOP); // smaller loop
2362       __ sub(cnt2, cnt2, wordSize * 2);
2363       compare_string_8_x_LU(tmpL, tmpU, strL, strU, CALCULATE_DIFFERENCE);
2364       compare_string_8_x_LU(tmpL, tmpU, strL, strU, CALCULATE_DIFFERENCE);
2365       __ bgez(cnt2, SMALL_LOOP);
2366       __ addi(t0, cnt2, wordSize * 2);
2367       __ beqz(t0, DONE);
2368     __ bind(TAIL);  // 1..15 characters left
2369       // Aligned access. Load bytes in portions - 4, 2, 1.
2370 
2371       __ addi(t0, cnt2, wordSize);
2372       __ addi(cnt2, cnt2, wordSize * 2); // amount of characters left to process
2373       __ bltz(t0, LOAD_LAST);
2374       // at least 8 characters remain, so we can do one more compare_string_8_x_LU
2375       compare_string_8_x_LU(tmpL, tmpU, strL, strU, CALCULATE_DIFFERENCE);
2376       __ addi(cnt2, cnt2, -wordSize);
2377       __ beqz(cnt2, DONE);  // no character left
2378       __ bind(LOAD_LAST);   // cnt2 = 1..7 characters left
2379 
2380       __ addi(cnt2, cnt2, -wordSize); // cnt2 is now an offset in strL which points to last 8 bytes
2381       __ slli(t0, cnt2, 1);     // t0 is now an offset in strU which points to last 16 bytes
2382       __ add(strL, strL, cnt2); // Address of last 8 bytes in Latin1 string
2383       __ add(strU, strU, t0);   // Address of last 16 bytes in UTF-16 string
2384       __ load_int_misaligned(tmpL, Address(strL), t0, false);
2385       __ load_long_misaligned(tmpU, Address(strU), t0, 2);
2386       __ inflate_lo32(tmp3, tmpL);
2387       __ mv(tmpL, tmp3);
2388       __ xorr(tmp3, tmpU, tmpL);
2389       __ bnez(tmp3, CALCULATE_DIFFERENCE);
2390 
2391       __ addi(strL, strL, wordSize / 2); // Address of last 4 bytes in Latin1 string
2392       __ addi(strU, strU, wordSize);   // Address of last 8 bytes in UTF-16 string
2393       __ load_int_misaligned(tmpL, Address(strL), t0, false);
2394       __ load_long_misaligned(tmpU, Address(strU), t0, 2);
2395       __ inflate_lo32(tmp3, tmpL);
2396       __ mv(tmpL, tmp3);
2397       __ xorr(tmp3, tmpU, tmpL);
2398       __ bnez(tmp3, CALCULATE_DIFFERENCE);
2399       __ j(DONE); // no character left
2400 
2401       // Find the first different characters in the longwords and
2402       // compute their difference.
2403     __ bind(CALCULATE_DIFFERENCE);
2404       __ ctzc_bit(tmp4, tmp3);
2405       __ srl(tmp1, tmp1, tmp4);
2406       __ srl(tmp2, tmp2, tmp4);
2407       __ andi(tmp1, tmp1, 0xFFFF);
2408       __ andi(tmp2, tmp2, 0xFFFF);
2409       __ sub(result, tmp1, tmp2);
2410     __ bind(DONE);
2411       __ ret();
2412     return entry;
2413   }
2414 
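  // nmethod entry barrier stub. It calls BarrierSetNMethod::nmethod_stub_entry_barrier;
  // a non-zero result makes the stub unwind its frame and continue at the address
  // reloaded from the stack (deoptimize_label), otherwise it returns to the nmethod.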
2415   address generate_method_entry_barrier() {
2416     __ align(CodeEntryAlignment);
2417     StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");
2418 
2419     Label deoptimize_label;
2420 
2421     address start = __ pc();
2422 
2423     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
2424 
2425     if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
2426       BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
2427       Address thread_epoch_addr(xthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
2428       __ la(t1, ExternalAddress(bs_asm->patching_epoch_addr()));
2429       __ lwu(t1, t1);
2430       __ sw(t1, thread_epoch_addr);
2431       __ membar(__ LoadLoad);
2432     }
2433 
2434     __ set_last_Java_frame(sp, fp, ra);
2435 
2436     __ enter();
2437     __ add(t1, sp, wordSize);
2438 
2439     __ sub(sp, sp, 4 * wordSize);
2440 
2441     __ push_call_clobbered_registers();
2442 
2443     __ mv(c_rarg0, t1);
2444     __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
2445 
2446     __ reset_last_Java_frame(true);
2447 
2448     __ mv(t0, x10);
2449 
2450     __ pop_call_clobbered_registers();
2451 
2452     __ bnez(t0, deoptimize_label);
2453 
2454     __ leave();
2455     __ ret();
2456 
2457     __ BIND(deoptimize_label);
2458 
2459     __ ld(t0, Address(sp, 0));
2460     __ ld(fp, Address(sp, wordSize));
2461     __ ld(ra, Address(sp, wordSize * 2));
2462     __ ld(t1, Address(sp, wordSize * 3));
2463 
2464     __ mv(sp, t0);
2465     __ jr(t1);
2466 
2467     return start;
2468   }
2469 
2470   // x10  = result
2471   // x11  = str1
2472   // x12  = cnt1
2473   // x13  = str2
2474   // x14  = cnt2
2475   // x28  = tmp1
2476   // x29  = tmp2
2477   // x30  = tmp3
2478   // x31  = tmp4
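  //
  // isLL == true compares two Latin1 strings (one byte per character); false
  // compares two UTF-16 strings (two bytes per character). On entry tmp1 and
  // tmp2 already hold the first 8 bytes of each string, loaded by the caller.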
2479   address generate_compare_long_string_same_encoding(bool isLL) {
2480     __ align(CodeEntryAlignment);
2481     StubCodeMark mark(this, "StubRoutines", isLL ?
2482                       "compare_long_string_same_encoding LL" : "compare_long_string_same_encoding UU");
2483     address entry = __ pc();
2484     Label SMALL_LOOP, CHECK_LAST, DIFF2, TAIL,
2485           LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF;
2486     const Register result = x10, str1 = x11, cnt1 = x12, str2 = x13, cnt2 = x14,
2487                    tmp1 = x28, tmp2 = x29, tmp3 = x30, tmp4 = x7, tmp5 = x31;
2488     RegSet spilled_regs = RegSet::of(tmp4, tmp5);
2489 
2490     // cnt1/cnt2 contain the number of characters to compare; cnt1 can be re-used.
2491     // Update the cnt2 counter for the 8 bytes that have already been loaded.
2492     __ sub(cnt2, cnt2, wordSize / (isLL ? 1 : 2));
2493     // Advance the pointers past the bytes already read.
2494     __ add(str1, str1, wordSize);
2495     __ add(str2, str2, wordSize);
2496     // less than 16 bytes left?
2497     __ sub(cnt2, cnt2, isLL ? 16 : 8);
2498     __ push_reg(spilled_regs, sp);
2499     __ bltz(cnt2, TAIL);
2500     __ bind(SMALL_LOOP);
2501       compare_string_16_bytes_same(DIFF, DIFF2);
2502       __ sub(cnt2, cnt2, isLL ? 16 : 8);
2503       __ bgez(cnt2, SMALL_LOOP);
2504     __ bind(TAIL);
2505       __ addi(cnt2, cnt2, isLL ? 16 : 8);
2506       __ beqz(cnt2, LAST_CHECK_AND_LENGTH_DIFF);
2507       __ sub(cnt2, cnt2, isLL ? 8 : 4);
2508       __ blez(cnt2, CHECK_LAST);
2509       __ xorr(tmp4, tmp1, tmp2);
2510       __ bnez(tmp4, DIFF);
2511       __ ld(tmp1, Address(str1));
2512       __ addi(str1, str1, 8);
2513       __ ld(tmp2, Address(str2));
2514       __ addi(str2, str2, 8);
2515       __ sub(cnt2, cnt2, isLL ? 8 : 4);
2516     __ bind(CHECK_LAST);
2517       if (!isLL) {
2518         __ add(cnt2, cnt2, cnt2); // now in bytes
2519       }
2520       __ xorr(tmp4, tmp1, tmp2);
2521       __ bnez(tmp4, DIFF);
2522       __ add(str1, str1, cnt2);
2523       __ load_long_misaligned(tmp5, Address(str1), tmp3, isLL ? 1 : 2);
2524       __ add(str2, str2, cnt2);
2525       __ load_long_misaligned(cnt1, Address(str2), tmp3, isLL ? 1 : 2);
2526       __ xorr(tmp4, tmp5, cnt1);
2527       __ beqz(tmp4, LENGTH_DIFF);
2528       // Find the first different characters in the longwords and
2529       // compute their difference.
2530     __ bind(DIFF2);
2531       __ ctzc_bit(tmp3, tmp4, isLL); // count zero from lsb to msb
2532       __ srl(tmp5, tmp5, tmp3);
2533       __ srl(cnt1, cnt1, tmp3);
2534       if (isLL) {
2535         __ andi(tmp5, tmp5, 0xFF);
2536         __ andi(cnt1, cnt1, 0xFF);
2537       } else {
2538         __ andi(tmp5, tmp5, 0xFFFF);
2539         __ andi(cnt1, cnt1, 0xFFFF);
2540       }
2541       __ sub(result, tmp5, cnt1);
2542       __ j(LENGTH_DIFF);
2543     __ bind(DIFF);
2544       __ ctzc_bit(tmp3, tmp4, isLL); // count zero from lsb to msb
2545       __ srl(tmp1, tmp1, tmp3);
2546       __ srl(tmp2, tmp2, tmp3);
2547       if (isLL) {
2548         __ andi(tmp1, tmp1, 0xFF);
2549         __ andi(tmp2, tmp2, 0xFF);
2550       } else {
2551         __ andi(tmp1, tmp1, 0xFFFF);
2552         __ andi(tmp2, tmp2, 0xFFFF);
2553       }
2554       __ sub(result, tmp1, tmp2);
2555       __ j(LENGTH_DIFF);
2556     __ bind(LAST_CHECK_AND_LENGTH_DIFF);
2557       __ xorr(tmp4, tmp1, tmp2);
2558       __ bnez(tmp4, DIFF);
2559     __ bind(LENGTH_DIFF);
2560       __ pop_reg(spilled_regs, sp);
2561       __ ret();
2562     return entry;
2563   }
2564 
2565   void generate_compare_long_strings() {
2566     StubRoutines::riscv::_compare_long_string_LL = generate_compare_long_string_same_encoding(true);
2567     StubRoutines::riscv::_compare_long_string_UU = generate_compare_long_string_same_encoding(false);
2568     StubRoutines::riscv::_compare_long_string_LU = generate_compare_long_string_different_encoding(true);
2569     StubRoutines::riscv::_compare_long_string_UL = generate_compare_long_string_different_encoding(false);
2570   }
2571 
2572   // x10 result
2573   // x11 src
2574   // x12 src count
2575   // x13 pattern
2576   // x14 pattern count
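  //
  // Linear scan of the haystack for the needle, one 8-byte chunk of the
  // haystack at a time, using the match_mask technique sketched below; on a
  // miss the stub leaves -1 in result (see NOMATCH), on a hit the match index.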
2577   address generate_string_indexof_linear(bool needle_isL, bool haystack_isL)
2578   {
2579     const char* stubName = needle_isL
2580            ? (haystack_isL ? "indexof_linear_ll" : "indexof_linear_ul")
2581            : "indexof_linear_uu";
2582     __ align(CodeEntryAlignment);
2583     StubCodeMark mark(this, "StubRoutines", stubName);
2584     address entry = __ pc();
2585 
2586     int needle_chr_size = needle_isL ? 1 : 2;
2587     int haystack_chr_size = haystack_isL ? 1 : 2;
2588     int needle_chr_shift = needle_isL ? 0 : 1;
2589     int haystack_chr_shift = haystack_isL ? 0 : 1;
2590     bool isL = needle_isL && haystack_isL;
2591     // parameters
2592     Register result = x10, haystack = x11, haystack_len = x12, needle = x13, needle_len = x14;
2593     // temporary registers
2594     Register mask1 = x20, match_mask = x21, first = x22, trailing_zeros = x23, mask2 = x24, tmp = x25;
2595     // redefinitions
2596     Register ch1 = x28, ch2 = x29;
2597     RegSet spilled_regs = RegSet::range(x20, x25) + RegSet::range(x28, x29);
2598 
2599     __ push_reg(spilled_regs, sp);
2600 
2601     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
2602           L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
2603           L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
2604           L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
2605           L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
2606           L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
2607 
2608     __ ld(ch1, Address(needle));
2609     __ ld(ch2, Address(haystack));
2610     // src.length - pattern.length
2611     __ sub(haystack_len, haystack_len, needle_len);
2612 
2613     // first is needle[0]
2614     __ andi(first, ch1, needle_isL ? 0xFF : 0xFFFF, first);
2615     uint64_t mask0101 = UCONST64(0x0101010101010101);
2616     uint64_t mask0001 = UCONST64(0x0001000100010001);
2617     __ mv(mask1, haystack_isL ? mask0101 : mask0001);
2618     __ mul(first, first, mask1);
2619     uint64_t mask7f7f = UCONST64(0x7f7f7f7f7f7f7f7f);
2620     uint64_t mask7fff = UCONST64(0x7fff7fff7fff7fff);
2621     __ mv(mask2, haystack_isL ? mask7f7f : mask7fff);
2622     if (needle_isL != haystack_isL) {
2623       __ mv(tmp, ch1);
2624     }
2625     __ sub(haystack_len, haystack_len, wordSize / haystack_chr_size - 1);
2626     __ blez(haystack_len, L_SMALL);
2627 
2628     if (needle_isL != haystack_isL) {
2629       __ inflate_lo32(ch1, tmp, match_mask, trailing_zeros);
2630     }
2631     // xorr, sub, orr, notr, andr
2632     // compare and set match_mask[i] with 0x80/0x8000 (Latin1/UTF16) if ch2[i] == first[i]
2633     // e.g.:
2634     // first:        aa aa aa aa aa aa aa aa
2635     // ch2:          aa aa li nx jd ka aa aa
2636     // match_mask:   80 80 00 00 00 00 80 80
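    // In effect this is the classic SWAR zero-detection trick applied to
    // v = ch2 ^ first:  match_mask = (v - mask1) & ~(v | mask2). Every lane of
    // v that is zero (i.e. where ch2 matches first) gets its top bit set in
    // match_mask; candidate positions are then verified character by character
    // in the compare loops below.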
2637     __ compute_match_mask(ch2, first, match_mask, mask1, mask2);
2638 
2639     // search for the first char of the needle; on success, go to L_HAS_ZERO
2640     __ bnez(match_mask, L_HAS_ZERO);
2641     __ sub(haystack_len, haystack_len, wordSize / haystack_chr_size);
2642     __ add(result, result, wordSize / haystack_chr_size);
2643     __ add(haystack, haystack, wordSize);
2644     __ bltz(haystack_len, L_POST_LOOP);
2645 
2646     __ bind(L_LOOP);
2647     __ ld(ch2, Address(haystack));
2648     __ compute_match_mask(ch2, first, match_mask, mask1, mask2);
2649     __ bnez(match_mask, L_HAS_ZERO);
2650 
2651     __ bind(L_LOOP_PROCEED);
2652     __ sub(haystack_len, haystack_len, wordSize / haystack_chr_size);
2653     __ add(haystack, haystack, wordSize);
2654     __ add(result, result, wordSize / haystack_chr_size);
2655     __ bgez(haystack_len, L_LOOP);
2656 
2657     __ bind(L_POST_LOOP);
2658     __ mv(ch2, -wordSize / haystack_chr_size);
2659     __ ble(haystack_len, ch2, NOMATCH); // no extra characters to check
2660     __ ld(ch2, Address(haystack));
2661     __ slli(haystack_len, haystack_len, LogBitsPerByte + haystack_chr_shift);
2662     __ neg(haystack_len, haystack_len);
2663     __ xorr(ch2, first, ch2);
2664     __ sub(match_mask, ch2, mask1);
2665     __ orr(ch2, ch2, mask2);
2666     __ mv(trailing_zeros, -1); // all bits set
2667     __ j(L_SMALL_PROCEED);
2668 
2669     __ align(OptoLoopAlignment);
2670     __ bind(L_SMALL);
2671     __ slli(haystack_len, haystack_len, LogBitsPerByte + haystack_chr_shift);
2672     __ neg(haystack_len, haystack_len);
2673     if (needle_isL != haystack_isL) {
2674       __ inflate_lo32(ch1, tmp, match_mask, trailing_zeros);
2675     }
2676     __ xorr(ch2, first, ch2);
2677     __ sub(match_mask, ch2, mask1);
2678     __ orr(ch2, ch2, mask2);
2679     __ mv(trailing_zeros, -1); // all bits set
2680 
2681     __ bind(L_SMALL_PROCEED);
2682     __ srl(trailing_zeros, trailing_zeros, haystack_len); // mask. zeroes on useless bits.
2683     __ notr(ch2, ch2);
2684     __ andr(match_mask, match_mask, ch2);
2685     __ andr(match_mask, match_mask, trailing_zeros); // clear useless bits and check
2686     __ beqz(match_mask, NOMATCH);
2687 
2688     __ bind(L_SMALL_HAS_ZERO_LOOP);
2689     __ ctzc_bit(trailing_zeros, match_mask, haystack_isL, ch2, tmp); // count trailing zeros
2690     __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15);
2691     __ mv(ch2, wordSize / haystack_chr_size);
2692     __ ble(needle_len, ch2, L_SMALL_CMP_LOOP_LAST_CMP2);
2693     __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL);
2694     __ mv(trailing_zeros, wordSize / haystack_chr_size);
2695     __ bne(ch1, ch2, L_SMALL_CMP_LOOP_NOMATCH);
2696 
2697     __ bind(L_SMALL_CMP_LOOP);
2698     __ shadd(first, trailing_zeros, needle, first, needle_chr_shift);
2699     __ shadd(ch2, trailing_zeros, haystack, ch2, haystack_chr_shift);
2700     needle_isL ? __ lbu(first, Address(first)) : __ lhu(first, Address(first));
2701     haystack_isL ? __ lbu(ch2, Address(ch2)) : __ lhu(ch2, Address(ch2));
2702     __ add(trailing_zeros, trailing_zeros, 1);
2703     __ bge(trailing_zeros, needle_len, L_SMALL_CMP_LOOP_LAST_CMP);
2704     __ beq(first, ch2, L_SMALL_CMP_LOOP);
2705 
2706     __ bind(L_SMALL_CMP_LOOP_NOMATCH);
2707     __ beqz(match_mask, NOMATCH);
2708     __ ctzc_bit(trailing_zeros, match_mask, haystack_isL, tmp, ch2);
2709     __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15);
2710     __ add(result, result, 1);
2711     __ add(haystack, haystack, haystack_chr_size);
2712     __ j(L_SMALL_HAS_ZERO_LOOP);
2713 
2714     __ align(OptoLoopAlignment);
2715     __ bind(L_SMALL_CMP_LOOP_LAST_CMP);
2716     __ bne(first, ch2, L_SMALL_CMP_LOOP_NOMATCH);
2717     __ j(DONE);
2718 
2719     __ align(OptoLoopAlignment);
2720     __ bind(L_SMALL_CMP_LOOP_LAST_CMP2);
2721     __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL);
2722     __ bne(ch1, ch2, L_SMALL_CMP_LOOP_NOMATCH);
2723     __ j(DONE);
2724 
2725     __ align(OptoLoopAlignment);
2726     __ bind(L_HAS_ZERO);
2727     __ ctzc_bit(trailing_zeros, match_mask, haystack_isL, tmp, ch2);
2728     __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15);
2729     __ slli(needle_len, needle_len, BitsPerByte * wordSize / 2);
2730     __ orr(haystack_len, haystack_len, needle_len); // restore needle_len(32bits)
2731     __ sub(result, result, 1); // array index from 0, so result -= 1
2732 
2733     __ bind(L_HAS_ZERO_LOOP);
2734     __ mv(needle_len, wordSize / haystack_chr_size);
2735     __ srli(ch2, haystack_len, BitsPerByte * wordSize / 2);
2736     __ bge(needle_len, ch2, L_CMP_LOOP_LAST_CMP2);
2737     // load next 8 bytes from haystack, and increase result index
2738     __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL);
2739     __ add(result, result, 1);
2740     __ mv(trailing_zeros, wordSize / haystack_chr_size);
2741     __ bne(ch1, ch2, L_CMP_LOOP_NOMATCH);
2742 
2743     // compare one char
2744     __ bind(L_CMP_LOOP);
2745     __ shadd(needle_len, trailing_zeros, needle, needle_len, needle_chr_shift);
2746     needle_isL ? __ lbu(needle_len, Address(needle_len)) : __ lhu(needle_len, Address(needle_len));
2747     __ shadd(ch2, trailing_zeros, haystack, ch2, haystack_chr_shift);
2748     haystack_isL ? __ lbu(ch2, Address(ch2)) : __ lhu(ch2, Address(ch2));
2749     __ add(trailing_zeros, trailing_zeros, 1); // next char index
2750     __ srli(tmp, haystack_len, BitsPerByte * wordSize / 2);
2751     __ bge(trailing_zeros, tmp, L_CMP_LOOP_LAST_CMP);
2752     __ beq(needle_len, ch2, L_CMP_LOOP);
2753 
2754     __ bind(L_CMP_LOOP_NOMATCH);
2755     __ beqz(match_mask, L_HAS_ZERO_LOOP_NOMATCH);
2756     __ ctzc_bit(trailing_zeros, match_mask, haystack_isL, needle_len, ch2); // find next "first" char index
2757     __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15);
2758     __ add(haystack, haystack, haystack_chr_size);
2759     __ j(L_HAS_ZERO_LOOP);
2760 
2761     __ align(OptoLoopAlignment);
2762     __ bind(L_CMP_LOOP_LAST_CMP);
2763     __ bne(needle_len, ch2, L_CMP_LOOP_NOMATCH);
2764     __ j(DONE);
2765 
2766     __ align(OptoLoopAlignment);
2767     __ bind(L_CMP_LOOP_LAST_CMP2);
2768     __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL);
2769     __ add(result, result, 1);
2770     __ bne(ch1, ch2, L_CMP_LOOP_NOMATCH);
2771     __ j(DONE);
2772 
2773     __ align(OptoLoopAlignment);
2774     __ bind(L_HAS_ZERO_LOOP_NOMATCH);
2775     // 1) Restore the "result" index. Until the L_HAS_ZERO block the index was a
2776     // multiple of wordSize/haystack_chr_size. The byte octet was analyzed in
2777     // L_HAS_ZERO_LOOP, so result grew by at most wordSize/haystack_chr_size - 1
2778     // and the higher bits were not changed. L_LOOP_PROCEED will increase result
2779     // by the number of analyzed characters, so we can simply reset the lower
2780     // bits of result here: clear 2 lower bits for UU/UL and 3 lower bits for LL.
2781     // 2) Restore needle_len and haystack_len from the "compressed" haystack_len.
2782     // 3) Advance haystack to the next haystack octet. result & (7 or 3) is the
2783     // index of the last analyzed substring inside the current octet, so haystack
2784     // is at its start address and needs to be advanced to the next octet.
2785     __ andi(match_mask, result, wordSize / haystack_chr_size - 1);
2786     __ srli(needle_len, haystack_len, BitsPerByte * wordSize / 2);
2787     __ andi(result, result, haystack_isL ? -8 : -4);
2788     __ slli(tmp, match_mask, haystack_chr_shift);
2789     __ sub(haystack, haystack, tmp);
2790     __ sign_extend(haystack_len, haystack_len, 32);
2791     __ j(L_LOOP_PROCEED);
2792 
2793     __ align(OptoLoopAlignment);
2794     __ bind(NOMATCH);
2795     __ mv(result, -1);
2796 
2797     __ bind(DONE);
2798     __ pop_reg(spilled_regs, sp);
2799     __ ret();
2800     return entry;
2801   }
2802 
2803   void generate_string_indexof_stubs()
2804   {
2805     StubRoutines::riscv::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
2806     StubRoutines::riscv::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
2807     StubRoutines::riscv::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
2808   }
2809 
2810 #ifdef COMPILER2
2811   address generate_lookup_secondary_supers_table_stub(u1 super_klass_index) {
2812     StubCodeMark mark(this, "StubRoutines", "lookup_secondary_supers_table");
2813 
2814     address start = __ pc();
2815     const Register
2816       r_super_klass  = x10,
2817       r_array_base   = x11,
2818       r_array_length = x12,
2819       r_array_index  = x13,
2820       r_sub_klass    = x14,
2821       result         = x15,
2822       r_bitmap       = x16;
2823 
2824     Label L_success;
2825     __ enter();
2826     __ lookup_secondary_supers_table(r_sub_klass, r_super_klass, result,
2827                                      r_array_base, r_array_length, r_array_index,
2828                                      r_bitmap, super_klass_index, /*stub_is_near*/true);
2829     __ leave();
2830     __ ret();
2831 
2832     return start;
2833   }
2834 
2835   // Slow path implementation for UseSecondarySupersTable.
2836   address generate_lookup_secondary_supers_table_slow_path_stub() {
2837     StubCodeMark mark(this, "StubRoutines", "lookup_secondary_supers_table_slow_path");
2838 
2839     address start = __ pc();
2840     const Register
2841       r_super_klass  = x10,        // argument
2842       r_array_base   = x11,        // argument
2843       temp1          = x12,        // tmp
2844       r_array_index  = x13,        // argument
2845       result         = x15,        // argument
2846       r_bitmap       = x16;        // argument
2847 
2848 
2849     __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, result, temp1);
2850     __ ret();
2851 
2852     return start;
2853   }
2854 
2855   address generate_mulAdd()
2856   {
2857     __ align(CodeEntryAlignment);
2858     StubCodeMark mark(this, "StubRoutines", "mulAdd");
2859 
2860     address entry = __ pc();
2861 
2862     const Register out     = x10;
2863     const Register in      = x11;
2864     const Register offset  = x12;
2865     const Register len     = x13;
2866     const Register k       = x14;
2867     const Register tmp     = x28;
2868 
2869     BLOCK_COMMENT("Entry:");
2870     __ enter();
2871     __ mul_add(out, in, offset, len, k, tmp);
2872     __ leave();
2873     __ ret();
2874 
2875     return entry;
2876   }
2877 
2878   /**
2879    *  Arguments:
2880    *
2881    *  Input:
2882    *    c_rarg0   - x address
2883    *    c_rarg1   - x length
2884    *    c_rarg2   - y address
2885    *    c_rarg3   - y length
2886    *    c_rarg4   - z address
2887    */
2888   address generate_multiplyToLen()
2889   {
2890     __ align(CodeEntryAlignment);
2891     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
2892     address entry = __ pc();
2893 
2894     const Register x     = x10;
2895     const Register xlen  = x11;
2896     const Register y     = x12;
2897     const Register ylen  = x13;
2898     const Register z     = x14;
2899 
2900     const Register tmp0  = x15;
2901     const Register tmp1  = x16;
2902     const Register tmp2  = x17;
2903     const Register tmp3  = x7;
2904     const Register tmp4  = x28;
2905     const Register tmp5  = x29;
2906     const Register tmp6  = x30;
2907     const Register tmp7  = x31;
2908 
2909     BLOCK_COMMENT("Entry:");
2910     __ enter(); // required for proper stackwalking of RuntimeStub frame
2911     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
2912     __ leave(); // required for proper stackwalking of RuntimeStub frame
2913     __ ret();
2914 
2915     return entry;
2916   }
2917 
2918   address generate_squareToLen()
2919   {
2920     __ align(CodeEntryAlignment);
2921     StubCodeMark mark(this, "StubRoutines", "squareToLen");
2922     address entry = __ pc();
2923 
2924     const Register x     = x10;
2925     const Register xlen  = x11;
2926     const Register z     = x12;
2927     const Register y     = x14; // == x
2928     const Register ylen  = x15; // == xlen
2929 
2930     const Register tmp0  = x13; // zlen, unused
2931     const Register tmp1  = x16;
2932     const Register tmp2  = x17;
2933     const Register tmp3  = x7;
2934     const Register tmp4  = x28;
2935     const Register tmp5  = x29;
2936     const Register tmp6  = x30;
2937     const Register tmp7  = x31;
2938 
2939     BLOCK_COMMENT("Entry:");
2940     __ enter();
2941     __ mv(y, x);
2942     __ mv(ylen, xlen);
2943     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
2944     __ leave();
2945     __ ret();
2946 
2947     return entry;
2948   }
2949 
2950   // Arguments:
2951   //
2952   // Input:
2953   //   c_rarg0   - newArr address
2954   //   c_rarg1   - oldArr address
2955   //   c_rarg2   - newIdx
2956   //   c_rarg3   - shiftCount
2957   //   c_rarg4   - numIter
2958   //
2959   address generate_bigIntegerLeftShift() {
2960     __ align(CodeEntryAlignment);
2961     StubCodeMark mark(this, "StubRoutines", "bigIntegerLeftShiftWorker");
2962     address entry = __ pc();
2963 
2964     Label loop, exit;
2965 
2966     Register newArr        = c_rarg0;
2967     Register oldArr        = c_rarg1;
2968     Register newIdx        = c_rarg2;
2969     Register shiftCount    = c_rarg3;
2970     Register numIter       = c_rarg4;
2971 
2972     Register shiftRevCount = c_rarg5;
2973     Register oldArrNext    = t1;
2974 
2975     __ beqz(numIter, exit);
2976     __ shadd(newArr, newIdx, newArr, t0, 2);
2977 
2978     __ mv(shiftRevCount, 32);
2979     __ sub(shiftRevCount, shiftRevCount, shiftCount);
2980 
2981     __ bind(loop);
2982     __ addi(oldArrNext, oldArr, 4);
2983     __ vsetvli(t0, numIter, Assembler::e32, Assembler::m4);
2984     __ vle32_v(v0, oldArr);
2985     __ vle32_v(v4, oldArrNext);
2986     __ vsll_vx(v0, v0, shiftCount);
2987     __ vsrl_vx(v4, v4, shiftRevCount);
2988     __ vor_vv(v0, v0, v4);
2989     __ vse32_v(v0, newArr);
2990     __ sub(numIter, numIter, t0);
2991     __ shadd(oldArr, t0, oldArr, t1, 2);
2992     __ shadd(newArr, t0, newArr, t1, 2);
2993     __ bnez(numIter, loop);
2994 
2995     __ bind(exit);
2996     __ ret();
2997 
2998     return entry;
2999   }
3000 
3001   // Arguments:
3002   //
3003   // Input:
3004   //   c_rarg0   - newArr address
3005   //   c_rarg1   - oldArr address
3006   //   c_rarg2   - newIdx
3007   //   c_rarg3   - shiftCount
3008   //   c_rarg4   - numIter
3009   //
3010   address generate_bigIntegerRightShift() {
3011     __ align(CodeEntryAlignment);
3012     StubCodeMark mark(this, "StubRoutines", "bigIntegerRightShiftWorker");
3013     address entry = __ pc();
3014 
3015     Label loop, exit;
3016 
3017     Register newArr        = c_rarg0;
3018     Register oldArr        = c_rarg1;
3019     Register newIdx        = c_rarg2;
3020     Register shiftCount    = c_rarg3;
3021     Register numIter       = c_rarg4;
3022     Register idx           = numIter;
3023 
3024     Register shiftRevCount = c_rarg5;
3025     Register oldArrNext    = c_rarg6;
3026     Register newArrCur     = t0;
3027     Register oldArrCur     = t1;
3028 
3029     __ beqz(idx, exit);
3030     __ shadd(newArr, newIdx, newArr, t0, 2);
3031 
3032     __ mv(shiftRevCount, 32);
3033     __ sub(shiftRevCount, shiftRevCount, shiftCount);
3034 
3035     __ bind(loop);
3036     __ vsetvli(t0, idx, Assembler::e32, Assembler::m4);
3037     __ sub(idx, idx, t0);
3038     __ shadd(oldArrNext, idx, oldArr, t1, 2);
3039     __ shadd(newArrCur, idx, newArr, t1, 2);
3040     __ addi(oldArrCur, oldArrNext, 4);
3041     __ vle32_v(v0, oldArrCur);
3042     __ vle32_v(v4, oldArrNext);
3043     __ vsrl_vx(v0, v0, shiftCount);
3044     __ vsll_vx(v4, v4, shiftRevCount);
3045     __ vor_vv(v0, v0, v4);
3046     __ vse32_v(v0, newArrCur);
3047     __ bnez(idx, loop);
3048 
3049     __ bind(exit);
3050     __ ret();
3051 
3052     return entry;
3053   }
3054 #endif
3055 
3056 #ifdef COMPILER2
3057   class MontgomeryMultiplyGenerator : public MacroAssembler {
3058 
3059     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
3060       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2, Ri, Rj;
3061 
3062     RegSet _toSave;
3063     bool _squaring;
3064 
3065   public:
3066     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
3067       : MacroAssembler(as->code()), _squaring(squaring) {
3068 
3069       // Register allocation
3070 
3071       RegSetIterator<Register> regs = RegSet::range(x10, x26).begin();
3072       Pa_base = *regs;       // Argument registers
3073       if (squaring) {
3074         Pb_base = Pa_base;
3075       } else {
3076         Pb_base = *++regs;
3077       }
3078       Pn_base = *++regs;
3079       Rlen = *++regs;
3080       inv = *++regs;
3081       Pm_base = *++regs;
3082 
3083                         // Working registers:
3084       Ra =  *++regs;    // The current digit of a, b, n, and m.
3085       Rb =  *++regs;
3086       Rm =  *++regs;
3087       Rn =  *++regs;
3088 
3089       Pa =  *++regs;      // Pointers to the current/next digit of a, b, n, and m.
3090       Pb =  *++regs;
3091       Pm =  *++regs;
3092       Pn =  *++regs;
3093 
3094       tmp0 =  *++regs;    // Three registers which form a
3095       tmp1 =  *++regs;    // triple-precision accumulator.
3096       tmp2 =  *++regs;
3097 
3098       Ri =  x6;         // Inner and outer loop indexes.
3099       Rj =  x7;
3100 
3101       Rhi_ab = x28;     // Product registers: low and high parts
3102       Rlo_ab = x29;     // of a*b and m*n.
3103       Rhi_mn = x30;
3104       Rlo_mn = x31;
3105 
3106       // x18 and up are callee-saved.
3107       _toSave = RegSet::range(x18, *regs) + Pm_base;
3108     }
3109 
3110   private:
3111     void save_regs() {
3112       push_reg(_toSave, sp);
3113     }
3114 
3115     void restore_regs() {
3116       pop_reg(_toSave, sp);
3117     }
3118 
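    // Invoke `block` exactly `count` times, unrolled by two: an odd count is
    // handled by branching into the loop body at its second call.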
3119     template <typename T>
3120     void unroll_2(Register count, T block) {
3121       Label loop, end, odd;
3122       beqz(count, end);
3123       test_bit(t0, count, 0);
3124       bnez(t0, odd);
3125       align(16);
3126       bind(loop);
3127       (this->*block)();
3128       bind(odd);
3129       (this->*block)();
3130       addi(count, count, -2);
3131       bgtz(count, loop);
3132       bind(end);
3133     }
3134 
3135     template <typename T>
3136     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
3137       Label loop, end, odd;
3138       beqz(count, end);
3139       test_bit(tmp, count, 0);
3140       bnez(tmp, odd);
3141       align(16);
3142       bind(loop);
3143       (this->*block)(d, s, tmp);
3144       bind(odd);
3145       (this->*block)(d, s, tmp);
3146       addi(count, count, -2);
3147       bgtz(count, loop);
3148       bind(end);
3149     }
3150 
3151     void pre1(RegisterOrConstant i) {
3152       block_comment("pre1");
3153       // Pa = Pa_base;
3154       // Pb = Pb_base + i;
3155       // Pm = Pm_base;
3156       // Pn = Pn_base + i;
3157       // Ra = *Pa;
3158       // Rb = *Pb;
3159       // Rm = *Pm;
3160       // Rn = *Pn;
3161       if (i.is_register()) {
3162         slli(t0, i.as_register(), LogBytesPerWord);
3163       } else {
3164         mv(t0, i.as_constant());
3165         slli(t0, t0, LogBytesPerWord);
3166       }
3167 
3168       mv(Pa, Pa_base);
3169       add(Pb, Pb_base, t0);
3170       mv(Pm, Pm_base);
3171       add(Pn, Pn_base, t0);
3172 
3173       ld(Ra, Address(Pa));
3174       ld(Rb, Address(Pb));
3175       ld(Rm, Address(Pm));
3176       ld(Rn, Address(Pn));
3177 
3178       // Zero the m*n result.
3179       mv(Rhi_mn, zr);
3180       mv(Rlo_mn, zr);
3181     }
3182 
3183     // The core multiply-accumulate step of a Montgomery
3184     // multiplication.  The idea is to schedule operations as a
3185     // pipeline so that instructions with long latencies (loads and
3186     // multiplies) have time to complete before their results are
3187     // used.  This benefits in-order implementations of the architecture
3188     // the most, but out-of-order ones also benefit.
3189     void step() {
3190       block_comment("step");
3191       // MACC(Ra, Rb, tmp0, tmp1, tmp2);
3192       // Ra = *++Pa;
3193       // Rb = *--Pb;
3194       mulhu(Rhi_ab, Ra, Rb);
3195       mul(Rlo_ab, Ra, Rb);
3196       addi(Pa, Pa, wordSize);
3197       ld(Ra, Address(Pa));
3198       addi(Pb, Pb, -wordSize);
3199       ld(Rb, Address(Pb));
3200       acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2); // The pending m*n from the
3201                                             // previous iteration.
3202       // MACC(Rm, Rn, tmp0, tmp1, tmp2);
3203       // Rm = *++Pm;
3204       // Rn = *--Pn;
3205       mulhu(Rhi_mn, Rm, Rn);
3206       mul(Rlo_mn, Rm, Rn);
3207       addi(Pm, Pm, wordSize);
3208       ld(Rm, Address(Pm));
3209       addi(Pn, Pn, -wordSize);
3210       ld(Rn, Address(Pn));
3211       acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2);
3212     }
3213 
3214     void post1() {
3215       block_comment("post1");
3216 
3217       // MACC(Ra, Rb, tmp0, tmp1, tmp2);
3218       // Ra = *++Pa;
3219       // Rb = *--Pb;
3220       mulhu(Rhi_ab, Ra, Rb);
3221       mul(Rlo_ab, Ra, Rb);
3222       acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2);  // The pending m*n
3223       acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2);
3224 
3225       // *Pm = Rm = tmp0 * inv;
3226       mul(Rm, tmp0, inv);
3227       sd(Rm, Address(Pm));
3228 
3229       // MACC(Rm, Rn, tmp0, tmp1, tmp2);
3230       // tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0;
3231       mulhu(Rhi_mn, Rm, Rn);
3232 
3233 #ifndef PRODUCT
3234       // assert(m[i] * n[0] + tmp0 == 0, "broken Montgomery multiply");
3235       {
3236         mul(Rlo_mn, Rm, Rn);
3237         add(Rlo_mn, tmp0, Rlo_mn);
3238         Label ok;
3239         beqz(Rlo_mn, ok);
3240         stop("broken Montgomery multiply");
3241         bind(ok);
3242       }
3243 #endif
3244       // We have very carefully set things up so that
3245       // m[i]*n[0] + tmp0 == 0 (mod b), so we don't have to calculate
3246       // the lower half of Rm * Rn because we know the result already:
3247       // it must be -tmp0.  tmp0 + (-tmp0) must generate a carry iff
3248       // tmp0 != 0.  So, rather than do a mul and a cad we just set
3249       // the carry flag iff tmp0 is nonzero.
3250       //
3251       // mul(Rlo_mn, Rm, Rn);
3252       // cad(zr, tmp0, Rlo_mn);
3253       addi(t0, tmp0, -1);
3254       sltu(t0, t0, tmp0); // Set carry iff tmp0 is nonzero
3255       cadc(tmp0, tmp1, Rhi_mn, t0);
3256       adc(tmp1, tmp2, zr, t0);
3257       mv(tmp2, zr);
3258     }
3259 
3260     void pre2(Register i, Register len) {
3261       block_comment("pre2");
3262       // Pa = Pa_base + i-len;
3263       // Pb = Pb_base + len;
3264       // Pm = Pm_base + i-len;
3265       // Pn = Pn_base + len;
3266 
3267       sub(Rj, i, len);
3268       // Rj == i-len
3269 
3270       // Ra as temp register
3271       slli(Ra, Rj, LogBytesPerWord);
3272       add(Pa, Pa_base, Ra);
3273       add(Pm, Pm_base, Ra);
3274       slli(Ra, len, LogBytesPerWord);
3275       add(Pb, Pb_base, Ra);
3276       add(Pn, Pn_base, Ra);
3277 
3278       // Ra = *++Pa;
3279       // Rb = *--Pb;
3280       // Rm = *++Pm;
3281       // Rn = *--Pn;
3282       add(Pa, Pa, wordSize);
3283       ld(Ra, Address(Pa));
3284       add(Pb, Pb, -wordSize);
3285       ld(Rb, Address(Pb));
3286       add(Pm, Pm, wordSize);
3287       ld(Rm, Address(Pm));
3288       add(Pn, Pn, -wordSize);
3289       ld(Rn, Address(Pn));
3290 
3291       mv(Rhi_mn, zr);
3292       mv(Rlo_mn, zr);
3293     }
3294 
3295     void post2(Register i, Register len) {
3296       block_comment("post2");
3297       sub(Rj, i, len);
3298 
3299       cad(tmp0, tmp0, Rlo_mn, t0); // The pending m*n, low part
3300 
3301       // As soon as we know the least significant digit of our result,
3302       // store it.
3303       // Pm_base[i-len] = tmp0;
3304       // Rj as temp register
3305       slli(Rj, Rj, LogBytesPerWord);
3306       add(Rj, Pm_base, Rj);
3307       sd(tmp0, Address(Rj));
3308 
3309       // tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0;
3310       cadc(tmp0, tmp1, Rhi_mn, t0); // The pending m*n, high part
3311       adc(tmp1, tmp2, zr, t0);
3312       mv(tmp2, zr);
3313     }
3314 
3315     // A carry in tmp0 after Montgomery multiplication means that we
3316     // should subtract multiples of n from our result in m.  We'll
3317     // keep doing that until there is no carry.
3318     void normalize(Register len) {
3319       block_comment("normalize");
3320       // while (tmp0)
3321       //   tmp0 = sub(Pm_base, Pn_base, tmp0, len);
3322       Label loop, post, again;
3323       Register cnt = tmp1, i = tmp2; // Re-use registers; we're done with them now
3324       beqz(tmp0, post); {
3325         bind(again); {
3326           mv(i, zr);
3327           mv(cnt, len);
3328           slli(Rn, i, LogBytesPerWord);
3329           add(Rm, Pm_base, Rn);
3330           ld(Rm, Address(Rm));
3331           add(Rn, Pn_base, Rn);
3332           ld(Rn, Address(Rn));
3333           mv(t0, 1); // set carry flag, i.e. no borrow
3334           align(16);
3335           bind(loop); {
3336             notr(Rn, Rn);
3337             add(Rm, Rm, t0);
3338             add(Rm, Rm, Rn);
3339             sltu(t0, Rm, Rn);
3340             slli(Rn, i, LogBytesPerWord); // Rn as temp register
3341             add(Rn, Pm_base, Rn);
3342             sd(Rm, Address(Rn));
3343             add(i, i, 1);
3344             slli(Rn, i, LogBytesPerWord);
3345             add(Rm, Pm_base, Rn);
3346             ld(Rm, Address(Rm));
3347             add(Rn, Pn_base, Rn);
3348             ld(Rn, Address(Rn));
3349             sub(cnt, cnt, 1);
3350           } bnez(cnt, loop);
3351           addi(tmp0, tmp0, -1);
3352           add(tmp0, tmp0, t0);
3353         } bnez(tmp0, again);
3354       } bind(post);
3355     }
3356 
3357     // Move memory at s to d, reversing words.
3358     //    Increments d to end of copied memory
3359     //    Destroys tmp1, tmp2
3360     //    Preserves len
3361     //    Leaves s pointing to the address which was in d at start
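    //
    //    Scalar sketch of the effect (len 64-bit words; the 32-bit rotation
    //    swaps the two halves of each word):
    //      for (i = 0; i < len; i++)
    //        d[i] = rotate_right_64(s[len - 1 - i], 32);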
3362     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
3363       assert(tmp1->encoding() < x28->encoding(), "register corruption");
3364       assert(tmp2->encoding() < x28->encoding(), "register corruption");
3365 
3366       shadd(s, len, s, tmp1, LogBytesPerWord);
3367       mv(tmp1, len);
3368       unroll_2(tmp1,  &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
3369       slli(tmp1, len, LogBytesPerWord);
3370       sub(s, d, tmp1);
3371     }
3372     // [63...0] -> [31...0][63...32]
3373     void reverse1(Register d, Register s, Register tmp) {
3374       addi(s, s, -wordSize);
3375       ld(tmp, Address(s));
3376       ror_imm(tmp, tmp, 32, t0);
3377       sd(tmp, Address(d));
3378       addi(d, d, wordSize);
3379     }
3380 
3381     void step_squaring() {
3382       // An extra ACC
3383       step();
3384       acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2);
3385     }
3386 
3387     void last_squaring(Register i) {
3388       Label dont;
3389       // if ((i & 1) == 0) {
3390       test_bit(t0, i, 0);
3391       bnez(t0, dont); {
3392         // MACC(Ra, Rb, tmp0, tmp1, tmp2);
3393         // Ra = *++Pa;
3394         // Rb = *--Pb;
3395         mulhu(Rhi_ab, Ra, Rb);
3396         mul(Rlo_ab, Ra, Rb);
3397         acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2);
3398       } bind(dont);
3399     }
3400 
3401     void extra_step_squaring() {
3402       acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2);  // The pending m*n
3403 
3404       // MACC(Rm, Rn, tmp0, tmp1, tmp2);
3405       // Rm = *++Pm;
3406       // Rn = *--Pn;
3407       mulhu(Rhi_mn, Rm, Rn);
3408       mul(Rlo_mn, Rm, Rn);
3409       addi(Pm, Pm, wordSize);
3410       ld(Rm, Address(Pm));
3411       addi(Pn, Pn, -wordSize);
3412       ld(Rn, Address(Pn));
3413     }
3414 
3415     void post1_squaring() {
3416       acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2);  // The pending m*n
3417 
3418       // *Pm = Rm = tmp0 * inv;
3419       mul(Rm, tmp0, inv);
3420       sd(Rm, Address(Pm));
3421 
3422       // MACC(Rm, Rn, tmp0, tmp1, tmp2);
3423       // tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0;
3424       mulhu(Rhi_mn, Rm, Rn);
3425 
3426 #ifndef PRODUCT
3427       // assert(m[i] * n[0] + tmp0 == 0, "broken Montgomery multiply");
3428       {
3429         mul(Rlo_mn, Rm, Rn);
3430         add(Rlo_mn, tmp0, Rlo_mn);
3431         Label ok;
3432         beqz(Rlo_mn, ok); {
3433           stop("broken Montgomery multiply");
3434         } bind(ok);
3435       }
3436 #endif
3437       // We have very carefully set things up so that
3438       // m[i]*n[0] + tmp0 == 0 (mod b), so we don't have to calculate
3439       // the lower half of Rm * Rn because we know the result already:
3440       // it must be -tmp0.  tmp0 + (-tmp0) must generate a carry iff
3441       // tmp0 != 0.  So, rather than do a mul and a cad we just set
3442       // the carry flag iff tmp0 is nonzero.
3443       //
3444       // mul(Rlo_mn, Rm, Rn);
3445       // cad(zr, tmp0, Rlo_mn);
3446       addi(t0, tmp0, -1);
3447       sltu(t0, t0, tmp0); // Set carry iff tmp0 is nonzero
3448       cadc(tmp0, tmp1, Rhi_mn, t0);
3449       adc(tmp1, tmp2, zr, t0);
3450       mv(tmp2, zr);
3451     }
3452 
3453     // use t0 as carry
3454     void acc(Register Rhi, Register Rlo,
3455              Register tmp0, Register tmp1, Register tmp2) {
3456       cad(tmp0, tmp0, Rlo, t0);
3457       cadc(tmp1, tmp1, Rhi, t0);
3458       adc(tmp2, tmp2, zr, t0);
3459     }
3460 
3461   public:
3462     /**
3463      * Fast Montgomery multiplication.  The derivation of the
3464      * algorithm is in A Cryptographic Library for the Motorola
3465      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
3466      *
3467      * Arguments:
3468      *
3469      * Inputs for multiplication:
3470      *   c_rarg0   - int array elements a
3471      *   c_rarg1   - int array elements b
3472      *   c_rarg2   - int array elements n (the modulus)
3473      *   c_rarg3   - int length
3474      *   c_rarg4   - int inv
3475      *   c_rarg5   - int array elements m (the result)
3476      *
3477      * Inputs for squaring:
3478      *   c_rarg0   - int array elements a
3479      *   c_rarg1   - int array elements n (the modulus)
3480      *   c_rarg2   - int length
3481      *   c_rarg3   - int inv
3482      *   c_rarg4   - int array elements m (the result)
3483      *
3484      */
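    // A rough scalar sketch of the computation generated below (illustrative
    // only, not the generated code; acc stands for the triple-precision
    // accumulator kept in tmp0/tmp1/tmp2, and MACC(x, y) adds the
    // double-word product x*y to acc):
    //
    //   for (i = 0; i < len; i++) {
    //     for (j = 0; j < i; j++) {
    //       MACC(a[j], b[i-j]);
    //       MACC(m[j], n[i-j]);
    //     }
    //     MACC(a[i], b[0]);
    //     m[i] = acc.lo * inv;        // makes acc.lo + m[i]*n[0] == 0 (mod 2^64)
    //     MACC(m[i], n[0]);
    //     acc >>= 64;                 // low word is now zero
    //   }
    //   for (i = len; i < 2*len; i++) {
    //     for (j = i-len+1; j < len; j++) {
    //       MACC(a[j], b[i-j]);
    //       MACC(m[j], n[i-j]);
    //     }
    //     m[i-len] = acc.lo;
    //     acc >>= 64;
    //   }
    //   while (acc.lo != 0)           // normalize: subtract n while a carry remains
    //     acc.lo = sub(m, n, acc.lo, len);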
3485     address generate_multiply() {
3486       Label argh, nothing;
3487       bind(argh);
3488       stop("MontgomeryMultiply total_allocation must be <= 8192");
3489 
3490       align(CodeEntryAlignment);
3491       address entry = pc();
3492 
3493       beqz(Rlen, nothing);
3494 
3495       enter();
3496 
3497       // Make room.
3498       mv(Ra, 512);
3499       bgt(Rlen, Ra, argh);
3500       slli(Ra, Rlen, exact_log2(4 * sizeof(jint)));
3501       sub(Ra, sp, Ra);
3502       andi(sp, Ra, -2 * wordSize);
3503 
3504       srliw(Rlen, Rlen, 1);  // length in longwords = len/2
3505 
3506       {
3507         // Copy input args, reversing as we go.  We use Ra as a
3508         // temporary variable.
3509         reverse(Ra, Pa_base, Rlen, Ri, Rj);
3510         if (!_squaring)
3511           reverse(Ra, Pb_base, Rlen, Ri, Rj);
3512         reverse(Ra, Pn_base, Rlen, Ri, Rj);
3513       }
3514 
3515       // Push all call-saved registers and also Pm_base which we'll need
3516       // at the end.
3517       save_regs();
3518 
3519 #ifndef PRODUCT
3520       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
3521       {
3522         ld(Rn, Address(Pn_base));
3523         mul(Rlo_mn, Rn, inv);
3524         mv(t0, -1);
3525         Label ok;
3526         beq(Rlo_mn, t0, ok);
3527         stop("broken inverse in Montgomery multiply");
3528         bind(ok);
3529       }
3530 #endif
3531 
3532       mv(Pm_base, Ra);
3533 
3534       mv(tmp0, zr);
3535       mv(tmp1, zr);
3536       mv(tmp2, zr);
3537 
3538       block_comment("for (int i = 0; i < len; i++) {");
3539       mv(Ri, zr); {
3540         Label loop, end;
3541         bge(Ri, Rlen, end);
3542 
3543         bind(loop);
3544         pre1(Ri);
3545 
3546         block_comment("  for (j = i; j; j--) {"); {
3547           mv(Rj, Ri);
3548           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
3549         } block_comment("  } // j");
3550 
3551         post1();
3552         addw(Ri, Ri, 1);
3553         blt(Ri, Rlen, loop);
3554         bind(end);
3555         block_comment("} // i");
3556       }
3557 
3558       block_comment("for (int i = len; i < 2*len; i++) {");
3559       mv(Ri, Rlen); {
3560         Label loop, end;
3561         slli(t0, Rlen, 1);
3562         bge(Ri, t0, end);
3563 
3564         bind(loop);
3565         pre2(Ri, Rlen);
3566 
3567         block_comment("  for (j = len*2-i-1; j; j--) {"); {
3568           slliw(Rj, Rlen, 1);
3569           subw(Rj, Rj, Ri);
3570           subw(Rj, Rj, 1);
3571           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
3572         } block_comment("  } // j");
3573 
3574         post2(Ri, Rlen);
3575         addw(Ri, Ri, 1);
3576         slli(t0, Rlen, 1);
3577         blt(Ri, t0, loop);
3578         bind(end);
3579       }
3580       block_comment("} // i");
3581 
3582       normalize(Rlen);
3583 
3584       mv(Ra, Pm_base);  // Save Pm_base in Ra
3585       restore_regs();  // Restore caller's Pm_base
3586 
3587       // Copy our result into caller's Pm_base
3588       reverse(Pm_base, Ra, Rlen, Ri, Rj);
3589 
3590       leave();
3591       bind(nothing);
3592       ret();
3593 
3594       return entry;
3595     }
3596 
3597     /**
3598      *
3599      * Arguments:
3600      *
3601      * Inputs:
3602      *   c_rarg0   - int array elements a
3603      *   c_rarg1   - int array elements n (the modulus)
3604      *   c_rarg2   - int length
3605      *   c_rarg3   - int inv
3606      *   c_rarg4   - int array elements m (the result)
3607      *
3608      */
3609     address generate_square() {
3610       Label argh;
3611       bind(argh);
3612       stop("MontgomeryMultiply total_allocation must be <= 8192");
3613 
3614       align(CodeEntryAlignment);
3615       address entry = pc();
3616 
3617       enter();
3618 
3619       // Make room.
3620       mv(Ra, 512);
3621       bgt(Rlen, Ra, argh);
3622       slli(Ra, Rlen, exact_log2(4 * sizeof(jint)));
3623       sub(Ra, sp, Ra);
3624       andi(sp, Ra, -2 * wordSize);
3625 
3626       srliw(Rlen, Rlen, 1);  // length in longwords = len/2
3627 
3628       {
3629         // Copy input args, reversing as we go.  We use Ra as a
3630         // temporary variable.
3631         reverse(Ra, Pa_base, Rlen, Ri, Rj);
3632         reverse(Ra, Pn_base, Rlen, Ri, Rj);
3633       }
3634 
3635       // Push all call-saved registers and also Pm_base which we'll need
3636       // at the end.
3637       save_regs();
3638 
3639       mv(Pm_base, Ra);
3640 
3641       mv(tmp0, zr);
3642       mv(tmp1, zr);
3643       mv(tmp2, zr);
3644 
3645       block_comment("for (int i = 0; i < len; i++) {");
3646       mv(Ri, zr); {
3647         Label loop, end;
3648         bind(loop);
3649         bge(Ri, Rlen, end);
3650 
3651         pre1(Ri);
3652 
3653         block_comment("for (j = (i+1)/2; j; j--) {"); {
3654           addi(Rj, Ri, 1);
3655           srliw(Rj, Rj, 1);
3656           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
3657         } block_comment("  } // j");
3658 
3659         last_squaring(Ri);
3660 
3661         block_comment("  for (j = i/2; j; j--) {"); {
3662           srliw(Rj, Ri, 1);
3663           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
3664         } block_comment("  } // j");
3665 
3666         post1_squaring();
3667         addi(Ri, Ri, 1);
3668         blt(Ri, Rlen, loop);
3669 
3670         bind(end);
3671         block_comment("} // i");
3672       }
3673 
3674       block_comment("for (int i = len; i < 2*len; i++) {");
3675       mv(Ri, Rlen); {
3676         Label loop, end;
3677         bind(loop);
3678         slli(t0, Rlen, 1);
3679         bge(Ri, t0, end);
3680 
3681         pre2(Ri, Rlen);
3682 
3683         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
3684           slli(Rj, Rlen, 1);
3685           sub(Rj, Rj, Ri);
3686           sub(Rj, Rj, 1);
3687           srliw(Rj, Rj, 1);
3688           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
3689         } block_comment("  } // j");
3690 
3691         last_squaring(Ri);
3692 
3693         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
3694           slli(Rj, Rlen, 1);
3695           sub(Rj, Rj, Ri);
3696           srliw(Rj, Rj, 1);
3697           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
3698         } block_comment("  } // j");
3699 
3700         post2(Ri, Rlen);
3701         addi(Ri, Ri, 1);
3702         slli(t0, Rlen, 1);
3703         blt(Ri, t0, loop);
3704 
3705         bind(end);
3706         block_comment("} // i");
3707       }
3708 
3709       normalize(Rlen);
3710 
3711       mv(Ra, Pm_base);  // Save Pm_base in Ra
3712       restore_regs();  // Restore caller's Pm_base
3713 
3714       // Copy our result into caller's Pm_base
3715       reverse(Pm_base, Ra, Rlen, Ri, Rj);
3716 
3717       leave();
3718       ret();
3719 
3720       return entry;
3721     }
3722   };
3723 
3724 #endif // COMPILER2
3725 
3726   address generate_cont_thaw(Continuation::thaw_kind kind) {
3727     bool return_barrier = Continuation::is_thaw_return_barrier(kind);
3728     bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);
3729 
3730     address start = __ pc();
3731 
3732     if (return_barrier) {
3733       __ ld(sp, Address(xthread, JavaThread::cont_entry_offset()));
3734     }
3735 
3736 #ifndef PRODUCT
3737     {
3738       Label OK;
3739       __ ld(t0, Address(xthread, JavaThread::cont_entry_offset()));
3740       __ beq(sp, t0, OK);
3741       __ stop("incorrect sp");
3742       __ bind(OK);
3743     }
3744 #endif
3745 
3746     if (return_barrier) {
3747       // preserve possible return value from a method returning to the return barrier
3748       __ sub(sp, sp, 2 * wordSize);
3749       __ fsd(f10, Address(sp, 0 * wordSize));
3750       __ sd(x10, Address(sp, 1 * wordSize));
3751     }
3752 
3753     __ mv(c_rarg1, (return_barrier ? 1 : 0));
3754     __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), xthread, c_rarg1);
3755     __ mv(t1, x10); // x10 contains the size of the frames to thaw, 0 if overflow or no more frames
3756 
3757     if (return_barrier) {
3758       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
3759       __ ld(x10, Address(sp, 1 * wordSize));
3760       __ fld(f10, Address(sp, 0 * wordSize));
3761       __ add(sp, sp, 2 * wordSize);
3762     }
3763 
3764 #ifndef PRODUCT
3765     {
3766       Label OK;
3767       __ ld(t0, Address(xthread, JavaThread::cont_entry_offset()));
3768       __ beq(sp, t0, OK);
3769       __ stop("incorrect sp");
3770       __ bind(OK);
3771     }
3772 #endif
3773 
3774     Label thaw_success;
3775     // t1 contains the size of the frames to thaw, 0 if overflow or no more frames
3776     __ bnez(t1, thaw_success);
3777     __ la(t0, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry()));
3778     __ jr(t0);
3779     __ bind(thaw_success);
3780 
3781     // make room for the thawed frames
3782     __ sub(t0, sp, t1);
3783     __ andi(sp, t0, -16); // align
3784 
3785     if (return_barrier) {
3786       // save original return value -- again
3787       __ sub(sp, sp, 2 * wordSize);
3788       __ fsd(f10, Address(sp, 0 * wordSize));
3789       __ sd(x10, Address(sp, 1 * wordSize));
3790     }
3791 
3792     // If we want, we can templatize thaw by kind, and have three different entries
3793     __ mv(c_rarg1, kind);
3794 
3795     __ call_VM_leaf(Continuation::thaw_entry(), xthread, c_rarg1);
3796     __ mv(t1, x10); // x10 is the sp of the yielding frame
3797 
3798     if (return_barrier) {
3799       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
3800       __ ld(x10, Address(sp, 1 * wordSize));
3801       __ fld(f10, Address(sp, 0 * wordSize));
3802       __ add(sp, sp, 2 * wordSize);
3803     } else {
3804       __ mv(x10, zr); // return 0 (success) from doYield
3805     }
3806 
3807     // we're now on the yield frame (which is at an address above us because sp has been pushed down)
3808     __ mv(fp, t1);
3809     __ sub(sp, t1, 2 * wordSize); // now pointing to fp spill
3810 
3811     if (return_barrier_exception) {
3812       __ ld(c_rarg1, Address(fp, -1 * wordSize)); // return address
3813       __ verify_oop(x10);
3814       __ mv(x9, x10); // save return value containing the exception oop in callee-saved x9
3815 
3816       __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), xthread, c_rarg1);
3817 
3818       // see OptoRuntime::generate_exception_blob: x10 -- exception oop, x13 -- exception pc
3819 
3820       __ mv(x11, x10); // the exception handler
3821       __ mv(x10, x9); // restore return value containing the exception oop
3822       __ verify_oop(x10);
3823 
3824       __ leave();
3825       __ mv(x13, ra);
3826       __ jr(x11); // the exception handler
3827     } else {
3828       // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
3829       __ leave();
3830       __ ret();
3831     }
3832 
3833     return start;
3834   }
3835 
3836   address generate_cont_thaw() {
3837     if (!Continuations::enabled()) return nullptr;
3838 
3839     StubCodeMark mark(this, "StubRoutines", "Cont thaw");
3840     address start = __ pc();
3841     generate_cont_thaw(Continuation::thaw_top);
3842     return start;
3843   }
3844 
3845   address generate_cont_returnBarrier() {
3846     if (!Continuations::enabled()) return nullptr;
3847 
3848     // TODO: will probably need multiple return barriers depending on return type
3849     StubCodeMark mark(this, "StubRoutines", "cont return barrier");
3850     address start = __ pc();
3851 
3852     generate_cont_thaw(Continuation::thaw_return_barrier);
3853 
3854     return start;
3855   }
3856 
3857   address generate_cont_returnBarrier_exception() {
3858     if (!Continuations::enabled()) return nullptr;
3859 
3860     StubCodeMark mark(this, "StubRoutines", "cont return barrier exception handler");
3861     address start = __ pc();
3862 
3863     generate_cont_thaw(Continuation::thaw_return_barrier_exception);
3864 
3865     return start;
3866   }
3867 
3868   address generate_cont_preempt_stub() {
3869     if (!Continuations::enabled()) return nullptr;
3870     StubCodeMark mark(this, "StubRoutines","Continuation preempt stub");
3871     address start = __ pc();
3872 
3873     __ reset_last_Java_frame(true);
3874 
3875     // reset the flag
3876     __ sb(zr, Address(xthread, JavaThread::preempting_offset()));
3877 
3878     // Set sp to enterSpecial frame and then remove it from the stack
3879     __ ld(sp, Address(xthread, JavaThread::cont_entry_offset()));
3880 
3881     Label preemption_cancelled;
3882     __ lbu(t0, Address(xthread, JavaThread::preemption_cancelled_offset()));
3883     __ bnez(t0, preemption_cancelled);
3884 
3885     // Remove enterSpecial frame from the stack and return to Continuation.run()
3886     SharedRuntime::continuation_enter_cleanup(_masm);
3887     __ leave();
3888     __ ret();
3889 
3890     __ bind(preemption_cancelled);
3891     __ sb(zr, Address(xthread, JavaThread::preemption_cancelled_offset()));
3892     __ la(fp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size() + 2 * wordSize)));
3893     __ la(t0, ExternalAddress(ContinuationEntry::thaw_call_pc_address()));
3894     __ ld(t0, Address(t0));
3895     __ jr(t0);
3896 
3897     return start;
3898   }
3899 
3900 #if COMPILER2_OR_JVMCI
3901 
3902 #undef __
3903 #define __ this->
3904 
3905   class Sha2Generator : public MacroAssembler {
3906     StubCodeGenerator* _cgen;
3907    public:
3908       Sha2Generator(MacroAssembler* masm, StubCodeGenerator* cgen) : MacroAssembler(masm->code()), _cgen(cgen) {}
3909       address generate_sha256_implCompress(bool multi_block) {
3910         return generate_sha2_implCompress(Assembler::e32, multi_block);
3911       }
3912       address generate_sha512_implCompress(bool multi_block) {
3913         return generate_sha2_implCompress(Assembler::e64, multi_block);
3914       }
3915    private:
3916 
3917     void vleXX_v(Assembler::SEW vset_sew, VectorRegister vr, Register sr) {
3918       if (vset_sew == Assembler::e32) __ vle32_v(vr, sr);
3919       else                            __ vle64_v(vr, sr);
3920     }
3921 
3922     void vseXX_v(Assembler::SEW vset_sew, VectorRegister vr, Register sr) {
3923       if (vset_sew == Assembler::e32) __ vse32_v(vr, sr);
3924       else                            __ vse64_v(vr, sr);
3925     }
3926 
3927     // Overview of the logic in each "quad round".
3928     //
3929     // The code below repeats 16/20 times the logic implementing four rounds
3930     // of the SHA-256/512 core loop as documented by NIST: 16/20 "quad rounds"
3931     // implementing the 64/80 single rounds.
3932     //
3933     //    // Load four word (u32/64) constants (K[t+3], K[t+2], K[t+1], K[t+0])
3934     //    // Output:
3935     //    //   vTmp1 = {K[t+3], K[t+2], K[t+1], K[t+0]}
3936     //    vl1reXX.v vTmp1, ofs
3937     //
3938     //    // Increment word constant address by stride (16/32 bytes, 4*4B/8B, 128b/256b)
3939     //    addi ofs, ofs, 16/32
3940     //
3941     //    // Add constants to message schedule words:
3942     //    //  Input
3943     //    //    vTmp1 = {K[t+3], K[t+2], K[t+1], K[t+0]}
3944     //    //    vW0 = {W[t+3], W[t+2], W[t+1], W[t+0]}; // Vt0 = W[3:0];
3945     //    //  Output
3946     //    //    vTmp0 = {W[t+3]+K[t+3], W[t+2]+K[t+2], W[t+1]+K[t+1], W[t+0]+K[t+0]}
3947     //    vadd.vv vTmp0, vTmp1, vW0
3948     //
3949     //    //  2 rounds of working variables updates.
3950     //    //     vState1[t+4] <- vState1[t], vState0[t], vTmp0[t]
3951     //    //  Input:
3952     //    //    vState1 = {c[t],d[t],g[t],h[t]}   " = vState1[t] "
3953     //    //    vState0 = {a[t],b[t],e[t],f[t]}
3954     //    //    vTmp0 = {W[t+3]+K[t+3], W[t+2]+K[t+2], W[t+1]+K[t+1], W[t+0]+K[t+0]}
3955     //    //  Output:
3956     //    //    vState1 = {f[t+2],e[t+2],b[t+2],a[t+2]}  " = vState0[t+2] "
3957     //    //        = {h[t+4],g[t+4],d[t+4],c[t+4]}  " = vState1[t+4] "
3958     //    vsha2cl.vv vState1, vState0, vTmp0
3959     //
3960     //    //  2 rounds of working variables updates.
3961     //    //     vState0[t+4] <- vState0[t], vState0[t+2], vTmp0[t]
3962     //    //  Input
3963     //    //   vState0 = {a[t],b[t],e[t],f[t]}       " = vState0[t] "
3964     //    //       = {h[t+2],g[t+2],d[t+2],c[t+2]}   " = vState1[t+2] "
3965     //    //   vState1 = {f[t+2],e[t+2],b[t+2],a[t+2]}   " = vState0[t+2] "
3966     //    //   vTmp0 = {W[t+3]+K[t+3], W[t+2]+K[t+2], W[t+1]+K[t+1], W[t+0]+K[t+0]}
3967     //    //  Output:
3968     //    //   vState0 = {f[t+4],e[t+4],b[t+4],a[t+4]}   " = vState0[t+4] "
3969     //    vsha2ch.vv vState0, vState1, vTmp0
3970     //
3971     //    // Combine 2QW into 1QW
3972     //    //
3973     //    // To generate the next 4 words, "new_vW0"/"vTmp0" from vW0-vW3, vsha2ms needs
3974     //    //     vW0[0..3], vW1[0], vW2[1..3], vW3[0, 2..3]
3975     //    // and it can only take 3 vectors as inputs. Hence we need to combine
3976     //    // vW1[0] and vW2[1..3] in a single vector.
3977     //    //
3978     //    // vmerge Vt4, Vt1, Vt2, V0
3979     //    // Input
3980     //    //  V0 = mask // first word from vW2, 1..3 words from vW1
3981     //    //  vW2 = {Wt-8, Wt-7, Wt-6, Wt-5}
3982     //    //  vW1 = {Wt-12, Wt-11, Wt-10, Wt-9}
3983     //    // Output
3984     //    //  Vt4 = {Wt-12, Wt-7, Wt-6, Wt-5}
3985     //    vmerge.vvm vTmp0, vW2, vW1, v0
3986     //
3987     //    // Generate next Four Message Schedule Words (hence allowing for 4 more rounds)
3988     //    // Input
3989     //    //  vW0 = {W[t+ 3], W[t+ 2], W[t+ 1], W[t+ 0]}     W[ 3: 0]
3990     //    //  vW3 = {W[t+15], W[t+14], W[t+13], W[t+12]}     W[15:12]
3991     //    //  vTmp0 = {W[t+11], W[t+10], W[t+ 9], W[t+ 4]}     W[11: 9,4]
3992     //    // Output (next four message schedule words)
3993     //    //  vW0 = {W[t+19],  W[t+18],  W[t+17],  W[t+16]}  W[19:16]
3994     //    vsha2ms.vv vW0, vTmp0, vW3
3995     //
3996     // BEFORE
3997     //  vW0 - vW3 hold the message schedule words (initially the block words)
3998     //    vW0 = W[ 3: 0]   "oldest"
3999     //    vW1 = W[ 7: 4]
4000     //    vW2 = W[11: 8]
4001     //    vW3 = W[15:12]   "newest"
4002     //
4003     //  vState0 - vState1 hold the working state variables
4004     //    vState0 = {a[t],b[t],e[t],f[t]}   // initially {H5,H4,H1,H0}
4005     //    vState1 = {c[t],d[t],g[t],h[t]}   // initially {H7,H6,H3,H2}
4006     //
4007     // AFTER
4008     //  vW0 - vW3 hold the message schedule words (initially the block words)
4009     //    vW1 = W[ 7: 4]   "oldest"
4010     //    vW2 = W[11: 8]
4011     //    vW3 = W[15:12]
4012     //    vW0 = W[19:16]   "newest"
4013     //
4014     //  vState0 and vState1 hold the working state variables
4015     //    vState0 = {a[t+4],b[t+4],e[t+4],f[t+4]}
4016     //    vState1 = {c[t+4],d[t+4],g[t+4],h[t+4]}
4017     //
4018     //  The group of vectors vW0,vW1,vW2,vW3 is "rotated" by one in each quad-round,
4019     //  hence the uses of those vectors rotate in each round, and we get back to the
4020     //  initial configuration every 4 quad-rounds. We could avoid those changes at
4021     //  the cost of moving those vectors at the end of each quad-rounds.
4022     void sha2_quad_round(Assembler::SEW vset_sew, VectorRegister rot1, VectorRegister rot2, VectorRegister rot3, VectorRegister rot4,
4023                          Register scalarconst, VectorRegister vtemp, VectorRegister vtemp2, VectorRegister v_abef, VectorRegister v_cdgh,
4024                          bool gen_words = true, bool step_const = true) {
4025       __ vleXX_v(vset_sew, vtemp, scalarconst);
4026       if (step_const) {
4027         __ addi(scalarconst, scalarconst, vset_sew == Assembler::e32 ? 16 : 32);
4028       }
4029       __ vadd_vv(vtemp2, vtemp, rot1);
4030       __ vsha2cl_vv(v_cdgh, v_abef, vtemp2);
4031       __ vsha2ch_vv(v_abef, v_cdgh, vtemp2);
4032       if (gen_words) {
4033         __ vmerge_vvm(vtemp2, rot3, rot2);
4034         __ vsha2ms_vv(rot1, vtemp2, rot4);
4035       }
4036     }
4037 
4038     const char* stub_name(Assembler::SEW vset_sew, bool multi_block) {
4039       if (vset_sew == Assembler::e32 && !multi_block) return "sha256_implCompress";
4040       if (vset_sew == Assembler::e32 &&  multi_block) return "sha256_implCompressMB";
4041       if (vset_sew == Assembler::e64 && !multi_block) return "sha512_implCompress";
4042       if (vset_sew == Assembler::e64 &&  multi_block) return "sha512_implCompressMB";
4043       ShouldNotReachHere();
4044       return "bad name lookup";
4045     }
4046 
4047     // Arguments:
4048     //
4049     // Inputs:
4050     //   c_rarg0   - byte[]  source+offset
4051     //   c_rarg1   - int[]   SHA.state
4052     //   c_rarg2   - int     offset
4053     //   c_rarg3   - int     limit
4054     //
4055     address generate_sha2_implCompress(Assembler::SEW vset_sew, bool multi_block) {
4056       alignas(64) static const uint32_t round_consts_256[64] = {
4057         0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
4058         0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
4059         0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
4060         0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
4061         0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
4062         0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
4063         0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
4064         0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
4065         0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
4066         0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
4067         0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
4068         0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
4069         0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
4070         0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
4071         0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
4072         0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
4073       };
4074       alignas(64) static const uint64_t round_consts_512[80] = {
4075         0x428a2f98d728ae22l, 0x7137449123ef65cdl, 0xb5c0fbcfec4d3b2fl,
4076         0xe9b5dba58189dbbcl, 0x3956c25bf348b538l, 0x59f111f1b605d019l,
4077         0x923f82a4af194f9bl, 0xab1c5ed5da6d8118l, 0xd807aa98a3030242l,
4078         0x12835b0145706fbel, 0x243185be4ee4b28cl, 0x550c7dc3d5ffb4e2l,
4079         0x72be5d74f27b896fl, 0x80deb1fe3b1696b1l, 0x9bdc06a725c71235l,
4080         0xc19bf174cf692694l, 0xe49b69c19ef14ad2l, 0xefbe4786384f25e3l,
4081         0x0fc19dc68b8cd5b5l, 0x240ca1cc77ac9c65l, 0x2de92c6f592b0275l,
4082         0x4a7484aa6ea6e483l, 0x5cb0a9dcbd41fbd4l, 0x76f988da831153b5l,
4083         0x983e5152ee66dfabl, 0xa831c66d2db43210l, 0xb00327c898fb213fl,
4084         0xbf597fc7beef0ee4l, 0xc6e00bf33da88fc2l, 0xd5a79147930aa725l,
4085         0x06ca6351e003826fl, 0x142929670a0e6e70l, 0x27b70a8546d22ffcl,
4086         0x2e1b21385c26c926l, 0x4d2c6dfc5ac42aedl, 0x53380d139d95b3dfl,
4087         0x650a73548baf63del, 0x766a0abb3c77b2a8l, 0x81c2c92e47edaee6l,
4088         0x92722c851482353bl, 0xa2bfe8a14cf10364l, 0xa81a664bbc423001l,
4089         0xc24b8b70d0f89791l, 0xc76c51a30654be30l, 0xd192e819d6ef5218l,
4090         0xd69906245565a910l, 0xf40e35855771202al, 0x106aa07032bbd1b8l,
4091         0x19a4c116b8d2d0c8l, 0x1e376c085141ab53l, 0x2748774cdf8eeb99l,
4092         0x34b0bcb5e19b48a8l, 0x391c0cb3c5c95a63l, 0x4ed8aa4ae3418acbl,
4093         0x5b9cca4f7763e373l, 0x682e6ff3d6b2b8a3l, 0x748f82ee5defb2fcl,
4094         0x78a5636f43172f60l, 0x84c87814a1f0ab72l, 0x8cc702081a6439ecl,
4095         0x90befffa23631e28l, 0xa4506cebde82bde9l, 0xbef9a3f7b2c67915l,
4096         0xc67178f2e372532bl, 0xca273eceea26619cl, 0xd186b8c721c0c207l,
4097         0xeada7dd6cde0eb1el, 0xf57d4f7fee6ed178l, 0x06f067aa72176fbal,
4098         0x0a637dc5a2c898a6l, 0x113f9804bef90dael, 0x1b710b35131c471bl,
4099         0x28db77f523047d84l, 0x32caab7b40c72493l, 0x3c9ebe0a15c9bebcl,
4100         0x431d67c49c100d4cl, 0x4cc5d4becb3e42b6l, 0x597f299cfc657e2al,
4101         0x5fcb6fab3ad6faecl, 0x6c44198c4a475817l
4102       };
4103       const int const_add = vset_sew == Assembler::e32 ? 16 : 32;
4104 
4105       __ align(CodeEntryAlignment);
4106       StubCodeMark mark(_cgen, "StubRoutines", stub_name(vset_sew, multi_block));
4107       address start = __ pc();
4108 
4109       Register buf   = c_rarg0;
4110       Register state = c_rarg1;
4111       Register ofs   = c_rarg2;
4112       Register limit = c_rarg3;
4113       Register consts =  t2; // caller saved
4114       Register state_c = x28; // caller saved
4115       VectorRegister vindex = v2;
4116       VectorRegister vW0 = v4;
4117       VectorRegister vW1 = v6;
4118       VectorRegister vW2 = v8;
4119       VectorRegister vW3 = v10;
4120       VectorRegister vState0 = v12;
4121       VectorRegister vState1 = v14;
4122       VectorRegister vHash0  = v16;
4123       VectorRegister vHash1  = v18;
4124       VectorRegister vTmp0   = v20;
4125       VectorRegister vTmp1   = v22;
4126 
4127       Label multi_block_loop;
4128 
4129       __ enter();
4130 
4131       address constant_table = vset_sew == Assembler::e32 ? (address)round_consts_256 : (address)round_consts_512;
4132       la(consts, ExternalAddress(constant_table));
4133 
4134       // Register use in this function:
4135       //
4136       // VECTORS
4137       //  vW0 - vW3 (512/1024 bits / 4*128/256 bits / 4*4*32/64 bits) hold the message
4138       //             schedule words (Wt). They start with the message block
4139       //             content (W0 to W15), then further words in the message
4140       //             schedule generated via vsha2ms from previous Wt.
4141       //   Initially:
4142       //     vW0 = W[  3:0] = { W3,  W2,  W1,  W0}
4143       //     vW1 = W[  7:4] = { W7,  W6,  W5,  W4}
4144       //     vW2 = W[ 11:8] = {W11, W10,  W9,  W8}
4145       //     vW3 = W[15:12] = {W15, W14, W13, W12}
4146       //
4147       //  vState0 - vState1 hold the working state variables (a, b, ..., h)
4148       //    vState0 = {f[t],e[t],b[t],a[t]}
4149       //    vState1 = {h[t],g[t],d[t],c[t]}
4150       //   Initially:
4151       //    vState0 = {H5i-1, H4i-1, H1i-1, H0i-1}
4152       //    vState1 = {H7i-1, H6i-1, H3i-1, H2i-1}
4153       //
4154       //  v0 = masks for vrgather/vmerge. Single value during the 16 rounds.
4155       //
4156       //  vTmp0 = temporary, Wt+Kt
4157       //  vTmp1 = temporary, Kt
4158       //
4159       //  vHash0/vHash1 = hold the initial values of the hash, byte-swapped.
4160       //
4161       // During most of the function the vector state is configured so that each
4162       // vector is interpreted as containing four 32/64 bits (e32/e64) elements (128/256 bits).
4163 
4164       // vsha2ch/vsha2cl uses EGW of 4*SEW.
4165       // SHA256 SEW = e32, EGW = 128-bits
4166       // SHA512 SEW = e64, EGW = 256-bits
4167       //
4168       // VLEN is required to be at least 128.
4169       // For the case of VLEN=128 and SHA512 we need LMUL=2 to work with 4*e64 (EGW = 256)
4170       //
4171       // m1/m2: LMUL=1 or 2 (see the vsetivli below)
4172       // ta: tail agnostic (don't care about those lanes)
4173       // ma: mask agnostic (don't care about those lanes)
4174       // x0 is not written; we know the number of vector elements.
4175 
4176       if (vset_sew == Assembler::e64 && MaxVectorSize == 16) { // SHA512 and VLEN = 128
4177         __ vsetivli(x0, 4, vset_sew, Assembler::m2, Assembler::ma, Assembler::ta);
4178       } else {
4179         __ vsetivli(x0, 4, vset_sew, Assembler::m1, Assembler::ma, Assembler::ta);
4180       }
4181 
4182       int64_t indexes = vset_sew == Assembler::e32 ? 0x00041014ul : 0x00082028ul;
4183       __ li(t0, indexes);
4184       __ vmv_v_x(vindex, t0);
4185 
4186       // Step over a and b, so we are pointing to c.
4187       // const_add covers 4 state variables, so const_add/2 steps over exactly 2 (a and b).
4188       __ addi(state_c, state, const_add/2);
4189 
4190       // Use index-load to get {f,e,b,a},{h,g,d,c}
4191       __ vluxei8_v(vState0, state, vindex);
4192       __ vluxei8_v(vState1, state_c, vindex);
4193 
4194       __ bind(multi_block_loop);
4195 
4196       // Capture the initial H values in vHash0 and vHash1 to allow for computing
4197       // the resulting H', since H' = H+{a',b',c',...,h'}.
4198       __ vmv_v_v(vHash0, vState0);
4199       __ vmv_v_v(vHash1, vState1);
4200 
4201       // Load the 512/1024-bits of the message block in vW0-vW3 and perform
4202       // an endian swap on each 4/8 bytes element.
4203       //
4204       // If Zvkb is not implemented one can use vrgather
4205       // with an index sequence to byte-swap.
4206       //  sequence = [3 2 1 0   7 6 5 4  11 10 9 8   15 14 13 12]
4207       //   <https://oeis.org/A004444> gives us "N ^ 3" as a nice formula to generate
4208       //  this sequence. 'vid' gives us the N.
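      //  (e.g. 0^3=3, 1^3=2, 2^3=1, 3^3=0, 4^3=7, 5^3=6, ... which yields
      //   exactly the sequence above)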
4209       __ vleXX_v(vset_sew, vW0, buf);
4210       __ vrev8_v(vW0, vW0);
4211       __ addi(buf, buf, const_add);
4212       __ vleXX_v(vset_sew, vW1, buf);
4213       __ vrev8_v(vW1, vW1);
4214       __ addi(buf, buf, const_add);
4215       __ vleXX_v(vset_sew, vW2, buf);
4216       __ vrev8_v(vW2, vW2);
4217       __ addi(buf, buf, const_add);
4218       __ vleXX_v(vset_sew, vW3, buf);
4219       __ vrev8_v(vW3, vW3);
4220       __ addi(buf, buf, const_add);
4221 
4222       // Set v0 up for the vmerge that replaces the first word (idx==0)
4223       __ vid_v(v0);
4224       __ vmseq_vi(v0, v0, 0x0);  // v0.mask[i] = (i == 0 ? 1 : 0)
4225 
4226       VectorRegister rotation_regs[] = {vW0, vW1, vW2, vW3};
4227       int rot_pos = 0;
4228       // Quad-round #0 (+0, vW0->vW1->vW2->vW3) ... #11 (+3, vW3->vW0->vW1->vW2)
4229       const int qr_end = vset_sew == Assembler::e32 ? 12 : 16;
4230       for (int i = 0; i < qr_end; i++) {
4231         sha2_quad_round(vset_sew,
4232                    rotation_regs[(rot_pos + 0) & 0x3],
4233                    rotation_regs[(rot_pos + 1) & 0x3],
4234                    rotation_regs[(rot_pos + 2) & 0x3],
4235                    rotation_regs[(rot_pos + 3) & 0x3],
4236                    consts,
4237                    vTmp1, vTmp0, vState0, vState1);
4238         ++rot_pos;
4239       }
4240       // Quad-round #12 (+0, vW0->vW1->vW2->vW3) ... #15 (+3, vW3->vW0->vW1->vW2)
4241       // Note that we stop generating new message schedule words (Wt, vW0-13)
4242       // as we already generated all the words we end up consuming (i.e., W[63:60]).
4243       const int qr_c_end = qr_end + 4;
4244       for (int i = qr_end; i < qr_c_end; i++) {
4245         sha2_quad_round(vset_sew,
4246                    rotation_regs[(rot_pos + 0) & 0x3],
4247                    rotation_regs[(rot_pos + 1) & 0x3],
4248                    rotation_regs[(rot_pos + 2) & 0x3],
4249                    rotation_regs[(rot_pos + 3) & 0x3],
4250                    consts,
4251                    vTmp1, vTmp0, vState0, vState1, false, i < (qr_c_end-1));
4252         ++rot_pos;
4253       }
4254 
4255       //--------------------------------------------------------------------------------
4256       // Compute the updated hash value H'
4257       //   H' = H + {h',g',...,b',a'}
4258       //      = {h,g,...,b,a} + {h',g',...,b',a'}
4259       //      = {h+h',g+g',...,b+b',a+a'}
4260 
4261       // H' = H+{a',b',c',...,h'}
4262       __ vadd_vv(vState0, vHash0, vState0);
4263       __ vadd_vv(vState1, vHash1, vState1);
4264 
4265       if (multi_block) {
4266         int total_adds = vset_sew == Assembler::e32 ? 240 : 608;
4267         __ addi(consts, consts, -total_adds);
4268         __ add(ofs, ofs, vset_sew == Assembler::e32 ? 64 : 128);
4269         __ ble(ofs, limit, multi_block_loop);
4270         __ mv(c_rarg0, ofs); // return ofs
4271       }
4272 
      // Store H[0..7] = {a,b,c,d,e,f,g,h} from
4274       //  vState0 = {f,e,b,a}
4275       //  vState1 = {h,g,d,c}
4276       __ vsuxei8_v(vState0, state,   vindex);
4277       __ vsuxei8_v(vState1, state_c, vindex);
4278 
4279       __ leave();
4280       __ ret();
4281 
4282       return start;
4283     }
4284   };
4285 
4286 #undef __
4287 #define __ _masm->
4288 
4289   // Set of L registers that correspond to a contiguous memory area.
4290   // Each 64-bit register typically corresponds to 2 32-bit integers.
4291   template <uint L>
4292   class RegCache {
4293   private:
4294     MacroAssembler *_masm;
4295     Register _regs[L];
4296 
4297   public:
4298     RegCache(MacroAssembler *masm, RegSet rs): _masm(masm) {
4299       assert(rs.size() == L, "%u registers are used to cache %u 4-byte data", rs.size(), 2 * L);
4300       auto it = rs.begin();
4301       for (auto &r: _regs) {
4302         r = *it;
4303         ++it;
4304       }
4305     }
4306 
4307     // generate load for the i'th register
4308     void gen_load(uint i, Register base) {
4309       assert(i < L, "invalid i: %u", i);
4310       __ ld(_regs[i], Address(base, 8 * i));
4311     }
4312 
4313     // add i'th 32-bit integer to dest
4314     void add_u32(const Register dest, uint i, const Register rtmp = t0) {
4315       assert(i < 2 * L, "invalid i: %u", i);
4316 
4317       if (is_even(i)) {
4318         // Use the bottom 32 bits. No need to mask off the top 32 bits
4319         // as addw will do the right thing.
4320         __ addw(dest, dest, _regs[i / 2]);
4321       } else {
4322         // Use the top 32 bits by right-shifting them.
4323         __ srli(rtmp, _regs[i / 2], 32);
4324         __ addw(dest, dest, rtmp);
4325       }
4326     }
4327   };
4328 
4329   typedef RegCache<8> BufRegCache;
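
  // Illustrative (not generated code) semantics of BufRegCache, under the
  // assumption L == 8 as used below:
  //   reg_cache.gen_load(i, base) ~ _regs[i] = *(uint64_t*)(base + 8 * i)
  //   reg_cache.add_u32(dst, i)   ~ dst += (uint32_t)(_regs[i / 2] >> (is_even(i) ? 0 : 32))
  // i.e. the 8 registers cache one 64-byte MD5 block as 16 consecutive 32-bit words.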
4330 
4331   // a += value + x + ac;
4332   // a = Integer.rotateLeft(a, s) + b;
4333   void m5_FF_GG_HH_II_epilogue(BufRegCache& reg_cache,
4334                                Register a, Register b, Register c, Register d,
4335                                int k, int s, int t,
4336                                Register value) {
4337     // a += ac
4338     __ addw(a, a, t, t1);
4339 
4340     // a += x;
4341     reg_cache.add_u32(a, k);
4342     // a += value;
4343     __ addw(a, a, value);
4344 
4345     // a = Integer.rotateLeft(a, s) + b;
4346     __ rolw_imm(a, a, s);
4347     __ addw(a, a, b);
4348   }
4349 
4350   // a += ((b & c) | ((~b) & d)) + x + ac;
4351   // a = Integer.rotateLeft(a, s) + b;
4352   void md5_FF(BufRegCache& reg_cache,
4353               Register a, Register b, Register c, Register d,
4354               int k, int s, int t,
4355               Register rtmp1, Register rtmp2) {
4356     // rtmp1 = b & c
4357     __ andr(rtmp1, b, c);
4358 
4359     // rtmp2 = (~b) & d
4360     __ andn(rtmp2, d, b);
4361 
4362     // rtmp1 = (b & c) | ((~b) & d)
4363     __ orr(rtmp1, rtmp1, rtmp2);
4364 
4365     m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1);
4366   }
4367 
4368   // a += ((b & d) | (c & (~d))) + x + ac;
4369   // a = Integer.rotateLeft(a, s) + b;
4370   void md5_GG(BufRegCache& reg_cache,
4371               Register a, Register b, Register c, Register d,
4372               int k, int s, int t,
4373               Register rtmp1, Register rtmp2) {
4374     // rtmp1 = b & d
4375     __ andr(rtmp1, b, d);
4376 
4377     // rtmp2 = c & (~d)
4378     __ andn(rtmp2, c, d);
4379 
4380     // rtmp1 = (b & d) | (c & (~d))
4381     __ orr(rtmp1, rtmp1, rtmp2);
4382 
4383     m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1);
4384   }
4385 
4386   // a += ((b ^ c) ^ d) + x + ac;
4387   // a = Integer.rotateLeft(a, s) + b;
4388   void md5_HH(BufRegCache& reg_cache,
4389               Register a, Register b, Register c, Register d,
4390               int k, int s, int t,
4391               Register rtmp1, Register rtmp2) {
4392     // rtmp1 = (b ^ c) ^ d
4393     __ xorr(rtmp2, b, c);
4394     __ xorr(rtmp1, rtmp2, d);
4395 
4396     m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1);
4397   }
4398 
4399   // a += (c ^ (b | (~d))) + x + ac;
4400   // a = Integer.rotateLeft(a, s) + b;
4401   void md5_II(BufRegCache& reg_cache,
4402               Register a, Register b, Register c, Register d,
4403               int k, int s, int t,
4404               Register rtmp1, Register rtmp2) {
4405     // rtmp1 = c ^ (b | (~d))
4406     __ orn(rtmp2, b, d);
4407     __ xorr(rtmp1, c, rtmp2);
4408 
4409     m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1);
4410   }
4411 
4412   // Arguments:
4413   //
4414   // Inputs:
4415   //   c_rarg0   - byte[]  source+offset
4416   //   c_rarg1   - int[]   SHA.state
4417   //   c_rarg2   - int     offset  (multi_block == True)
4418   //   c_rarg3   - int     limit   (multi_block == True)
4419   //
4420   // Registers:
4421   //    x0   zero  (zero)
4422   //    x1     ra  (return address)
4423   //    x2     sp  (stack pointer)
4424   //    x3     gp  (global pointer)
4425   //    x4     tp  (thread pointer)
4426   //    x5     t0  (tmp register)
4427   //    x6     t1  (tmp register)
4428   //    x7     t2  state0
  //    x8  fp/s0  (frame pointer)
4430   //    x9     s1
4431   //   x10     a0  rtmp1 / c_rarg0
4432   //   x11     a1  rtmp2 / c_rarg1
4433   //   x12     a2  a     / c_rarg2
4434   //   x13     a3  b     / c_rarg3
4435   //   x14     a4  c
4436   //   x15     a5  d
4437   //   x16     a6  buf
4438   //   x17     a7  state
4439   //   x18     s2  ofs     [saved-reg]  (multi_block == True)
4440   //   x19     s3  limit   [saved-reg]  (multi_block == True)
4441   //   x20     s4  state1  [saved-reg]
4442   //   x21     s5  state2  [saved-reg]
4443   //   x22     s6  state3  [saved-reg]
4444   //   x23     s7
4445   //   x24     s8  buf0    [saved-reg]
4446   //   x25     s9  buf1    [saved-reg]
4447   //   x26    s10  buf2    [saved-reg]
4448   //   x27    s11  buf3    [saved-reg]
4449   //   x28     t3  buf4
4450   //   x29     t4  buf5
4451   //   x30     t5  buf6
4452   //   x31     t6  buf7
4453   address generate_md5_implCompress(bool multi_block, const char *name) {
4454     __ align(CodeEntryAlignment);
4455     StubCodeMark mark(this, "StubRoutines", name);
4456     address start = __ pc();
4457 
4458     // rotation constants
4459     const int S11 = 7;
4460     const int S12 = 12;
4461     const int S13 = 17;
4462     const int S14 = 22;
4463     const int S21 = 5;
4464     const int S22 = 9;
4465     const int S23 = 14;
4466     const int S24 = 20;
4467     const int S31 = 4;
4468     const int S32 = 11;
4469     const int S33 = 16;
4470     const int S34 = 23;
4471     const int S41 = 6;
4472     const int S42 = 10;
4473     const int S43 = 15;
4474     const int S44 = 21;
4475 
4476     const int64_t mask32 = 0xffffffff;
4477 
4478     Register buf_arg   = c_rarg0; // a0
4479     Register state_arg = c_rarg1; // a1
4480     Register ofs_arg   = c_rarg2; // a2
4481     Register limit_arg = c_rarg3; // a3
4482 
4483     // we'll copy the args to these registers to free up a0-a3
4484     // to use for other values manipulated by instructions
4485     // that can be compressed
4486     Register buf       = x16; // a6
4487     Register state     = x17; // a7
4488     Register ofs       = x18; // s2
4489     Register limit     = x19; // s3
4490 
    // using x12->x15 to allow compressed instructions
4492     Register a         = x12; // a2
4493     Register b         = x13; // a3
4494     Register c         = x14; // a4
4495     Register d         = x15; // a5
4496 
4497     Register state0    =  x7; // t2
4498     Register state1    = x20; // s4
4499     Register state2    = x21; // s5
4500     Register state3    = x22; // s6
4501 
4502     // using x10->x11 to allow compressed instructions
4503     Register rtmp1     = x10; // a0
4504     Register rtmp2     = x11; // a1
4505 
4506     RegSet reg_cache_saved_regs = RegSet::of(x24, x25, x26, x27); // s8, s9, s10, s11
4507     RegSet reg_cache_regs;
4508     reg_cache_regs += reg_cache_saved_regs;
4509     reg_cache_regs += RegSet::of(x28, x29, x30, x31); // t3, t4, t5, t6
4510     BufRegCache reg_cache(_masm, reg_cache_regs);
4511 
4512     RegSet saved_regs;
4513     if (multi_block) {
4514       saved_regs += RegSet::of(ofs, limit);
4515     }
4516     saved_regs += RegSet::of(state1, state2, state3);
4517     saved_regs += reg_cache_saved_regs;
4518 
4519     __ push_reg(saved_regs, sp);
4520 
4521     __ mv(buf, buf_arg);
4522     __ mv(state, state_arg);
4523     if (multi_block) {
4524       __ mv(ofs, ofs_arg);
4525       __ mv(limit, limit_arg);
4526     }
4527 
    // To minimize the number of memory operations:
    // read the four 4-byte state values in pairs, each with a single ld,
    // and split each pair into 2 registers.
    //
    // Since the core MD5 algorithm works on 32-bit words, the code below
    // does not care about the contents of the upper 32 bits of state[x].
    // Based on this observation, we can apply a further optimization:
    // simply ignore the upper 32 bits of state0/state2, rather than
    // clearing them explicitly with extra instructions.
4538     __ ld(state0, Address(state));
4539     __ srli(state1, state0, 32);
4540     __ ld(state2, Address(state, 8));
4541     __ srli(state3, state2, 32);
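
    // For illustration (not generated code): with state[] = {A, B, C, D} as 32-bit
    // words, the two ld's above yield, on this little-endian target,
    //   state0 = (B << 32) | A,  state1 = B,  state2 = (D << 32) | C,  state3 = D,
    // and the stray upper bits of state0/state2 are harmless because addw only
    // consumes the low 32 bits.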
4542 
4543     Label md5_loop;
4544     __ BIND(md5_loop);
4545 
4546     __ mv(a, state0);
4547     __ mv(b, state1);
4548     __ mv(c, state2);
4549     __ mv(d, state3);
4550 
4551     // Round 1
4552     reg_cache.gen_load(0, buf);
4553     md5_FF(reg_cache, a, b, c, d,  0, S11, 0xd76aa478, rtmp1, rtmp2);
4554     md5_FF(reg_cache, d, a, b, c,  1, S12, 0xe8c7b756, rtmp1, rtmp2);
4555     reg_cache.gen_load(1, buf);
4556     md5_FF(reg_cache, c, d, a, b,  2, S13, 0x242070db, rtmp1, rtmp2);
4557     md5_FF(reg_cache, b, c, d, a,  3, S14, 0xc1bdceee, rtmp1, rtmp2);
4558     reg_cache.gen_load(2, buf);
4559     md5_FF(reg_cache, a, b, c, d,  4, S11, 0xf57c0faf, rtmp1, rtmp2);
4560     md5_FF(reg_cache, d, a, b, c,  5, S12, 0x4787c62a, rtmp1, rtmp2);
4561     reg_cache.gen_load(3, buf);
4562     md5_FF(reg_cache, c, d, a, b,  6, S13, 0xa8304613, rtmp1, rtmp2);
4563     md5_FF(reg_cache, b, c, d, a,  7, S14, 0xfd469501, rtmp1, rtmp2);
4564     reg_cache.gen_load(4, buf);
4565     md5_FF(reg_cache, a, b, c, d,  8, S11, 0x698098d8, rtmp1, rtmp2);
4566     md5_FF(reg_cache, d, a, b, c,  9, S12, 0x8b44f7af, rtmp1, rtmp2);
4567     reg_cache.gen_load(5, buf);
4568     md5_FF(reg_cache, c, d, a, b, 10, S13, 0xffff5bb1, rtmp1, rtmp2);
4569     md5_FF(reg_cache, b, c, d, a, 11, S14, 0x895cd7be, rtmp1, rtmp2);
4570     reg_cache.gen_load(6, buf);
4571     md5_FF(reg_cache, a, b, c, d, 12, S11, 0x6b901122, rtmp1, rtmp2);
4572     md5_FF(reg_cache, d, a, b, c, 13, S12, 0xfd987193, rtmp1, rtmp2);
4573     reg_cache.gen_load(7, buf);
4574     md5_FF(reg_cache, c, d, a, b, 14, S13, 0xa679438e, rtmp1, rtmp2);
4575     md5_FF(reg_cache, b, c, d, a, 15, S14, 0x49b40821, rtmp1, rtmp2);
4576 
4577     // Round 2
4578     md5_GG(reg_cache, a, b, c, d,  1, S21, 0xf61e2562, rtmp1, rtmp2);
4579     md5_GG(reg_cache, d, a, b, c,  6, S22, 0xc040b340, rtmp1, rtmp2);
4580     md5_GG(reg_cache, c, d, a, b, 11, S23, 0x265e5a51, rtmp1, rtmp2);
4581     md5_GG(reg_cache, b, c, d, a,  0, S24, 0xe9b6c7aa, rtmp1, rtmp2);
4582     md5_GG(reg_cache, a, b, c, d,  5, S21, 0xd62f105d, rtmp1, rtmp2);
4583     md5_GG(reg_cache, d, a, b, c, 10, S22, 0x02441453, rtmp1, rtmp2);
4584     md5_GG(reg_cache, c, d, a, b, 15, S23, 0xd8a1e681, rtmp1, rtmp2);
4585     md5_GG(reg_cache, b, c, d, a,  4, S24, 0xe7d3fbc8, rtmp1, rtmp2);
4586     md5_GG(reg_cache, a, b, c, d,  9, S21, 0x21e1cde6, rtmp1, rtmp2);
4587     md5_GG(reg_cache, d, a, b, c, 14, S22, 0xc33707d6, rtmp1, rtmp2);
4588     md5_GG(reg_cache, c, d, a, b,  3, S23, 0xf4d50d87, rtmp1, rtmp2);
4589     md5_GG(reg_cache, b, c, d, a,  8, S24, 0x455a14ed, rtmp1, rtmp2);
4590     md5_GG(reg_cache, a, b, c, d, 13, S21, 0xa9e3e905, rtmp1, rtmp2);
4591     md5_GG(reg_cache, d, a, b, c,  2, S22, 0xfcefa3f8, rtmp1, rtmp2);
4592     md5_GG(reg_cache, c, d, a, b,  7, S23, 0x676f02d9, rtmp1, rtmp2);
4593     md5_GG(reg_cache, b, c, d, a, 12, S24, 0x8d2a4c8a, rtmp1, rtmp2);
4594 
4595     // Round 3
4596     md5_HH(reg_cache, a, b, c, d,  5, S31, 0xfffa3942, rtmp1, rtmp2);
4597     md5_HH(reg_cache, d, a, b, c,  8, S32, 0x8771f681, rtmp1, rtmp2);
4598     md5_HH(reg_cache, c, d, a, b, 11, S33, 0x6d9d6122, rtmp1, rtmp2);
4599     md5_HH(reg_cache, b, c, d, a, 14, S34, 0xfde5380c, rtmp1, rtmp2);
4600     md5_HH(reg_cache, a, b, c, d,  1, S31, 0xa4beea44, rtmp1, rtmp2);
4601     md5_HH(reg_cache, d, a, b, c,  4, S32, 0x4bdecfa9, rtmp1, rtmp2);
4602     md5_HH(reg_cache, c, d, a, b,  7, S33, 0xf6bb4b60, rtmp1, rtmp2);
4603     md5_HH(reg_cache, b, c, d, a, 10, S34, 0xbebfbc70, rtmp1, rtmp2);
4604     md5_HH(reg_cache, a, b, c, d, 13, S31, 0x289b7ec6, rtmp1, rtmp2);
4605     md5_HH(reg_cache, d, a, b, c,  0, S32, 0xeaa127fa, rtmp1, rtmp2);
4606     md5_HH(reg_cache, c, d, a, b,  3, S33, 0xd4ef3085, rtmp1, rtmp2);
4607     md5_HH(reg_cache, b, c, d, a,  6, S34, 0x04881d05, rtmp1, rtmp2);
4608     md5_HH(reg_cache, a, b, c, d,  9, S31, 0xd9d4d039, rtmp1, rtmp2);
4609     md5_HH(reg_cache, d, a, b, c, 12, S32, 0xe6db99e5, rtmp1, rtmp2);
4610     md5_HH(reg_cache, c, d, a, b, 15, S33, 0x1fa27cf8, rtmp1, rtmp2);
4611     md5_HH(reg_cache, b, c, d, a,  2, S34, 0xc4ac5665, rtmp1, rtmp2);
4612 
4613     // Round 4
4614     md5_II(reg_cache, a, b, c, d,  0, S41, 0xf4292244, rtmp1, rtmp2);
4615     md5_II(reg_cache, d, a, b, c,  7, S42, 0x432aff97, rtmp1, rtmp2);
4616     md5_II(reg_cache, c, d, a, b, 14, S43, 0xab9423a7, rtmp1, rtmp2);
4617     md5_II(reg_cache, b, c, d, a,  5, S44, 0xfc93a039, rtmp1, rtmp2);
4618     md5_II(reg_cache, a, b, c, d, 12, S41, 0x655b59c3, rtmp1, rtmp2);
4619     md5_II(reg_cache, d, a, b, c,  3, S42, 0x8f0ccc92, rtmp1, rtmp2);
4620     md5_II(reg_cache, c, d, a, b, 10, S43, 0xffeff47d, rtmp1, rtmp2);
4621     md5_II(reg_cache, b, c, d, a,  1, S44, 0x85845dd1, rtmp1, rtmp2);
4622     md5_II(reg_cache, a, b, c, d,  8, S41, 0x6fa87e4f, rtmp1, rtmp2);
4623     md5_II(reg_cache, d, a, b, c, 15, S42, 0xfe2ce6e0, rtmp1, rtmp2);
4624     md5_II(reg_cache, c, d, a, b,  6, S43, 0xa3014314, rtmp1, rtmp2);
4625     md5_II(reg_cache, b, c, d, a, 13, S44, 0x4e0811a1, rtmp1, rtmp2);
4626     md5_II(reg_cache, a, b, c, d,  4, S41, 0xf7537e82, rtmp1, rtmp2);
4627     md5_II(reg_cache, d, a, b, c, 11, S42, 0xbd3af235, rtmp1, rtmp2);
4628     md5_II(reg_cache, c, d, a, b,  2, S43, 0x2ad7d2bb, rtmp1, rtmp2);
4629     md5_II(reg_cache, b, c, d, a,  9, S44, 0xeb86d391, rtmp1, rtmp2);
4630 
4631     __ addw(state0, state0, a);
4632     __ addw(state1, state1, b);
4633     __ addw(state2, state2, c);
4634     __ addw(state3, state3, d);
4635 
4636     if (multi_block) {
4637       __ addi(buf, buf, 64);
4638       __ addi(ofs, ofs, 64);
      // if (ofs <= limit) goto md5_loop
4640       __ bge(limit, ofs, md5_loop);
4641       __ mv(c_rarg0, ofs); // return ofs
4642     }
4643 
4644     // to minimize the number of memory operations:
4645     // write back the 4 state 4-byte values in pairs, with a single sd
4646     __ mv(t0, mask32);
4647     __ andr(state0, state0, t0);
4648     __ slli(state1, state1, 32);
4649     __ orr(state0, state0, state1);
4650     __ sd(state0, Address(state));
4651     __ andr(state2, state2, t0);
4652     __ slli(state3, state3, 32);
4653     __ orr(state2, state2, state3);
4654     __ sd(state2, Address(state, 8));
4655 
4656     __ pop_reg(saved_regs, sp);
4657     __ ret();
4658 
4659     return (address) start;
4660   }
4661 
4662   /**
4663    * Perform the quarter round calculations on values contained within four vector registers.
4664    *
4665    * @param aVec the SIMD register containing only the "a" values
4666    * @param bVec the SIMD register containing only the "b" values
4667    * @param cVec the SIMD register containing only the "c" values
4668    * @param dVec the SIMD register containing only the "d" values
   * @param tmp_vr temporary vector register that holds intermediate values.
4670    */
4671   void chacha20_quarter_round(VectorRegister aVec, VectorRegister bVec,
4672                           VectorRegister cVec, VectorRegister dVec, VectorRegister tmp_vr) {
4673     // a += b, d ^= a, d <<<= 16
4674     __ vadd_vv(aVec, aVec, bVec);
4675     __ vxor_vv(dVec, dVec, aVec);
4676     __ vrole32_vi(dVec, 16, tmp_vr);
4677 
4678     // c += d, b ^= c, b <<<= 12
4679     __ vadd_vv(cVec, cVec, dVec);
4680     __ vxor_vv(bVec, bVec, cVec);
4681     __ vrole32_vi(bVec, 12, tmp_vr);
4682 
4683     // a += b, d ^= a, d <<<= 8
4684     __ vadd_vv(aVec, aVec, bVec);
4685     __ vxor_vv(dVec, dVec, aVec);
4686     __ vrole32_vi(dVec, 8, tmp_vr);
4687 
4688     // c += d, b ^= c, b <<<= 7
4689     __ vadd_vv(cVec, cVec, dVec);
4690     __ vxor_vv(bVec, bVec, cVec);
4691     __ vrole32_vi(bVec, 7, tmp_vr);
4692   }
4693 
4694   /**
4695    * int com.sun.crypto.provider.ChaCha20Cipher.implChaCha20Block(int[] initState, byte[] result)
4696    *
4697    *  Input arguments:
4698    *  c_rarg0   - state, the starting state
4699    *  c_rarg1   - key_stream, the array that will hold the result of the ChaCha20 block function
4700    *
4701    *  Implementation Note:
4702    *   Parallelization is achieved by loading individual state elements into vectors for N blocks.
   *   N depends on the vector register length.
4704    */
4705   address generate_chacha20Block() {
4706     Label L_Rounds;
4707 
4708     __ align(CodeEntryAlignment);
4709     StubCodeMark mark(this, "StubRoutines", "chacha20Block");
4710     address start = __ pc();
4711     __ enter();
4712 
4713     const int states_len = 16;
4714     const int step = 4;
4715     const Register state = c_rarg0;
4716     const Register key_stream = c_rarg1;
4717     const Register tmp_addr = t0;
4718     const Register length = t1;
4719 
4720     // Organize vector registers in an array that facilitates
4721     // putting repetitive opcodes into loop structures below.
4722     const VectorRegister work_vrs[16] = {
4723       v0, v1, v2,  v3,  v4,  v5,  v6,  v7,
4724       v8, v9, v10, v11, v12, v13, v14, v15
4725     };
4726     const VectorRegister tmp_vr = v16;
4727     const VectorRegister counter_vr = v17;
4728 
4729     {
      // Put 16 here, as com.sun.crypto.provider.ChaCha20Cipher.KS_MAX_LEN is 1024
      // at the Java level.
4732       __ vsetivli(length, 16, Assembler::e32, Assembler::m1);
4733     }
4734 
4735     // Load from source state.
4736     // Every element in source state is duplicated to all elements in the corresponding vector.
4737     __ mv(tmp_addr, state);
4738     for (int i = 0; i < states_len; i += 1) {
4739       __ vlse32_v(work_vrs[i], tmp_addr, zr);
4740       __ addi(tmp_addr, tmp_addr, step);
4741     }
4742     // Adjust counter for every individual block.
4743     __ vid_v(counter_vr);
4744     __ vadd_vv(work_vrs[12], work_vrs[12], counter_vr);
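    // Illustration: with stride zr (0), each vlse32 above broadcasts state[i] to
    // every lane, so lane j of the 16 work vectors holds a full copy of the state;
    // adding vid to work_vrs[12] then makes lane j compute block (counter + j).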
4745 
4746     // Perform 10 iterations of the 8 quarter round set
4747     {
4748       const Register loop = t2; // share t2 with other non-overlapping usages.
4749       __ mv(loop, 10);
4750       __ BIND(L_Rounds);
4751 
4752       chacha20_quarter_round(work_vrs[0], work_vrs[4], work_vrs[8],  work_vrs[12], tmp_vr);
4753       chacha20_quarter_round(work_vrs[1], work_vrs[5], work_vrs[9],  work_vrs[13], tmp_vr);
4754       chacha20_quarter_round(work_vrs[2], work_vrs[6], work_vrs[10], work_vrs[14], tmp_vr);
4755       chacha20_quarter_round(work_vrs[3], work_vrs[7], work_vrs[11], work_vrs[15], tmp_vr);
4756 
4757       chacha20_quarter_round(work_vrs[0], work_vrs[5], work_vrs[10], work_vrs[15], tmp_vr);
4758       chacha20_quarter_round(work_vrs[1], work_vrs[6], work_vrs[11], work_vrs[12], tmp_vr);
4759       chacha20_quarter_round(work_vrs[2], work_vrs[7], work_vrs[8],  work_vrs[13], tmp_vr);
4760       chacha20_quarter_round(work_vrs[3], work_vrs[4], work_vrs[9],  work_vrs[14], tmp_vr);
4761 
4762       __ sub(loop, loop, 1);
4763       __ bnez(loop, L_Rounds);
4764     }
4765 
4766     // Add the original state into the end working state.
4767     // We do this by first duplicating every element in source state array to the corresponding
4768     // vector, then adding it to the post-loop working state.
4769     __ mv(tmp_addr, state);
4770     for (int i = 0; i < states_len; i += 1) {
4771       __ vlse32_v(tmp_vr, tmp_addr, zr);
4772       __ addi(tmp_addr, tmp_addr, step);
4773       __ vadd_vv(work_vrs[i], work_vrs[i], tmp_vr);
4774     }
4775     // Add the counter overlay onto work_vrs[12] at the end.
4776     __ vadd_vv(work_vrs[12], work_vrs[12], counter_vr);
4777 
4778     // Store result to key stream.
4779     {
4780       const Register stride = t2; // share t2 with other non-overlapping usages.
4781       // Every block occupies 64 bytes, so we use 64 as stride of the vector store.
4782       __ mv(stride, 64);
4783       for (int i = 0; i < states_len; i += 1) {
4784         __ vsse32_v(work_vrs[i], key_stream, stride);
4785         __ addi(key_stream, key_stream, step);
4786       }
4787     }
4788 
4789     // Return length of output key_stream
4790     __ slli(c_rarg0, length, 6);
4791 
4792     __ leave();
4793     __ ret();
4794 
4795     return (address) start;
4796   }
4797 
4798 
4799   // ------------------------ SHA-1 intrinsic ------------------------
4800 
4801   // K't =
4802   //    5a827999, 0  <= t <= 19
4803   //    6ed9eba1, 20 <= t <= 39
4804   //    8f1bbcdc, 40 <= t <= 59
4805   //    ca62c1d6, 60 <= t <= 79
4806   void sha1_prepare_k(Register cur_k, int round) {
4807     assert(round >= 0 && round < 80, "must be");
4808 
4809     static const int64_t ks[] = {0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6};
4810     if ((round % 20) == 0) {
4811       __ mv(cur_k, ks[round/20]);
4812     }
4813   }
4814 
4815   // W't =
4816   //    M't,                                      0 <=  t <= 15
4817   //    ROTL'1(W't-3 ^ W't-8 ^ W't-14 ^ W't-16),  16 <= t <= 79
4818   void sha1_prepare_w(Register cur_w, Register ws[], Register buf, int round) {
4819     assert(round >= 0 && round < 80, "must be");
4820 
4821     if (round < 16) {
4822       // in the first 16 rounds, in ws[], every register contains 2 W't, e.g.
4823       //   in ws[0], high part contains W't-0, low part contains W't-1,
4824       //   in ws[1], high part contains W't-2, low part contains W't-3,
4825       //   ...
4826       //   in ws[7], high part contains W't-14, low part contains W't-15.
4827 
4828       if ((round % 2) == 0) {
4829         __ ld(ws[round/2], Address(buf, (round/2) * 8));
4830         // reverse bytes, as SHA-1 is defined in big-endian.
4831         __ revb(ws[round/2], ws[round/2]);
4832         __ srli(cur_w, ws[round/2], 32);
4833       } else {
4834         __ mv(cur_w, ws[round/2]);
4835       }
4836 
4837       return;
4838     }
4839 
4840     if ((round % 2) == 0) {
4841       int idx = 16;
4842       // W't = ROTL'1(W't-3 ^ W't-8 ^ W't-14 ^ W't-16),  16 <= t <= 79
4843       __ srli(t1, ws[(idx-8)/2], 32);
4844       __ xorr(t0, ws[(idx-3)/2], t1);
4845 
4846       __ srli(t1, ws[(idx-14)/2], 32);
4847       __ srli(cur_w, ws[(idx-16)/2], 32);
4848       __ xorr(cur_w, cur_w, t1);
4849 
4850       __ xorr(cur_w, cur_w, t0);
4851       __ rolw_imm(cur_w, cur_w, 1, t0);
4852 
4853       // copy the cur_w value to ws[8].
4854       // now, valid w't values are at:
4855       //  w0:       ws[0]'s lower 32 bits
4856       //  w1 ~ w14: ws[1] ~ ws[7]
4857       //  w15:      ws[8]'s higher 32 bits
4858       __ slli(ws[idx/2], cur_w, 32);
4859 
4860       return;
4861     }
4862 
4863     int idx = 17;
4864     // W't = ROTL'1(W't-3 ^ W't-8 ^ W't-14 ^ W't-16),  16 <= t <= 79
4865     __ srli(t1, ws[(idx-3)/2], 32);
4866     __ xorr(t0, t1, ws[(idx-8)/2]);
4867 
4868     __ xorr(cur_w, ws[(idx-16)/2], ws[(idx-14)/2]);
4869 
4870     __ xorr(cur_w, cur_w, t0);
4871     __ rolw_imm(cur_w, cur_w, 1, t0);
4872 
4873     // copy the cur_w value to ws[8]
4874     __ zero_extend(cur_w, cur_w, 32);
4875     __ orr(ws[idx/2], ws[idx/2], cur_w);
4876 
4877     // shift the w't registers, so they start from ws[0] again.
4878     // now, valid w't values are at:
4879     //  w0 ~ w15: ws[0] ~ ws[7]
4880     Register ws_0 = ws[0];
4881     for (int i = 0; i < 16/2; i++) {
4882       ws[i] = ws[i+1];
4883     }
4884     ws[8] = ws_0;
4885   }
4886 
4887   // f't(x, y, z) =
4888   //    Ch(x, y, z)     = (x & y) ^ (~x & z)            , 0  <= t <= 19
4889   //    Parity(x, y, z) = x ^ y ^ z                     , 20 <= t <= 39
4890   //    Maj(x, y, z)    = (x & y) ^ (x & z) ^ (y & z)   , 40 <= t <= 59
4891   //    Parity(x, y, z) = x ^ y ^ z                     , 60 <= t <= 79
4892   void sha1_f(Register dst, Register x, Register y, Register z, int round) {
4893     assert(round >= 0 && round < 80, "must be");
4894     assert_different_registers(dst, x, y, z, t0, t1);
4895 
4896     if (round < 20) {
4897       // (x & y) ^ (~x & z)
4898       __ andr(t0, x, y);
4899       __ andn(dst, z, x);
4900       __ xorr(dst, dst, t0);
4901     } else if (round >= 40 && round < 60) {
4902       // (x & y) ^ (x & z) ^ (y & z)
4903       __ andr(t0, x, y);
4904       __ andr(t1, x, z);
4905       __ andr(dst, y, z);
4906       __ xorr(dst, dst, t0);
4907       __ xorr(dst, dst, t1);
4908     } else {
4909       // x ^ y ^ z
4910       __ xorr(dst, x, y);
4911       __ xorr(dst, dst, z);
4912     }
4913   }
4914 
4915   // T = ROTL'5(a) + f't(b, c, d) + e + K't + W't
4916   // e = d
4917   // d = c
4918   // c = ROTL'30(b)
4919   // b = a
4920   // a = T
4921   void sha1_process_round(Register a, Register b, Register c, Register d, Register e,
4922                           Register cur_k, Register cur_w, Register tmp, int round) {
4923     assert(round >= 0 && round < 80, "must be");
4924     assert_different_registers(a, b, c, d, e, cur_w, cur_k, tmp, t0);
4925 
4926     // T = ROTL'5(a) + f't(b, c, d) + e + K't + W't
4927 
    // cur_w will be recalculated at the beginning of each round,
    // so we can reuse it as a temp register here.
4930     Register tmp2 = cur_w;
4931 
    // reuse e as a temporary register, as we will mv a new value into it later
4933     Register tmp3 = e;
4934     __ add(tmp2, cur_k, tmp2);
4935     __ add(tmp3, tmp3, tmp2);
4936     __ rolw_imm(tmp2, a, 5, t0);
4937 
4938     sha1_f(tmp, b, c, d, round);
4939 
4940     __ add(tmp2, tmp2, tmp);
4941     __ add(tmp2, tmp2, tmp3);
4942 
4943     // e = d
4944     // d = c
4945     // c = ROTL'30(b)
4946     // b = a
4947     // a = T
4948     __ mv(e, d);
4949     __ mv(d, c);
4950 
4951     __ rolw_imm(c, b, 30);
4952     __ mv(b, a);
4953     __ mv(a, tmp2);
4954   }
4955 
4956   // H(i)0 = a + H(i-1)0
4957   // H(i)1 = b + H(i-1)1
4958   // H(i)2 = c + H(i-1)2
4959   // H(i)3 = d + H(i-1)3
4960   // H(i)4 = e + H(i-1)4
4961   void sha1_calculate_im_hash(Register a, Register b, Register c, Register d, Register e,
4962                               Register prev_ab, Register prev_cd, Register prev_e) {
4963     assert_different_registers(a, b, c, d, e, prev_ab, prev_cd, prev_e);
4964 
4965     __ add(a, a, prev_ab);
4966     __ srli(prev_ab, prev_ab, 32);
4967     __ add(b, b, prev_ab);
4968 
4969     __ add(c, c, prev_cd);
4970     __ srli(prev_cd, prev_cd, 32);
4971     __ add(d, d, prev_cd);
4972 
4973     __ add(e, e, prev_e);
4974   }
4975 
4976   void sha1_preserve_prev_abcde(Register a, Register b, Register c, Register d, Register e,
4977                                 Register prev_ab, Register prev_cd, Register prev_e) {
4978     assert_different_registers(a, b, c, d, e, prev_ab, prev_cd, prev_e, t0);
4979 
4980     __ slli(t0, b, 32);
4981     __ zero_extend(prev_ab, a, 32);
4982     __ orr(prev_ab, prev_ab, t0);
4983 
4984     __ slli(t0, d, 32);
4985     __ zero_extend(prev_cd, c, 32);
4986     __ orr(prev_cd, prev_cd, t0);
4987 
4988     __ mv(prev_e, e);
4989   }
4990 
4991   // Intrinsic for:
4992   //   void sun.security.provider.SHA.implCompress0(byte[] buf, int ofs)
4993   //   void sun.security.provider.DigestBase.implCompressMultiBlock0(byte[] b, int ofs, int limit)
4994   //
4995   // Arguments:
4996   //
4997   // Inputs:
4998   //   c_rarg0: byte[]  src array + offset
4999   //   c_rarg1: int[]   SHA.state
5000   //   - - - - - - below are only for implCompressMultiBlock0 - - - - - -
5001   //   c_rarg2: int     offset
5002   //   c_rarg3: int     limit
5003   //
5004   // Outputs:
5005   //   - - - - - - below are only for implCompressMultiBlock0 - - - - - -
5006   //   c_rarg0: int offset, when (multi_block == true)
5007   //
5008   address generate_sha1_implCompress(bool multi_block, const char *name) {
5009     __ align(CodeEntryAlignment);
5010     StubCodeMark mark(this, "StubRoutines", name);
5011 
5012     address start = __ pc();
5013     __ enter();
5014 
5015     RegSet saved_regs = RegSet::range(x18, x27);
5016     if (multi_block) {
5017       // use x9 as src below.
5018       saved_regs += RegSet::of(x9);
5019     }
5020     __ push_reg(saved_regs, sp);
5021 
5022     // c_rarg0 - c_rarg3: x10 - x13
5023     Register buf    = c_rarg0;
5024     Register state  = c_rarg1;
5025     Register offset = c_rarg2;
5026     Register limit  = c_rarg3;
5027     // use src to contain the original start point of the array.
5028     Register src    = x9;
5029 
5030     if (multi_block) {
5031       __ sub(limit, limit, offset);
5032       __ add(limit, limit, buf);
5033       __ sub(src, buf, offset);
5034     }
5035 
5036     // [args-reg]:  x14 - x17
5037     // [temp-reg]:  x28 - x31
5038     // [saved-reg]: x18 - x27
5039 
5040     // h0/1/2/3/4
5041     const Register a = x14, b = x15, c = x16, d = x17, e = x28;
5042     // w0, w1, ... w15
    // put two adjacent w's in one register:
    //    one in the high word, the other in the low word
    // depending on the round (even or odd), the w't values reside in different slots of ws[].
5046     // w0 ~ w15, either reside in
5047     //    ws[0] ~ ws[7], where
5048     //      w0 at higher 32 bits of ws[0],
5049     //      w1 at lower 32 bits of ws[0],
5050     //      ...
5051     //      w14 at higher 32 bits of ws[7],
5052     //      w15 at lower 32 bits of ws[7].
5053     // or, reside in
5054     //    w0:       ws[0]'s lower 32 bits
5055     //    w1 ~ w14: ws[1] ~ ws[7]
5056     //    w15:      ws[8]'s higher 32 bits
5057     Register ws[9] = {x29, x30, x31, x18,
5058                       x19, x20, x21, x22,
5059                       x23}; // auxiliary register for calculating w's value
5060     // current k't's value
5061     const Register cur_k = x24;
5062     // current w't's value
5063     const Register cur_w = x25;
5064     // values of a, b, c, d, e in the previous round
5065     const Register prev_ab = x26, prev_cd = x27;
5066     const Register prev_e = offset; // reuse offset/c_rarg2
5067 
5068     // load 5 words state into a, b, c, d, e.
5069     //
    // To minimize the number of memory operations, we apply the following
    // optimization: read the 4-byte state values (a/b/c/d) in pairs,
    // each with a single ld, and split each pair into 2 registers.
    //
    // Since the core SHA-1 algorithm works on 32-bit words, the code below
    // does not care about the contents of the upper 32 bits of a/b/c/d/e.
    // Based on this observation, we can apply a further optimization:
    // simply ignore the upper 32 bits of a/c/e, rather than clearing
    // them explicitly with extra instructions.
5080     __ ld(a, Address(state, 0));
5081     __ srli(b, a, 32);
5082     __ ld(c, Address(state, 8));
5083     __ srli(d, c, 32);
5084     __ lw(e, Address(state, 16));
5085 
5086     Label L_sha1_loop;
5087     if (multi_block) {
5088       __ BIND(L_sha1_loop);
5089     }
5090 
5091     sha1_preserve_prev_abcde(a, b, c, d, e, prev_ab, prev_cd, prev_e);
5092 
5093     for (int round = 0; round < 80; round++) {
5094       // prepare K't value
5095       sha1_prepare_k(cur_k, round);
5096 
5097       // prepare W't value
5098       sha1_prepare_w(cur_w, ws, buf, round);
5099 
5100       // one round process
5101       sha1_process_round(a, b, c, d, e, cur_k, cur_w, t2, round);
5102     }
5103 
5104     // compute the intermediate hash value
5105     sha1_calculate_im_hash(a, b, c, d, e, prev_ab, prev_cd, prev_e);
5106 
5107     if (multi_block) {
5108       int64_t block_bytes = 16 * 4;
5109       __ addi(buf, buf, block_bytes);
5110 
5111       __ bge(limit, buf, L_sha1_loop, true);
5112     }
5113 
5114     // store back the state.
5115     __ zero_extend(a, a, 32);
5116     __ slli(b, b, 32);
5117     __ orr(a, a, b);
5118     __ sd(a, Address(state, 0));
5119     __ zero_extend(c, c, 32);
5120     __ slli(d, d, 32);
5121     __ orr(c, c, d);
5122     __ sd(c, Address(state, 8));
5123     __ sw(e, Address(state, 16));
5124 
5125     // return offset
5126     if (multi_block) {
5127       __ sub(c_rarg0, buf, src);
5128     }
5129 
5130     __ pop_reg(saved_regs, sp);
5131 
5132     __ leave();
5133     __ ret();
5134 
5135     return (address) start;
5136   }
5137 
5138   /**
5139    * vector registers:
   *   input VectorRegisters:  inputV1-V3, for m2 they could be v2, v4, v6, for m1 they could be v1, v2, v3
   *   index VectorRegisters:  idxV1-V4, for m2 they could be v8, v10, v12, v14, for m1 they could be v4, v5, v6, v7
   *   output VectorRegisters: outputV1-V4, for m2 they could be v16, v18, v20, v22, for m1 they could be v8, v9, v10, v11
5143    *
5144    * NOTE: each field will occupy a vector register group
5145    */
5146   void base64_vector_encode_round(Register src, Register dst, Register codec,
5147                     Register size, Register stepSrc, Register stepDst,
5148                     VectorRegister inputV1, VectorRegister inputV2, VectorRegister inputV3,
5149                     VectorRegister idxV1, VectorRegister idxV2, VectorRegister idxV3, VectorRegister idxV4,
5150                     VectorRegister outputV1, VectorRegister outputV2, VectorRegister outputV3, VectorRegister outputV4,
5151                     Assembler::LMUL lmul) {
5152     // set vector register type/len
5153     __ vsetvli(x0, size, Assembler::e8, lmul);
5154 
5155     // segmented load src into v registers: mem(src) => vr(3)
5156     __ vlseg3e8_v(inputV1, src);
5157 
5158     // src = src + register_group_len_bytes * 3
5159     __ add(src, src, stepSrc);
5160 
5161     // encoding
5162     //   1. compute index into lookup table: vr(3) => vr(4)
5163     __ vsrl_vi(idxV1, inputV1, 2);
5164 
5165     __ vsrl_vi(idxV2, inputV2, 2);
5166     __ vsll_vi(inputV1, inputV1, 6);
5167     __ vor_vv(idxV2, idxV2, inputV1);
5168     __ vsrl_vi(idxV2, idxV2, 2);
5169 
5170     __ vsrl_vi(idxV3, inputV3, 4);
5171     __ vsll_vi(inputV2, inputV2, 4);
5172     __ vor_vv(idxV3, inputV2, idxV3);
5173     __ vsrl_vi(idxV3, idxV3, 2);
5174 
5175     __ vsll_vi(idxV4, inputV3, 2);
5176     __ vsrl_vi(idxV4, idxV4, 2);
5177 
5178     //   2. indexed load: vr(4) => vr(4)
5179     __ vluxei8_v(outputV1, codec, idxV1);
5180     __ vluxei8_v(outputV2, codec, idxV2);
5181     __ vluxei8_v(outputV3, codec, idxV3);
5182     __ vluxei8_v(outputV4, codec, idxV4);
5183 
5184     // segmented store encoded data in v registers back to dst: vr(4) => mem(dst)
5185     __ vsseg4e8_v(outputV1, dst);
5186 
5187     // dst = dst + register_group_len_bytes * 4
5188     __ add(dst, dst, stepDst);
5189   }
5190 
5191   /**
5192    *  void j.u.Base64.Encoder.encodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL)
5193    *
5194    *  Input arguments:
5195    *  c_rarg0   - src, source array
5196    *  c_rarg1   - sp, src start offset
5197    *  c_rarg2   - sl, src end offset
5198    *  c_rarg3   - dst, dest array
5199    *  c_rarg4   - dp, dst start offset
5200    *  c_rarg5   - isURL, Base64 or URL character set
5201    */
5202   address generate_base64_encodeBlock() {
5203     alignas(64) static const char toBase64[64] = {
5204       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
5205       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
5206       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
5207       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
5208       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
5209     };
5210 
5211     alignas(64) static const char toBase64URL[64] = {
5212       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
5213       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
5214       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
5215       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
5216       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
5217     };
5218 
5219     __ align(CodeEntryAlignment);
5220     StubCodeMark mark(this, "StubRoutines", "encodeBlock");
5221     address start = __ pc();
5222     __ enter();
5223 
5224     Register src    = c_rarg0;
5225     Register soff   = c_rarg1;
5226     Register send   = c_rarg2;
5227     Register dst    = c_rarg3;
5228     Register doff   = c_rarg4;
5229     Register isURL  = c_rarg5;
5230 
5231     Register codec  = c_rarg6;
5232     Register length = c_rarg7; // total length of src data in bytes
5233 
5234     Label ProcessData, Exit;
5235 
5236     // length should be multiple of 3
5237     __ sub(length, send, soff);
5238     // real src/dst to process data
5239     __ add(src, src, soff);
5240     __ add(dst, dst, doff);
5241 
5242     // load the codec base address
5243     __ la(codec, ExternalAddress((address) toBase64));
5244     __ beqz(isURL, ProcessData);
5245     __ la(codec, ExternalAddress((address) toBase64URL));
5246     __ BIND(ProcessData);
5247 
5248     // vector version
5249     if (UseRVV) {
5250       Label ProcessM2, ProcessM1, ProcessScalar;
5251 
5252       Register size      = soff;
5253       Register stepSrcM1 = send;
5254       Register stepSrcM2 = doff;
5255       Register stepDst   = isURL;
5256 
5257       __ mv(size, MaxVectorSize * 2);
5258       __ mv(stepSrcM1, MaxVectorSize * 3);
5259       __ slli(stepSrcM2, stepSrcM1, 1);
5260       __ mv(stepDst, MaxVectorSize * 2 * 4);
5261 
5262       __ blt(length, stepSrcM2, ProcessM1);
5263 
5264       __ BIND(ProcessM2);
5265       base64_vector_encode_round(src, dst, codec,
5266                     size, stepSrcM2, stepDst,
5267                     v2, v4, v6,         // inputs
5268                     v8, v10, v12, v14,  // indexes
5269                     v16, v18, v20, v22, // outputs
5270                     Assembler::m2);
5271 
5272       __ sub(length, length, stepSrcM2);
5273       __ bge(length, stepSrcM2, ProcessM2);
5274 
5275       __ BIND(ProcessM1);
5276       __ blt(length, stepSrcM1, ProcessScalar);
5277 
5278       __ srli(size, size, 1);
5279       __ srli(stepDst, stepDst, 1);
5280       base64_vector_encode_round(src, dst, codec,
5281                     size, stepSrcM1, stepDst,
5282                     v1, v2, v3,         // inputs
5283                     v4, v5, v6, v7,     // indexes
5284                     v8, v9, v10, v11,   // outputs
5285                     Assembler::m1);
5286       __ sub(length, length, stepSrcM1);
5287 
5288       __ BIND(ProcessScalar);
5289     }
5290 
5291     // scalar version
5292     {
5293       Register byte1 = soff, byte0 = send, byte2 = doff;
5294       Register combined24Bits = isURL;
5295 
5296       __ beqz(length, Exit);
5297 
5298       Label ScalarLoop;
5299       __ BIND(ScalarLoop);
5300       {
5301         // plain:   [byte0[7:0] : byte1[7:0] : byte2[7:0]] =>
5302         // encoded: [byte0[7:2] : byte0[1:0]+byte1[7:4] : byte1[3:0]+byte2[7:6] : byte2[5:0]]
5303 
5304         // load 3 bytes src data
5305         __ lbu(byte0, Address(src, 0));
5306         __ lbu(byte1, Address(src, 1));
5307         __ lbu(byte2, Address(src, 2));
5308         __ addi(src, src, 3);
5309 
5310         // construct 24 bits from 3 bytes
5311         __ slliw(byte0, byte0, 16);
5312         __ slliw(byte1, byte1, 8);
5313         __ orr(combined24Bits, byte0, byte1);
5314         __ orr(combined24Bits, combined24Bits, byte2);
5315 
        // get codec index and encode (i.e. load from codec by index)
5317         __ slliw(byte0, combined24Bits, 8);
5318         __ srliw(byte0, byte0, 26);
5319         __ add(byte0, codec, byte0);
        __ lbu(byte0, Address(byte0, 0));
5321 
5322         __ slliw(byte1, combined24Bits, 14);
5323         __ srliw(byte1, byte1, 26);
5324         __ add(byte1, codec, byte1);
        __ lbu(byte1, Address(byte1, 0));
5326 
5327         __ slliw(byte2, combined24Bits, 20);
5328         __ srliw(byte2, byte2, 26);
5329         __ add(byte2, codec, byte2);
        __ lbu(byte2, Address(byte2, 0));
5331 
5332         __ andi(combined24Bits, combined24Bits, 0x3f);
5333         __ add(combined24Bits, codec, combined24Bits);
        __ lbu(combined24Bits, Address(combined24Bits, 0));
5335 
5336         // store 4 bytes encoded data
5337         __ sb(byte0, Address(dst, 0));
5338         __ sb(byte1, Address(dst, 1));
5339         __ sb(byte2, Address(dst, 2));
5340         __ sb(combined24Bits, Address(dst, 3));
5341 
5342         __ sub(length, length, 3);
5343         __ addi(dst, dst, 4);
5344         // loop back
5345         __ bnez(length, ScalarLoop);
5346       }
5347     }
5348 
5349     __ BIND(Exit);
5350 
5351     __ leave();
5352     __ ret();
5353 
5354     return (address) start;
5355   }
5356 
5357   /**
5358    * vector registers:
   * input VectorRegisters:  inputV1-V4, for m2 they could be v2, v4, v6, v8, for m1 they could be v1, v2, v3, v4
   * index VectorRegisters:  idxV1-V4, for m2 they could be v10, v12, v14, v16, for m1 they could be v5, v6, v7, v8
   * output VectorRegisters: outputV1-V3, for m2 they could be v18, v20, v22, for m1 they could be v9, v10, v11
5362    *
5363    * NOTE: each field will occupy a single vector register group
5364    */
5365   void base64_vector_decode_round(Register src, Register dst, Register codec,
5366                     Register size, Register stepSrc, Register stepDst, Register failedIdx, Register minusOne,
5367                     VectorRegister inputV1, VectorRegister inputV2, VectorRegister inputV3, VectorRegister inputV4,
5368                     VectorRegister idxV1, VectorRegister idxV2, VectorRegister idxV3, VectorRegister idxV4,
5369                     VectorRegister outputV1, VectorRegister outputV2, VectorRegister outputV3,
5370                     Assembler::LMUL lmul) {
5371     // set vector register type/len
5372     __ vsetvli(x0, size, Assembler::e8, lmul, Assembler::ma, Assembler::ta);
5373 
5374     // segmented load src into v registers: mem(src) => vr(4)
5375     __ vlseg4e8_v(inputV1, src);
5376 
5377     // src = src + register_group_len_bytes * 4
5378     __ add(src, src, stepSrc);
5379 
5380     // decoding
5381     //   1. indexed load: vr(4) => vr(4)
5382     __ vluxei8_v(idxV1, codec, inputV1);
5383     __ vluxei8_v(idxV2, codec, inputV2);
5384     __ vluxei8_v(idxV3, codec, inputV3);
5385     __ vluxei8_v(idxV4, codec, inputV4);
5386 
5387     //   2. check wrong data
5388     __ vor_vv(outputV1, idxV1, idxV2);
5389     __ vor_vv(outputV2, idxV3, idxV4);
5390     __ vor_vv(outputV1, outputV1, outputV2);
5391     __ vmseq_vi(v0, outputV1, -1);
5392     __ vfirst_m(failedIdx, v0);
5393     Label NoFailure;
5394     __ beq(failedIdx, minusOne, NoFailure);
5395     __ vsetvli(x0, failedIdx, Assembler::e8, lmul, Assembler::mu, Assembler::tu);
5396     __ slli(stepDst, failedIdx, 1);
5397     __ add(stepDst, failedIdx, stepDst);
5398     __ BIND(NoFailure);
5399 
5400     //   3. compute the decoded data: vr(4) => vr(3)
5401     __ vsll_vi(idxV1, idxV1, 2);
5402     __ vsrl_vi(outputV1, idxV2, 4);
5403     __ vor_vv(outputV1, outputV1, idxV1);
5404 
5405     __ vsll_vi(idxV2, idxV2, 4);
5406     __ vsrl_vi(outputV2, idxV3, 2);
5407     __ vor_vv(outputV2, outputV2, idxV2);
5408 
5409     __ vsll_vi(idxV3, idxV3, 6);
5410     __ vor_vv(outputV3, idxV4, idxV3);
5411 
5412     // segmented store encoded data in v registers back to dst: vr(3) => mem(dst)
5413     __ vsseg3e8_v(outputV1, dst);
5414 
5415     // dst = dst + register_group_len_bytes * 3
5416     __ add(dst, dst, stepDst);
5417   }
5418 
5419   /**
5420    * int j.u.Base64.Decoder.decodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL, boolean isMIME)
5421    *
5422    *  Input arguments:
5423    *  c_rarg0   - src, source array
5424    *  c_rarg1   - sp, src start offset
5425    *  c_rarg2   - sl, src end offset
5426    *  c_rarg3   - dst, dest array
5427    *  c_rarg4   - dp, dst start offset
5428    *  c_rarg5   - isURL, Base64 or URL character set
5429    *  c_rarg6   - isMIME, Decoding MIME block
5430    */
5431   address generate_base64_decodeBlock() {
5432 
5433     static const uint8_t fromBase64[256] = {
5434         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5435         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5436         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
5437         52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
5438         255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
5439         15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u, 255u,
5440         255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
5441         41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
5442         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5443         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5444         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5445         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5446         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5447         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5448         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5449         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5450     };
5451 
5452     static const uint8_t fromBase64URL[256] = {
5453         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5454         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5455         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
5456         52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
5457         255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
5458         15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,  63u,
5459         255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
5460         41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
5461         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5462         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5463         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5464         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5465         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5466         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5467         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5468         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5469     };
5470 
5471     __ align(CodeEntryAlignment);
5472     StubCodeMark mark(this, "StubRoutines", "decodeBlock");
5473     address start = __ pc();
5474     __ enter();
5475 
5476     Register src    = c_rarg0;
5477     Register soff   = c_rarg1;
5478     Register send   = c_rarg2;
5479     Register dst    = c_rarg3;
5480     Register doff   = c_rarg4;
5481     Register isURL  = c_rarg5;
5482     Register isMIME = c_rarg6;
5483 
5484     Register codec     = c_rarg7;
5485     Register dstBackup = x31;
5486     Register length    = x28;     // t3, total length of src data in bytes
5487 
5488     Label ProcessData, Exit;
5489     Label ProcessScalar, ScalarLoop;
5490 
    // the passed-in length (send - soff) is guaranteed to be > 4,
    // and this intrinsic only processes data whose length is a multiple of 4;
    // the Java level does not guarantee that, so enforce it explicitly here.
5494     __ sub(length, send, soff);
5495     __ andi(length, length, -4);
5496     // real src/dst to process data
5497     __ add(src, src, soff);
5498     __ add(dst, dst, doff);
5499     // backup of dst, used to calculate the return value at exit
5500     __ mv(dstBackup, dst);
5501 
5502     // load the codec base address
5503     __ la(codec, ExternalAddress((address) fromBase64));
5504     __ beqz(isURL, ProcessData);
5505     __ la(codec, ExternalAddress((address) fromBase64URL));
5506     __ BIND(ProcessData);
5507 
5508     // vector version
5509     if (UseRVV) {
      // in the MIME case there is a default line-length limit of 76, which can be
      // different (smaller) from (send - soff), so for MIME we go through
      // the scalar code path directly.
5513       __ bnez(isMIME, ScalarLoop);
5514 
5515       Label ProcessM1, ProcessM2;
5516 
5517       Register failedIdx = soff;
5518       Register stepSrcM1 = send;
5519       Register stepSrcM2 = doff;
5520       Register stepDst   = isURL;
5521       Register size      = x29;   // t4
5522       Register minusOne  = x30;   // t5
5523 
5524       __ mv(minusOne, -1);
5525       __ mv(size, MaxVectorSize * 2);
5526       __ mv(stepSrcM1, MaxVectorSize * 4);
5527       __ slli(stepSrcM2, stepSrcM1, 1);
5528       __ mv(stepDst, MaxVectorSize * 2 * 3);
5529 
5530       __ blt(length, stepSrcM2, ProcessM1);
5531 
5532 
5533       // Assembler::m2
5534       __ BIND(ProcessM2);
5535       base64_vector_decode_round(src, dst, codec,
5536                     size, stepSrcM2, stepDst, failedIdx, minusOne,
5537                     v2, v4, v6, v8,      // inputs
5538                     v10, v12, v14, v16,  // indexes
5539                     v18, v20, v22,       // outputs
5540                     Assembler::m2);
5541       __ sub(length, length, stepSrcM2);
5542 
5543       // error check
5544       __ bne(failedIdx, minusOne, Exit);
5545 
5546       __ bge(length, stepSrcM2, ProcessM2);
5547 
5548 
5549       // Assembler::m1
5550       __ BIND(ProcessM1);
5551       __ blt(length, stepSrcM1, ProcessScalar);
5552 
5553       __ srli(size, size, 1);
5554       __ srli(stepDst, stepDst, 1);
5555       base64_vector_decode_round(src, dst, codec,
5556                     size, stepSrcM1, stepDst, failedIdx, minusOne,
5557                     v1, v2, v3, v4,      // inputs
5558                     v5, v6, v7, v8,      // indexes
5559                     v9, v10, v11,        // outputs
5560                     Assembler::m1);
5561       __ sub(length, length, stepSrcM1);
5562 
5563       // error check
5564       __ bne(failedIdx, minusOne, Exit);
5565 
5566       __ BIND(ProcessScalar);
5567       __ beqz(length, Exit);
5568     }
5569 
5570     // scalar version
5571     {
5572       Register byte0 = soff, byte1 = send, byte2 = doff, byte3 = isURL;
      Register combined32Bits = x29; // t4
5574 
      // encoded:   [byte0[5:0] : byte1[5:0] : byte2[5:0] : byte3[5:0]] =>
      // plain:     [byte0[5:0]+byte1[5:4] : byte1[3:0]+byte2[5:2] : byte2[1:0]+byte3[5:0]]
5577       __ BIND(ScalarLoop);
5578 
5579       // load 4 bytes encoded src data
5580       __ lbu(byte0, Address(src, 0));
5581       __ lbu(byte1, Address(src, 1));
5582       __ lbu(byte2, Address(src, 2));
5583       __ lbu(byte3, Address(src, 3));
5584       __ addi(src, src, 4);
5585 
5586       // get codec index and decode (ie. load from codec by index)
5587       __ add(byte0, codec, byte0);
5588       __ add(byte1, codec, byte1);
5589       __ lb(byte0, Address(byte0, 0));
5590       __ lb(byte1, Address(byte1, 0));
5591       __ add(byte2, codec, byte2);
5592       __ add(byte3, codec, byte3);
5593       __ lb(byte2, Address(byte2, 0));
5594       __ lb(byte3, Address(byte3, 0));
5595       __ slliw(byte0, byte0, 18);
5596       __ slliw(byte1, byte1, 12);
5597       __ orr(byte0, byte0, byte1);
5598       __ orr(byte0, byte0, byte3);
5599       __ slliw(byte2, byte2, 6);
      // For performance, `combined32Bits` is constructed to serve 2 purposes at the same time:
      //  1. the error check below
      //  2. the decode below
5603       __ orr(combined32Bits, byte0, byte2);
5604 
5605       // error check
5606       __ bltz(combined32Bits, Exit);
5607 
5608       // store 3 bytes decoded data
5609       __ sraiw(byte0, combined32Bits, 16);
5610       __ sraiw(byte1, combined32Bits, 8);
5611       __ sb(byte0, Address(dst, 0));
5612       __ sb(byte1, Address(dst, 1));
5613       __ sb(combined32Bits, Address(dst, 2));
5614 
5615       __ sub(length, length, 4);
5616       __ addi(dst, dst, 3);
5617       // loop back
5618       __ bnez(length, ScalarLoop);
5619     }
5620 
5621     __ BIND(Exit);
5622     __ sub(c_rarg0, dst, dstBackup);
5623 
5624     __ leave();
5625     __ ret();
5626 
5627     return (address) start;
5628   }
5629 
5630   void adler32_process_bytes(Register buff, Register s1, Register s2, VectorRegister vtable,
5631     VectorRegister vzero, VectorRegister vbytes, VectorRegister vs1acc, VectorRegister vs2acc,
5632     Register temp0, Register temp1, Register temp2,  Register temp3,
5633     VectorRegister vtemp1, VectorRegister vtemp2, int step, Assembler::LMUL lmul) {
5634 
5635     assert((lmul == Assembler::m4 && step == 64) ||
5636            (lmul == Assembler::m2 && step == 32) ||
5637            (lmul == Assembler::m1 && step == 16),
5638            "LMUL should be aligned with step: m4 and 64, m2 and 32 or m1 and 16");
    // Below is the function for calculating the Adler32 checksum with a 64-, 32- or 16-byte step. LMUL=m4, m2 or m1 is used.
    // The results are in v12, v13, ..., v22, v23. The example below is for the 64-byte step case.
    // We use b1, b2, ..., b64 to denote the 64 bytes loaded in each iteration.
    // In non-vectorized code, we update s1 and s2 as:
    //   s1 <- s1 + b1
    //   s2 <- s2 + s1
    //   s1 <- s1 + b2
    //   s2 <- s2 + s1
5647     //   ...
5648     //   s1 <- s1 + b64
5649     //   s2 <- s2 + s1
5650     // Putting above assignments together, we have:
5651     //   s1_new = s1 + b1 + b2 + ... + b64
5652     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b64) =
5653     //          = s2 + s1 * 64 + (b1 * 64 + b2 * 63 + ... + b64 * 1) =
5654     //          = s2 + s1 * 64 + (b1, b2, ... b64) dot (64, 63, ... 1)
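    //
    // Tiny worked instance (illustrative only), for a hypothetical 4-byte step:
    //   s1_new = s1 + b1 + b2 + b3 + b4
    //   s2_new = s2 + 4*s1 + 4*b1 + 3*b2 + 2*b3 + 1*b4
    // which is the same "s1 * step + dot(bytes, step..1)" shape computed below
    // with vwredsumu/vwmulu.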
5655 
5656     __ mv(temp3, step);
5657     // Load data
5658     __ vsetvli(temp0, temp3, Assembler::e8, lmul);
5659     __ vle8_v(vbytes, buff);
5660     __ addi(buff, buff, step);
5661 
5662     // Upper bound of the reduction sum for s1_new:
5663     // 0xFF * 64 = 0x3FC0, so:
5664     // 1. We need a vector-widening reduction sum
5665     // 2. It is safe to perform sign-extension during vmv.x.s with 16-bit elements
5666     __ vwredsumu_vs(vs1acc, vbytes, vzero);
5667     // Multiplication for s2_new
5668     __ vwmulu_vv(vs2acc, vtable, vbytes);
5669 
5670     // s2 = s2 + s1 * step  (shift by log2(step))
5671     __ slli(temp1, s1, exact_log2(step));
5672     __ add(s2, s2, temp1);
5673 
5674     // Summing up calculated results for s2_new
5675     if (MaxVectorSize > 16) {
5676       __ vsetvli(temp0, temp3, Assembler::e16, lmul);
5677     } else {
5678       // Half of the vector-widening multiplication result is in the successor of the
5679       // vs2acc group when vlen == 16, in which case we need to double the vector
5680       // register group width in order to reduction-sum all of it
5681       Assembler::LMUL lmulx2 = (lmul == Assembler::m1) ? Assembler::m2 :
5682                                (lmul == Assembler::m2) ? Assembler::m4 : Assembler::m8;
5683       __ vsetvli(temp0, temp3, Assembler::e16, lmulx2);
5684     }
5685     // Upper bound of the reduction sum:
5686     // 0xFF * (64 + 63 + ... + 2 + 1) = 0x817E0 max for the whole register group, so:
5687     // 1. We need a vector-widening reduction sum
5688     // 2. It is safe to perform sign-extension during vmv.x.s with 32-bit elements
5689     __ vwredsumu_vs(vtemp1, vs2acc, vzero);
5690 
5691     // Extracting results for:
5692     // s1_new
5693     __ vmv_x_s(temp0, vs1acc);
5694     __ add(s1, s1, temp0);
5695     // s2_new
5696     __ vsetvli(temp0, temp3, Assembler::e32, Assembler::m1);
5697     __ vmv_x_s(temp1, vtemp1);
5698     __ add(s2, s2, temp1);
5699   }
5700 
5701   /***
5702    *  int java.util.zip.Adler32.updateBytes(int adler, byte[] b, int off, int len)
5703    *
5704    *  Arguments:
5705    *
5706    *  Inputs:
5707    *   c_rarg0   - int   adler
5708    *   c_rarg1   - byte* buff (b + off)
5709    *   c_rarg2   - int   len
5710    *
5711    *  Output:
5712    *   c_rarg0   - int adler result
5713    */
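  // For reference, a scalar sketch of the checksum this stub computes (BASE = 65521):
  //   s1 = adler & 0xffff;  s2 = (adler >> 16) & 0xffff;
  //   for (each byte b) { s1 = (s1 + b) % BASE;  s2 = (s2 + s1) % BASE; }
  //   return (s2 << 16) | s1;
  // The vectorized code below keeps s1/s2 in registers and only takes the expensive
  // remainder every NMAX bytes, which yields the same result.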
5714   address generate_updateBytesAdler32() {
5715     __ align(CodeEntryAlignment);
5716     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
5717     address start = __ pc();
5718 
5719     Label L_nmax, L_nmax_loop, L_nmax_loop_entry, L_by16, L_by16_loop,
5720       L_by16_loop_unroll, L_by1_loop, L_do_mod, L_combine, L_by1;
5721 
5722     // Aliases
5723     Register adler  = c_rarg0;
5724     Register s1     = c_rarg0;
5725     Register s2     = c_rarg3;
5726     Register buff   = c_rarg1;
5727     Register len    = c_rarg2;
5728     Register nmax  = c_rarg4;
5729     Register base  = c_rarg5;
5730     Register count = c_rarg6;
5731     Register temp0 = x28; // t3
5732     Register temp1 = x29; // t4
5733     Register temp2 = x30; // t5
5734     Register temp3 = x31; // t6
5735 
5736     VectorRegister vzero = v31;
5737     VectorRegister vbytes = v8; // group: v8, v9, v10, v11
5738     VectorRegister vs1acc = v12; // group: v12, v13, v14, v15
5739     VectorRegister vs2acc = v16; // group: v16, v17, v18, v19, v20, v21, v22, v23
5740     VectorRegister vtable_64 = v24; // group: v24, v25, v26, v27
5741     VectorRegister vtable_32 = v4; // group: v4, v5
5742     VectorRegister vtable_16 = v30;
5743     VectorRegister vtemp1 = v28;
5744     VectorRegister vtemp2 = v29;
5745 
5746     // Max number of bytes we can process before having to take the mod
5747     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
5748     const uint64_t BASE = 0xfff1;
5749     const uint64_t NMAX = 0x15B0;
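    // Worked check of that bound (illustration): for n = 5552,
    //   255 * n * (n + 1) / 2 + (n + 1) * (BASE - 1)
    //     = 3,930,857,640 + 363,832,560 = 4,294,690,200 <= 4,294,967,295 = 2^32 - 1,
    // so s2 cannot overflow 32 bits before the deferred % BASE is taken.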
5750 
5751     // Loop steps
5752     int step_64 = 64;
5753     int step_32 = 32;
5754     int step_16 = 16;
5755     int step_1  = 1;
5756 
5757     __ enter(); // Required for proper stackwalking of RuntimeStub frame
5758     __ mv(temp1, 64);
5759     __ vsetvli(temp0, temp1, Assembler::e8, Assembler::m4);
5760 
5761     // Generating accumulation coefficients for further calculations
5762     // vtable_64:
5763     __ vid_v(vtemp1);
5764     __ vrsub_vx(vtable_64, vtemp1, temp1);
5765     // vtable_64 group now contains { 0x40, 0x3f, 0x3e, ..., 0x3, 0x2, 0x1 }
5766 
5767     // vtable_32:
5768     __ mv(temp1, 32);
5769     __ vsetvli(temp0, temp1, Assembler::e8, Assembler::m2);
5770     __ vid_v(vtemp1);
5771     __ vrsub_vx(vtable_32, vtemp1, temp1);
5772     // vtable_32 group now contains { 0x20, 0x1f, 0x1e, ..., 0x3, 0x2, 0x1 }
5773 
5774     __ vsetivli(temp0, 16, Assembler::e8, Assembler::m1);
5775     // vtable_16:
5776     __ mv(temp1, 16);
5777     __ vid_v(vtemp1);
5778     __ vrsub_vx(vtable_16, vtemp1, temp1);
5779     // vtable_16 now contains { 0x10, 0xf, 0xe, ..., 0x3, 0x2, 0x1 }
5780 
5781     __ vmv_v_i(vzero, 0);
5782 
5783     __ mv(base, BASE);
5784     __ mv(nmax, NMAX);
5785 
5786     // s1 is initialized to the lower 16 bits of adler
5787     // s2 is initialized to the upper 16 bits of adler
5788     __ srliw(s2, adler, 16); // s2 = ((adler >> 16) & 0xffff)
5789     __ zero_extend(s1, adler, 16); // s1 = (adler & 0xffff)
5790 
5791     // The pipelined loop needs at least 16 elements for one iteration
5792     // It checks this itself, but it is more efficient to skip straight to the cleanup loop for short inputs
5793     __ mv(temp0, step_16);
5794     __ bgeu(len, temp0, L_nmax);
5795     __ beqz(len, L_combine);
5796 
5797     // Jumping to L_by1_loop
5798     __ sub(len, len, step_1);
5799     __ j(L_by1_loop);
5800 
5801   __ bind(L_nmax);
5802     __ sub(len, len, nmax);
5803     __ sub(count, nmax, 16);
5804     __ bltz(len, L_by16);
5805 
5806   // Adjust count so the L_nmax loop below runs in whole 64-byte steps
5807   __ bind(L_nmax_loop_entry);
5808     __ sub(count, count, 32);
5809 
5810   __ bind(L_nmax_loop);
5811     adler32_process_bytes(buff, s1, s2, vtable_64, vzero,
5812       vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
5813       vtemp1, vtemp2, step_64, Assembler::m4);
5814     __ sub(count, count, step_64);
5815     __ bgtz(count, L_nmax_loop);
5816 
5817     // 48 bytes of this NMAX block remain; handle them as one 32-byte and one 16-byte step
5818     adler32_process_bytes(buff, s1, s2, vtable_32, vzero,
5819       vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
5820       vtemp1, vtemp2, step_32, Assembler::m2);
5821     adler32_process_bytes(buff, s1, s2, vtable_16, vzero,
5822       vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
5823       vtemp1, vtemp2, step_16, Assembler::m1);
5824 
5825     // s1 = s1 % BASE
5826     __ remuw(s1, s1, base);
5827     // s2 = s2 % BASE
5828     __ remuw(s2, s2, base);
5829 
5830     __ sub(len, len, nmax);
5831     __ sub(count, nmax, 16);
5832     __ bgez(len, L_nmax_loop_entry);
5833 
5834   __ bind(L_by16);
5835     __ add(len, len, count);
5836     __ bltz(len, L_by1);
5837     // Trying to unroll
5838     __ mv(temp3, step_64);
5839     __ blt(len, temp3, L_by16_loop);
5840 
5841   __ bind(L_by16_loop_unroll);
5842     adler32_process_bytes(buff, s1, s2, vtable_64, vzero,
5843       vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
5844       vtemp1, vtemp2, step_64, Assembler::m4);
5845     __ sub(len, len, step_64);
5846     // temp3 still holds 64 at this point
5847     __ bge(len, temp3, L_by16_loop_unroll);
5848 
5849   __ bind(L_by16_loop);
5850     adler32_process_bytes(buff, s1, s2, vtable_16, vzero,
5851       vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
5852       vtemp1, vtemp2, step_16, Assembler::m1);
5853     __ sub(len, len, step_16);
5854     __ bgez(len, L_by16_loop);
5855 
5856   __ bind(L_by1);
5857     __ add(len, len, 15);
5858     __ bltz(len, L_do_mod);
5859 
5860   __ bind(L_by1_loop);
5861     __ lbu(temp0, Address(buff, 0));
5862     __ addi(buff, buff, step_1);
5863     __ add(s1, temp0, s1);
5864     __ add(s2, s2, s1);
5865     __ sub(len, len, step_1);
5866     __ bgez(len, L_by1_loop);
5867 
5868   __ bind(L_do_mod);
5869     // s1 = s1 % BASE
5870     __ remuw(s1, s1, base);
5871     // s2 = s2 % BASE
5872     __ remuw(s2, s2, base);
5873 
5874     // Combine lower bits and higher bits
5875     // adler = s1 | (s2 << 16)
5876   __ bind(L_combine);
5877     __ slli(s2, s2, 16);
5878     __ orr(s1, s1, s2);
5879 
5880     __ leave(); // Required for proper stackwalking of RuntimeStub frame
5881     __ ret();
5882 
5883     return start;
5884   }
5885 
5886 #endif // COMPILER2_OR_JVMCI
5887 
5888 #ifdef COMPILER2
5889 
5890 static const int64_t right_2_bits = right_n_bits(2);
5891 static const int64_t right_3_bits = right_n_bits(3);
5892 
5893   // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
5894   // are represented as long[5], with BITS_PER_LIMB = 26.
5895   // Pack five 26-bit limbs into three 64-bit registers.
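  // In other words (illustration of the packing below): with limbs L0..L4 the value is
  //   V = L0 + (L1 << 26) + (L2 << 52) + (L3 << 78) + (L4 << 104)
  // and the code assembles dest0 = V[63:0], dest1 = V[127:64] and, when requested,
  // dest2 = V[129:128].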
5896   void poly1305_pack_26(Register dest0, Register dest1, Register dest2, Register src, Register tmp1, Register tmp2) {
5897     assert_different_registers(dest0, dest1, dest2, src, tmp1, tmp2);
5898 
5899     // The goal is to have a 128-bit value in dest2:dest1:dest0
5900     __ ld(dest0, Address(src, 0));    // 26 bits in dest0
5901 
5902     __ ld(tmp1, Address(src, sizeof(jlong)));
5903     __ slli(tmp1, tmp1, 26);
5904     __ add(dest0, dest0, tmp1);       // 52 bits in dest0
5905 
5906     __ ld(tmp2, Address(src, 2 * sizeof(jlong)));
5907     __ slli(tmp1, tmp2, 52);
5908     __ add(dest0, dest0, tmp1);       // dest0 is full
5909 
5910     __ srli(dest1, tmp2, 12);         // 14 bits in dest1
5911 
5912     __ ld(tmp1, Address(src, 3 * sizeof(jlong)));
5913     __ slli(tmp1, tmp1, 14);
5914     __ add(dest1, dest1, tmp1);       // 40 bits in dest1
5915 
5916     __ ld(tmp1, Address(src, 4 * sizeof(jlong)));
5917     __ slli(tmp2, tmp1, 40);
5918     __ add(dest1, dest1, tmp2);       // dest1 is full
5919 
5920     if (dest2->is_valid()) {
5921       __ srli(tmp1, tmp1, 24);
5922       __ mv(dest2, tmp1);               // 2 bits in dest2
5923     } else {
5924 #ifdef ASSERT
5925       Label OK;
5926       __ srli(tmp1, tmp1, 24);
5927       __ beq(zr, tmp1, OK);           // 2 bits
5928       __ stop("high bits of Poly1305 integer should be zero");
5929       __ should_not_reach_here();
5930       __ bind(OK);
5931 #endif
5932     }
5933   }
5934 
5935   // As above, but return only a 128-bit integer, packed into two
5936   // 64-bit registers.
5937   void poly1305_pack_26(Register dest0, Register dest1, Register src, Register tmp1, Register tmp2) {
5938     poly1305_pack_26(dest0, dest1, noreg, src, tmp1, tmp2);
5939   }
5940 
5941   // U_2:U_1:U_0 += (U_2 >> 2) * 5
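  // Rationale (for reference): the accumulator is U_2 * 2^128 + U_1 * 2^64 + U_0, so the bits
  // at weight 2^130 and above are (U_2 >> 2). Since 2^130 == 5 (mod 2^130 - 5), those bits
  // fold back in as (U_2 >> 2) * 5; the code below adds them as (U_2 >> 2) * 1 plus
  // (U_2 >> 2) * 4, reusing the carry-add helpers.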
5942   void poly1305_reduce(Register U_2, Register U_1, Register U_0, Register tmp1, Register tmp2) {
5943     assert_different_registers(U_2, U_1, U_0, tmp1, tmp2);
5944 
5945     // First, U_2:U_1:U_0 += (U_2 >> 2)
5946     __ srli(tmp1, U_2, 2);
5947     __ cad(U_0, U_0, tmp1, tmp2); // Add tmp1 to U_0 with carry output to tmp2
5948     __ andi(U_2, U_2, right_2_bits); // Clear U_2 except for the lowest two bits
5949     __ cad(U_1, U_1, tmp2, tmp2); // Add carry to U_1 with carry output to tmp2
5950     __ add(U_2, U_2, tmp2);
5951 
5952     // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2
5953     __ slli(tmp1, tmp1, 2);
5954     __ cad(U_0, U_0, tmp1, tmp2); // Add tmp1 to U_0 with carry output to tmp2
5955     __ cad(U_1, U_1, tmp2, tmp2); // Add carry to U_1 with carry output to tmp2
5956     __ add(U_2, U_2, tmp2);
5957   }
5958 
5959   // Poly1305, RFC 7539
5960   // void com.sun.crypto.provider.Poly1305.processMultipleBlocks(byte[] input, int offset, int length, long[] aLimbs, long[] rLimbs)
5961 
5962   // Arguments:
5963   //    c_rarg0:   input_start -- where the input is stored
5964   //    c_rarg1:   length
5965   //    c_rarg2:   acc_start -- where the output will be stored
5966   //    c_rarg3:   r_start -- where the randomly generated 128-bit key is stored
5967 
5968   // See https://loup-vaillant.fr/tutorials/poly1305-design for a
5969   // description of the tricks used to simplify and accelerate this
5970   // computation.
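  // A rough per-block sketch of what the loop below computes (illustration only, with
  // p = 2^130 - 5, following RFC 7539):
  //   for each 16-byte block m:
  //     acc = (acc + m + 2^128) * r   (mod p; only partially reduced inside the loop)
  // The final acc is written back to aLimbs as five 26-bit limbs.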
5971 
5972   address generate_poly1305_processBlocks() {
5973     __ align(CodeEntryAlignment);
5974     StubCodeMark mark(this, "StubRoutines", "poly1305_processBlocks");
5975     address start = __ pc();
5976     __ enter();
5977     Label here;
5978 
5979     RegSet saved_regs = RegSet::range(x18, x21);
5980     RegSetIterator<Register> regs = (RegSet::range(x14, x31) - RegSet::range(x22, x27)).begin();
5981     __ push_reg(saved_regs, sp);
5982 
5983     // Arguments
5984     const Register input_start = c_rarg0, length = c_rarg1, acc_start = c_rarg2, r_start = c_rarg3;
5985 
5986     // R_n is the 128-bit randomly-generated key, packed into two
5987     // registers. The caller passes this key to us as long[5], with
5988     // BITS_PER_LIMB = 26.
5989     const Register R_0 = *regs, R_1 = *++regs;
5990     poly1305_pack_26(R_0, R_1, r_start, t1, t2);
5991 
5992     // RR_n is (R_n >> 2) * 5
5993     const Register RR_0 = *++regs, RR_1 = *++regs;
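    // The shadd below is a shift-and-add multiply: with both operands equal to t1 it
    // yields t1 + (t1 << 2) = t1 * 5, i.e. (R_n >> 2) * 5.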
5994     __ srli(t1, R_0, 2);
5995     __ shadd(RR_0, t1, t1, t2, 2);
5996     __ srli(t1, R_1, 2);
5997     __ shadd(RR_1, t1, t1, t2, 2);
5998 
5999     // U_n is the current checksum
6000     const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
6001     poly1305_pack_26(U_0, U_1, U_2, acc_start, t1, t2);
6002 
6003     static constexpr int BLOCK_LENGTH = 16;
6004     Label DONE, LOOP;
6005 
6006     __ mv(t1, BLOCK_LENGTH);
6007     __ blt(length, t1, DONE); {
6008       __ bind(LOOP);
6009 
6010       // S_n is to be the sum of U_n and the next block of data
6011       const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
6012       __ ld(S_0, Address(input_start, 0));
6013       __ ld(S_1, Address(input_start, wordSize));
6014 
6015       __ cad(S_0, S_0, U_0, t1); // Add U_0 to S_0 with carry output to t1
6016       __ cadc(S_1, S_1, U_1, t1); // Add U_1 with carry to S_1 with carry output to t1
6017       __ add(S_2, U_2, t1);
6018 
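      // RFC 7539 pads every full 16-byte block with a trailing 0x01 byte, i.e. adds 2^128;
      // that bit lands in S_2, hence the increment below.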
6019       __ addi(S_2, S_2, 1);
6020 
6021       const Register U_0HI = *++regs, U_1HI = *++regs;
6022 
6023       // NB: this logic depends on some of the special properties of
6024       // Poly1305 keys. In particular, because we know that the top
6025       // four bits of R_0 and R_1 are zero, we can add together
6026       // partial products without any risk of needing to propagate a
6027       // carry out.
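      // For reference, the multiplications below distribute the product of S (= S_2:S_1:S_0)
      // and R (= R_1:R_0) over three 64-bit lanes, folding every term of weight 2^130 or
      // above back in via RR_n = (R_n >> 2) * 5 (the key clamping also zeroes the low two
      // bits of R_1, which is what makes this exact):
      //   U_0:U_0HI = S_0 * R_0 + S_1 * RR_1 + S_2 * RR_0     (weight 2^0)
      //   U_1:U_1HI = S_0 * R_1 + S_1 * R_0  + S_2 * RR_1     (weight 2^64)
      //   U_2       = S_2 * (R_0 & 3)                         (weight 2^128)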
6028       __ wide_mul(U_0, U_0HI, S_0, R_0);
6029       __ wide_madd(U_0, U_0HI, S_1, RR_1, t1, t2);
6030       __ wide_madd(U_0, U_0HI, S_2, RR_0, t1, t2);
6031 
6032       __ wide_mul(U_1, U_1HI, S_0, R_1);
6033       __ wide_madd(U_1, U_1HI, S_1, R_0, t1, t2);
6034       __ wide_madd(U_1, U_1HI, S_2, RR_1, t1, t2);
6035 
6036       __ andi(U_2, R_0, right_2_bits);
6037       __ mul(U_2, S_2, U_2);
6038 
6039       // Partial reduction mod 2**130 - 5
6040       __ cad(U_1, U_1, U_0HI, t1); // Add U_0HI to U_1 with carry output to t1
6041       __ adc(U_2, U_2, U_1HI, t1);
6042       // Sum is now in U_2:U_1:U_0.
6043 
6044       // U_2:U_1:U_0 += (U_2 >> 2) * 5
6045       poly1305_reduce(U_2, U_1, U_0, t1, t2);
6046 
6047       __ sub(length, length, BLOCK_LENGTH);
6048       __ addi(input_start, input_start, BLOCK_LENGTH);
6049       __ mv(t1, BLOCK_LENGTH);
6050       __ bge(length, t1, LOOP);
6051     }
6052 
6053     // Further reduce modulo 2^130 - 5
6054     poly1305_reduce(U_2, U_1, U_0, t1, t2);
6055 
6056     // Unpack the sum into five 26-bit limbs and write to memory.
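    // i.e. (illustration) limb[i] = (U >> (26 * i)) & ((1 << 26) - 1) for i = 0..4;
    // the value is only partially reduced, so the top limb may be slightly wider.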
6057     // The first 26 bits are the first limb
6058     __ slli(t1, U_0, 38); // Take lowest 26 bits
6059     __ srli(t1, t1, 38);
6060     __ sd(t1, Address(acc_start)); // First 26-bit limb
6061 
6062     // Bits 27-52 of U_0 are the second limb
6063     __ slli(t1, U_0, 12); // Take bits 27-52
6064     __ srli(t1, t1, 38);
6065     __ sd(t1, Address(acc_start, sizeof (jlong))); // Second 26-bit limb
6066 
6067     // Combine bits 53-64 of U_0 and bits 1-14 of U_1 in one register
6068     __ srli(t1, U_0, 52);
6069     __ slli(t2, U_1, 50);
6070     __ srli(t2, t2, 38);
6071     __ add(t1, t1, t2);
6072     __ sd(t1, Address(acc_start, 2 * sizeof (jlong))); // Third 26-bit limb
6073 
6074     // Store bits 15-40 of U_1
6075     __ slli(t1, U_1, 24); // Already used up 14 bits
6076     __ srli(t1, t1, 38); // Clear all other bits from t1
6077     __ sd(t1, Address(acc_start, 3 * sizeof (jlong))); // Fourth 26-bit limb
6078 
6079     // Store bits 41-64 of U_1 and the three lowest bits of U_2 in one register
6080     __ srli(t1, U_1, 40);
6081     __ andi(t2, U_2, right_3_bits);
6082     __ slli(t2, t2, 24);
6083     __ add(t1, t1, t2);
6084     __ sd(t1, Address(acc_start, 4 * sizeof (jlong))); // Fifth 26-bit limb
6085 
6086     __ bind(DONE);
6087     __ pop_reg(saved_regs, sp);
6088     __ leave(); // Required for proper stackwalking
6089     __ ret();
6090 
6091     return start;
6092   }
6093 
6094 #endif // COMPILER2
6095 
6096   /**
6097    *  Arguments:
6098    *
6099    * Inputs:
6100    *   c_rarg0   - int crc
6101    *   c_rarg1   - byte* buf
6102    *   c_rarg2   - int length
6103    *
6104    * Output:
6105    *   c_rarg0   - int crc result
6106    */
6107   address generate_updateBytesCRC32() {
6108     assert(UseCRC32Intrinsics, "what are we doing here?");
6109 
6110     __ align(CodeEntryAlignment);
6111     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
6112 
6113     address start = __ pc();
6114 
6115     const Register crc    = c_rarg0;  // crc
6116     const Register buf    = c_rarg1;  // source java byte array address
6117     const Register len    = c_rarg2;  // length
6118     const Register table0 = c_rarg3;  // crc_table address
6119     const Register table1 = c_rarg4;
6120     const Register table2 = c_rarg5;
6121     const Register table3 = c_rarg6;
6122 
6123     const Register tmp1 = c_rarg7;
6124     const Register tmp2 = t2;
6125     const Register tmp3 = x28; // t3
6126     const Register tmp4 = x29; // t4
6127     const Register tmp5 = x30; // t5
6128     const Register tmp6 = x31; // t6
6129 
6130     BLOCK_COMMENT("Entry:");
6131     __ enter(); // required for proper stackwalking of RuntimeStub frame
6132 
6133     __ kernel_crc32(crc, buf, len, table0, table1, table2,
6134                     table3, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
6135 
6136     __ leave(); // required for proper stackwalking of RuntimeStub frame
6137     __ ret();
6138 
6139     return start;
6140   }
6141 
6142   // exception handler for upcall stubs
6143   address generate_upcall_stub_exception_handler() {
6144     StubCodeMark mark(this, "StubRoutines", "upcall stub exception handler");
6145     address start = __ pc();
6146 
6147     // Native caller has no idea how to handle exceptions,
6148     // so we just crash here. Up to callee to catch exceptions.
6149     __ verify_oop(x10); // the exception oop is returned in x10 (a0)
6150     __ rt_call(CAST_FROM_FN_PTR(address, UpcallLinker::handle_uncaught_exception));
6151     __ should_not_reach_here();
6152 
6153     return start;
6154   }
6155 
6156 #undef __
6157 
6158   // Initialization
6159   void generate_initial_stubs() {
6160     // Generates the initial stubs and initializes the entry points
6161 
6162     // entry points that exist in all platforms. Note: This is code
6163     // that could be shared among different platforms - however the
6164     // benefit seems to be smaller than the disadvantage of having a
6165     // much more complicated generator structure. See also comment in
6166     // stubRoutines.hpp.
6167 
6168     StubRoutines::_forward_exception_entry = generate_forward_exception();
6169 
6170     if (UnsafeMemoryAccess::_table == nullptr) {
6171       UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
6172     }
6173 
6174     StubRoutines::_call_stub_entry =
6175       generate_call_stub(StubRoutines::_call_stub_return_address);
6176 
6177     // is referenced by megamorphic call
6178     StubRoutines::_catch_exception_entry = generate_catch_exception();
6179 
6180     if (UseCRC32Intrinsics) {
6181       // set the table address before generating the stubs that use it
6182       StubRoutines::_crc_table_adr = (address)StubRoutines::riscv::_crc_table;
6183       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
6184     }
6185   }
6186 
6187   void generate_continuation_stubs() {
6188     // Continuation stubs:
6189     StubRoutines::_cont_thaw             = generate_cont_thaw();
6190     StubRoutines::_cont_returnBarrier    = generate_cont_returnBarrier();
6191     StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
6192     StubRoutines::_cont_preempt_stub     = generate_cont_preempt_stub();
6193   }
6194 
6195   void generate_final_stubs() {
6196     // support for verify_oop (must happen after universe_init)
6197     if (VerifyOops) {
6198       StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
6199     }
6200 
6201     // arraycopy stubs used by compilers
6202     generate_arraycopy_stubs();
6203 
6204     BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
6205     if (bs_nm != nullptr) {
6206       StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
6207     }
6208 
6209 #ifdef COMPILER2
6210     if (UseSecondarySupersTable) {
6211       StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
6212       if (!InlineSecondarySupersTest) {
6213         for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
6214           StubRoutines::_lookup_secondary_supers_table_stubs[slot]
6215             = generate_lookup_secondary_supers_table_stub(slot);
6216         }
6217       }
6218     }
6219 #endif // COMPILER2
6220 
6221     StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
6222 
6223     StubRoutines::riscv::set_completed();
6224   }
6225 
6226   void generate_compiler_stubs() {
6227 #ifdef COMPILER2
6228     if (UseMulAddIntrinsic) {
6229       StubRoutines::_mulAdd = generate_mulAdd();
6230     }
6231 
6232     if (UseMultiplyToLenIntrinsic) {
6233       StubRoutines::_multiplyToLen = generate_multiplyToLen();
6234     }
6235 
6236     if (UseSquareToLenIntrinsic) {
6237       StubRoutines::_squareToLen = generate_squareToLen();
6238     }
6239 
6240     if (UseMontgomeryMultiplyIntrinsic) {
6241       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
6242       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
6243       StubRoutines::_montgomeryMultiply = g.generate_multiply();
6244     }
6245 
6246     if (UseMontgomerySquareIntrinsic) {
6247       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
6248       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
6249       StubRoutines::_montgomerySquare = g.generate_square();
6250     }
6251 
6252     if (UsePoly1305Intrinsics) {
6253       StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
6254     }
6255 
6256     if (UseRVVForBigIntegerShiftIntrinsics) {
6257       StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift();
6258       StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
6259     }
6260 
6261     if (UseSHA256Intrinsics) {
6262       Sha2Generator sha2(_masm, this);
6263       StubRoutines::_sha256_implCompress   = sha2.generate_sha256_implCompress(false);
6264       StubRoutines::_sha256_implCompressMB = sha2.generate_sha256_implCompress(true);
6265     }
6266 
6267     if (UseSHA512Intrinsics) {
6268       Sha2Generator sha2(_masm, this);
6269       StubRoutines::_sha512_implCompress   = sha2.generate_sha512_implCompress(false);
6270       StubRoutines::_sha512_implCompressMB = sha2.generate_sha512_implCompress(true);
6271     }
6272 
6273     if (UseMD5Intrinsics) {
6274       StubRoutines::_md5_implCompress   = generate_md5_implCompress(false, "md5_implCompress");
6275       StubRoutines::_md5_implCompressMB = generate_md5_implCompress(true,  "md5_implCompressMB");
6276     }
6277 
6278     if (UseChaCha20Intrinsics) {
6279       StubRoutines::_chacha20Block = generate_chacha20Block();
6280     }
6281 
6282     if (UseSHA1Intrinsics) {
6283       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false, "sha1_implCompress");
6284       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true, "sha1_implCompressMB");
6285     }
6286 
6287     if (UseBASE64Intrinsics) {
6288       StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
6289       StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
6290     }
6291 
6292     if (UseAdler32Intrinsics) {
6293       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
6294     }
6295 
6296     generate_compare_long_strings();
6297 
6298     generate_string_indexof_stubs();
6299 
6300 #endif // COMPILER2
6301   }
6302 
6303  public:
6304   StubGenerator(CodeBuffer* code, StubsKind kind) : StubCodeGenerator(code) {
6305     switch(kind) {
6306     case Initial_stubs:
6307       generate_initial_stubs();
6308       break;
6309     case Continuation_stubs:
6310       generate_continuation_stubs();
6311       break;
6312     case Compiler_stubs:
6313       generate_compiler_stubs();
6314       break;
6315     case Final_stubs:
6316       generate_final_stubs();
6317       break;
6318     default:
6319       fatal("unexpected stubs kind: %d", kind);
6320       break;
6321     };
6322   }
6323 }; // end class declaration
6324 
6325 void StubGenerator_generate(CodeBuffer* code, StubCodeGenerator::StubsKind kind) {
6326   StubGenerator g(code, kind);
6327 }