New src/hotspot/cpu/ppc/stubGenerator

   1 /*
   2  * Copyright (c) 1997, 2026, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2012, 2026 SAP SE. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "asm/macroAssembler.inline.hpp"
  27 #include "compiler/oopMap.hpp"
  28 #include "gc/shared/barrierSet.hpp"
  29 #include "gc/shared/barrierSetAssembler.hpp"
  30 #include "gc/shared/barrierSetNMethod.hpp"
  31 #include "interpreter/interpreter.hpp"
  32 #include "nativeInst_ppc.hpp"
  33 #include "oops/instanceOop.hpp"
  34 #include "oops/method.hpp"
  35 #include "oops/objArrayKlass.hpp"
  36 #include "oops/oop.inline.hpp"
  37 #include "prims/methodHandles.hpp"
  38 #include "prims/upcallLinker.hpp"
  39 #include "runtime/continuation.hpp"
  40 #include "runtime/continuationEntry.inline.hpp"
  41 #include "runtime/frame.inline.hpp"
  42 #include "runtime/handles.inline.hpp"
  43 #include "runtime/javaThread.hpp"
  44 #include "runtime/sharedRuntime.hpp"
  45 #include "runtime/stubCodeGenerator.hpp"
  46 #include "runtime/stubRoutines.hpp"
  47 #include "runtime/vm_version.hpp"
  48 #include "utilities/align.hpp"
  49 #include "utilities/powerOfTwo.hpp"
  50 #if INCLUDE_ZGC
  51 #include "gc/z/zBarrierSetAssembler.hpp"
  52 #endif
  53 
  54 // Declaration and definition of StubGenerator (no .hpp file).
  55 // For a more detailed description of the stub routine structure
  56 // see the comment in stubRoutines.hpp.
  57 
  58 #define __ _masm->   // shorthand: `__ insn(...)` calls insn() on the current _masm
  59 
  60 #ifdef PRODUCT
  61 #define BLOCK_COMMENT(str) // nothing (block comments are compiled out of PRODUCT builds)
  62 #else
  63 #define BLOCK_COMMENT(str) __ block_comment(str)
  64 #endif
  65 
  66 #if defined(ABI_ELFv2)
  67 #define STUB_ENTRY(name) StubRoutines::name   // ELFv2: use the StubRoutines value directly
  68 #else
  69 #define STUB_ENTRY(name) ((FunctionDescriptor*)StubRoutines::name)->entry()   // otherwise: value is a FunctionDescriptor*, take its entry()
  70 #endif
  71 
  72 class StubGenerator: public StubCodeGenerator {
  73  private:
  74 
  75   // Call stubs are used to call Java from C
  76   //
  77   // Arguments:
  78   //
  79   //   R3  - call wrapper address     : address
  80   //   R4  - result                   : intptr_t*
  81   //   R5  - result type              : BasicType
  82   //   R6  - method                   : Method
  83   //   R7  - frame mgr entry point    : address
  84   //   R8  - parameter block          : intptr_t*
  85   //   R9  - parameter count in words : int
  86   //   R10 - thread                   : Thread*
  87   //
  88   address generate_call_stub(address& return_address) {
  89     // Set up a new C frame, copy the Java arguments, call the template interpreter or native
  90     // entry, and process the result. Returns the stub entry; sets 'return_address' (see below).
  91 
  92     StubId stub_id = StubId::stubgen_call_stub_id;
  93     StubCodeMark mark(this, stub_id);
  94 
  95     address start = __ function_entry();
  96 
  97     int save_nonvolatile_registers_size = __ save_nonvolatile_registers_size(true, SuperwordUseVSX);
  98 
  99     // some sanity checks: every component of the frame size must keep the 16-byte stack alignment
 100     STATIC_ASSERT(StackAlignmentInBytes == 16);
 101     assert((sizeof(frame::native_abi_minframe) % 16) == 0,    "unaligned");
 102     assert((sizeof(frame::native_abi_reg_args) % 16) == 0,    "unaligned");
 103     assert((save_nonvolatile_registers_size % 16) == 0,       "unaligned");
 104     assert((sizeof(frame::parent_ijava_frame_abi) % 16) == 0, "unaligned");
 105     assert((sizeof(frame::entry_frame_locals) % 16) == 0,     "unaligned");
 106 
 107     Register r_arg_call_wrapper_addr        = R3;
 108     Register r_arg_result_addr              = R4;
 109     Register r_arg_result_type              = R5;
 110     Register r_arg_method                   = R6;
 111     Register r_arg_entry                    = R7;
 112     Register r_arg_argument_addr            = R8;
 113     Register r_arg_argument_count           = R9;
 114     Register r_arg_thread                   = R10;
 115 
 116     Register r_entryframe_fp                = R2; // volatile
 117     Register r_argument_size                = R11_scratch1; // volatile
 118     Register r_top_of_arguments_addr        = R21_tmp1;
 119 
 120     {
 121       // Stack on entry to call_stub:
 122       //
 123       //      F1      [C_FRAME]
 124       //              ...
 125       Register r_frame_size  = R12_scratch2; // volatile
 126       Label arguments_copied;
 127 
 128       // Save LR/CR to caller's C_FRAME.
 129       __ save_LR_CR(R0);
 130 
 131       // Keep copy of our frame pointer (caller's SP).
 132       __ mr(r_entryframe_fp, R1_SP);
 133 
 134       // calculate frame size
 135       STATIC_ASSERT(Interpreter::logStackElementSize == 3);
 136 
 137       // space for arguments aligned up: ((arg_count + 1) * 8) &~ 15
 138       __ addi(r_frame_size, r_arg_argument_count, 1);
 139       __ rldicr(r_frame_size, r_frame_size, 3, 63 - 4); // rotate left by 3 (* 8), keep bits 0..59 (&~ 15)
 140 
 141       // this is the pure space for arguments (excluding alignment padding)
 142       __ sldi(r_argument_size, r_arg_argument_count, 3);
 143 
 144       __ addi(r_frame_size, r_frame_size,
 145               save_nonvolatile_registers_size + frame::entry_frame_locals_size + frame::top_ijava_frame_abi_size);
 146 
 147       // push ENTRY_FRAME
 148       __ push_frame(r_frame_size, R0);
 149 
 150       // Save non-volatile registers to ENTRY_FRAME (addressed relative to the saved frame pointer).
 151       __ save_nonvolatile_registers(r_entryframe_fp, -(frame::entry_frame_locals_size + save_nonvolatile_registers_size),
 152                                     true, SuperwordUseVSX);
 153 
 154       BLOCK_COMMENT("Push ENTRY_FRAME including arguments");
 155       // Push ENTRY_FRAME including arguments:
 156       //
 157       //      F0      [TOP_IJAVA_FRAME_ABI]
 158       //              alignment (optional)
 159       //              [outgoing Java arguments]
 160       //              [non-volatiles]
 161       //              [ENTRY_FRAME_LOCALS]
 162       //      F1      [C_FRAME]
 163       //              ...
 164 
 165       // initialize call_stub locals (step 1)
 166       __ std(r_arg_call_wrapper_addr, _entry_frame_locals_neg(call_wrapper_address), r_entryframe_fp);
 167       __ std(r_arg_result_addr, _entry_frame_locals_neg(result_address), r_entryframe_fp);
 168       __ std(r_arg_result_type, _entry_frame_locals_neg(result_type), r_entryframe_fp);
 169       // we will save arguments_tos_address later
 170 
 171       BLOCK_COMMENT("Copy Java arguments");
 172       // copy Java arguments
 173 
 174       // Calculate top_of_arguments_addr which will be R17_tos (not prepushed) later.
 175       __ addi(r_top_of_arguments_addr, r_entryframe_fp,
 176               -(save_nonvolatile_registers_size + frame::entry_frame_locals_size));
 177       __ sub(r_top_of_arguments_addr, r_top_of_arguments_addr, r_argument_size);
 178 
 179       // any arguments to copy?
 180       __ cmpdi(CR0, r_arg_argument_count, 0);
 181       __ beq(CR0, arguments_copied);
 182 
 183       // prepare loop and copy arguments in reverse order
 184       {
 185         Register r_argument_addr     = R22_tmp2;
 186         Register r_argumentcopy_addr = R23_tmp3;
 187         // init CTR with arg_argument_count
 188         __ mtctr(r_arg_argument_count);
 189 
 190         // let r_argumentcopy_addr point to the lowest outgoing Java argument slot (top of arguments)
 191         __ mr(r_argumentcopy_addr, r_top_of_arguments_addr);
 192 
 193         // let r_argument_addr point to last incoming java argument
 194         __ add(r_argument_addr, r_arg_argument_addr, r_argument_size);
 195         __ addi(r_argument_addr, r_argument_addr, -BytesPerWord);
 196 
 197         // now loop while CTR > 0 and copy arguments
 198         {
 199           Label next_argument;
 200           __ bind(next_argument);
 201 
 202           __ ld(R0, 0, r_argument_addr);
 203           // argument_addr--;
 204           __ addi(r_argument_addr, r_argument_addr, -BytesPerWord);
 205           __ std(R0, 0, r_argumentcopy_addr);
 206           // argumentcopy_addr++;
 207           __ addi(r_argumentcopy_addr, r_argumentcopy_addr, BytesPerWord);
 208 
 209           __ bdnz(next_argument);
 210         }
 211       }
 212 
 213       // Arguments copied, continue.
 214       __ bind(arguments_copied);
 215     }
 216 
 217     {
 218       BLOCK_COMMENT("Call template interpreter or native entry.");
 219       assert_different_registers(r_arg_entry, r_top_of_arguments_addr, r_arg_method, r_arg_thread);
 220 
 221       // Register state on entry to template interpreter / native entry:
 222       //
 223       //   tos         -  intptr_t*    sender tos (prepushed) Lesp = (SP) + copied_arguments_offset - 8
 224       //   R19_method  -  Method
 225       //   R16_thread  -  JavaThread*
 226 
 227       // Tos must point to last argument - element_size.
 228       const Register tos = R15_esp;
 229 
 230       __ addi(tos, r_top_of_arguments_addr, -Interpreter::stackElementSize);
 231 
 232       // initialize call_stub locals (step 2)
 233       // now save tos as arguments_tos_address
 234       __ std(tos, _entry_frame_locals_neg(arguments_tos_address), r_entryframe_fp);
 235 
 236       // load argument registers for call
 237       __ mr(R19_method, r_arg_method);
 238       __ mr(R16_thread, r_arg_thread);
 239       assert(tos != r_arg_method, "trashed r_arg_method");
 240       assert(tos != r_arg_thread && R19_method != r_arg_thread, "trashed r_arg_thread");
 241 
 242       // Load the dispatch table address for TosState 0 into R25_templateTableBase.
 243       __ load_const_optimized(R25_templateTableBase, (address)Interpreter::dispatch_table((TosState)0), R0);
 244       // Stack on entry to template interpreter / native entry:
 245       //
 246       //      F0      [TOP_IJAVA_FRAME_ABI]
 247       //              alignment (optional)
 248       //              [outgoing Java arguments]
 249       //              [non-volatiles]
 250       //              [ENTRY_FRAME_LOCALS]
 251       //      F1      [C_FRAME]
 252       //              ...
 253       //
 254 
 255       // global toc register
 256       __ load_const_optimized(R29_TOC, MacroAssembler::global_toc(), R0);
 257       // Remember the senderSP so the interpreter can pop c2i arguments off of the stack
 258       // when called via a c2i.
 259 
 260       // Pass initial_caller_sp to framemanager.
 261       __ mr(R21_sender_SP, R1_SP);
 262 
 263       // Do a light-weight C-call here, r_arg_entry holds the address
 264       // of the interpreter entry point (template interpreter or native entry)
 265       // and save runtime-value of LR in return_address.
 266       assert(r_arg_entry != tos && r_arg_entry != R19_method && r_arg_entry != R16_thread,
 267              "trashed r_arg_entry");
 268       return_address = __ call_stub(r_arg_entry);
 269     }
 270 
 271     {
 272       BLOCK_COMMENT("Returned from template interpreter or native entry.");
 273       // Now pop frame, process result, and return to caller.
 274 
 275       // Stack on exit from template interpreter / native entry:
 276       //
 277       //      F0      [ABI]
 278       //              ...
 279       //              [non-volatiles]
 280       //              [ENTRY_FRAME_LOCALS]
 281       //      F1      [C_FRAME]
 282       //              ...
 283       //
 284       // Just pop the topmost frame ...
 285       //
 286 
 287       Label ret_is_object;
 288       Label ret_is_long;
 289       Label ret_is_float;
 290       Label ret_is_double;
 291 
 292       Register r_lr = R11_scratch1;
 293       Register r_cr = R12_scratch2;
 294 
 295       // Reload some volatile registers which we've spilled before the call
 296       // to template interpreter / native entry.
 297       // Access all locals via frame pointer, because we know nothing about
 298       // the topmost frame's size.
 299       __ ld(r_entryframe_fp, _abi0(callers_sp), R1_SP); // restore after call
 300       assert_different_registers(r_entryframe_fp, R3_RET, r_arg_result_addr, r_arg_result_type, r_cr, r_lr);
 301       __ ld(r_arg_result_addr, _entry_frame_locals_neg(result_address), r_entryframe_fp);
 302       __ ld(r_arg_result_type, _entry_frame_locals_neg(result_type), r_entryframe_fp);
 303       __ ld(r_cr, _abi0(cr), r_entryframe_fp);
 304       __ ld(r_lr, _abi0(lr), r_entryframe_fp);
 305       __ mtcr(r_cr); // restore CR
 306       __ mtlr(r_lr); // restore LR
 307 
 308       // Store result depending on type. Everything that is not
 309       // T_OBJECT, T_LONG, T_FLOAT, or T_DOUBLE is treated as T_INT.
 310       // Using volatile CRs. The compares are done here; the branches follow after the frame is popped.
 311       __ cmpwi(CR1, r_arg_result_type, T_OBJECT);
 312       __ cmpwi(CR5, r_arg_result_type, T_LONG);
 313       __ cmpwi(CR6, r_arg_result_type, T_FLOAT);
 314       __ cmpwi(CR7, r_arg_result_type, T_DOUBLE);
 315 
 316       __ pop_cont_fastpath(); // kills CR0, uses R16_thread
 317 
 318       // restore non-volatile registers (same offset as used for saving them above)
 319       __ restore_nonvolatile_registers(r_entryframe_fp, -(frame::entry_frame_locals_size + save_nonvolatile_registers_size),
 320                                        true, SuperwordUseVSX);
 321 
 322       // pop frame
 323       __ mr(R1_SP, r_entryframe_fp);
 324 
 325       // Stack on exit from call_stub:
 326       //
 327       //      0       [C_FRAME]
 328       //              ...
 329       //
 330       //  no call_stub frames left.
 331 
 332       __ beq(CR1, ret_is_object);
 333       __ beq(CR5, ret_is_long);
 334       __ beq(CR6, ret_is_float);
 335       __ beq(CR7, ret_is_double);
 336 
 337       // default: store the low 32 bits of the result
 338       __ stw(R3_RET, 0, r_arg_result_addr);
 339       __ blr(); // return to caller
 340 
 341       // case T_OBJECT:
 342       __ bind(ret_is_object);
 343       if (InlineTypeReturnedAsFields) {
 344         // Check for scalarized return value
 345         __ cmpdi(CR0, R3_RET, 0); // NOTE(review): tests R3_RET == 0, yet bit 0 is masked off below as if it were a tag bit - confirm the intended check
 346         __ beq(CR0, ret_is_long);
 347         // Load pack handler address
 348         __ untested("call stub InlineTypeReturnedAsFields"); // TODO: check return registers usage
 349         __ andi(R12_scratch2, R3_RET, -2); // clear bit 0 of R3_RET
 350         __ ld(R12_scratch2, InlineKlass::adr_members_offset(), R12_scratch2);
 351         __ ld(R12_scratch2, InlineKlass::pack_handler_jobject_offset(), R12_scratch2);
 352         __ mtctr(R12_scratch2);
 353         __ bctr(); // tail call
 354       } // else fall through
 355 
 356       // case T_LONG: (also used to store a T_OBJECT result as a full 64-bit value)
 357       __ bind(ret_is_long);
 358       __ std(R3_RET, 0, r_arg_result_addr);
 359       __ blr(); // return to caller
 360 
 361       // case T_FLOAT:
 362       __ bind(ret_is_float);
 363       __ stfs(F1_RET, 0, r_arg_result_addr);
 364       __ blr(); // return to caller
 365 
 366       // case T_DOUBLE:
 367       __ bind(ret_is_double);
 368       __ stfd(F1_RET, 0, r_arg_result_addr);
 369       __ blr(); // return to caller
 370     }
 371 
 372     return start;
 373   }
 374 
 375   // Return point for a Java call if there's an exception thrown in
 376   // Java code.  The exception is caught and transformed into a
 377   // pending exception stored in JavaThread that can be tested from
 378   // within the VM.
 379   //
 380   address generate_catch_exception() {
 381     StubId stub_id = StubId::stubgen_catch_exception_id;
 382     StubCodeMark mark(this, stub_id);
 383 
 384     address start = __ pc();
 385 
 386     // Registers alive
 387     //
 388     //  R16_thread
 389     //  R3_ARG1 - address of pending exception
 390     //  R4_ARG2 - return address in call stub
 391 
 392     const Register exception_file = R21_tmp1;
 393     const Register exception_line = R22_tmp2;
 394 
 395     __ load_const(exception_file, (void*)__FILE__); // this source file's name, embedded as a constant
 396     __ load_const(exception_line, (void*)__LINE__); // the line number of this very statement
 397 
 398     __ std(R3_ARG1, in_bytes(JavaThread::pending_exception_offset()), R16_thread); // record the pending exception in the thread
 399     // store into `char *'
 400     __ std(exception_file, in_bytes(JavaThread::exception_file_offset()), R16_thread);
 401     // store into `int' (hence the 32-bit store)
 402     __ stw(exception_line, in_bytes(JavaThread::exception_line_offset()), R16_thread);
 403 
 404     // complete return to VM
 405     assert(StubRoutines::_call_stub_return_address != nullptr, "must have been generated before");
 406 
 407     __ mtlr(R4_ARG2);
 408     // continue in call stub (at the return address passed in R4_ARG2)
 409     __ blr();
 410 
 411     return start;
 412   }
 413 
 414   // Continuation point for runtime calls returning with a pending
 415   // exception.  The pending exception check happened in the runtime
 416   // or native call stub.  The pending exception in Thread is
 417   // converted into a Java-level exception.
 418   //
 419   // Read:
 420   //
 421   //   LR:     The pc the runtime library callee wants to return to.
 422   //           Since the exception occurred in the callee, the return pc
 423   //           from the point of view of Java is the exception pc.
 424   //   thread: Needed for method handles.
 425   //
 426   // Invalidate:
 427   //
 428   //   volatile registers (except below).
 429   //
 430   // Update:
 431   //
 432   //   R3_ARG1: exception oop, R4_ARG2: exception pc
 433   //
 434   // (LR is unchanged and is live out).
 435   //
 436   address generate_forward_exception() {
 437     StubId stub_id = StubId::stubgen_forward_exception_id;
 438     StubCodeMark mark(this, stub_id);
 439     address start = __ pc();
 440 
 441     if (VerifyOops) {
 442       // Get pending exception oop.
 443       __ ld(R3_ARG1,
 444                 in_bytes(Thread::pending_exception_offset()),
 445                 R16_thread);
 446       // Make sure that this code is only executed if there is a pending exception.
 447       {
 448         Label L;
 449         __ cmpdi(CR0, R3_ARG1, 0);
 450         __ bne(CR0, L);
 451         __ stop("StubRoutines::forward exception: no pending exception (1)");
 452         __ bind(L);
 453       }
 454       __ verify_oop(R3_ARG1, "StubRoutines::forward exception: not an oop");
 455     }
 456 
 457     // Save LR and copy exception pc (LR) into R4_ARG2.
 458     __ save_LR(R4_ARG2);
 459     __ push_frame_reg_args(0, R0);
 460     // Find exception handler for the return address held in R4_ARG2.
 461     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 462                      SharedRuntime::exception_handler_for_return_address),
 463                     R16_thread,
 464                     R4_ARG2);
 465     // Copy handler's address into CTR; it is the target of the final bctr below.
 466     __ mtctr(R3_RET);
 467     __ pop_frame();
 468     __ restore_LR(R0);
 469 
 470     // Set up the arguments for the exception handler:
 471     //  - R3_ARG1: exception oop
 472     //  - R4_ARG2: exception pc.
 473 
 474     // Load pending exception oop.
 475     __ ld(R3_ARG1,
 476               in_bytes(Thread::pending_exception_offset()),
 477               R16_thread);
 478 
 479     // The exception pc is the return address in the caller.
 480     // Must load it into R4_ARG2.
 481     __ mflr(R4_ARG2);
 482 
 483 #ifdef ASSERT
 484     // Make sure exception is set.
 485     {
 486       Label L;
 487       __ cmpdi(CR0, R3_ARG1, 0);
 488       __ bne(CR0, L);
 489       __ stop("StubRoutines::forward exception: no pending exception (2)");
 490       __ bind(L);
 491     }
 492 #endif
 493 
 494     // Clear the pending exception in the thread; the oop now travels in R3_ARG1.
 495     __ li(R0, 0);
 496     __ std(R0,
 497                in_bytes(Thread::pending_exception_offset()),
 498                R16_thread);
 499     // Jump to exception handler.
 500     __ bctr();
 501 
 502     return start;
 503   }
 504 
 505 #undef __            // drop the definition from the top of the file ...
 506 #define __ _masm->   // ... and re-establish the same shorthand for the code that follows
 507 
 508 #if !defined(PRODUCT)
 509   // Wrapper which calls oopDesc::is_oop_or_null()
 510   // Only called by MacroAssembler::verify_oop
 511   static void verify_oop_helper(const char* message, oopDesc* o) {
 512     const bool plausible = oopDesc::is_oop_or_null(o);
 513     // Report a fatal error carrying the caller-supplied message and the offending value.
 514     if (!plausible) fatal("%s. oop: " PTR_FORMAT, message, p2i(o));
 515     StubRoutines::_verify_oop_count += 1;  // bump the global verification counter
 516   }
 517 #endif
 518 
 519   // Return address of code to be called from code generated by
 520   // MacroAssembler::verify_oop.
 521   //
 522   // Don't generate, rather use C++ code.
 523   address generate_verify_oop() {
 524 #if !defined(PRODUCT)
 525     // Non-product builds hand out the C++ helper; this is actually a `FunctionDescriptor*'.
 526     address helper_entry = CAST_FROM_FN_PTR(address, verify_oop_helper);
 527 #else
 528     // The helper is not compiled into product builds, so there is nothing to return.
 529     address helper_entry = nullptr;
 530 #endif
 531     return helper_entry;
 532   }
 533 
 534   // Computes the Galois/Counter Mode (GCM) product and reduction.
 535   //
 536   // This function performs polynomial multiplication of the subkey H with
 537   // the current GHASH state using vectorized polynomial multiplication (`vpmsumd`).
 538   // The subkey H is divided into lower, middle, and higher halves.
 539   // The multiplication results are reduced using `vConstC2` to stay within GF(2^128).
 540   // The final computed value is stored back into `vState`.
 541   static void computeGCMProduct(MacroAssembler* _masm,
 542                                 VectorRegister vLowerH, VectorRegister vH, VectorRegister vHigherH,
 543                                 VectorRegister vConstC2, VectorRegister vZero, VectorRegister vState,
 544                                 VectorRegister vLowProduct, VectorRegister vMidProduct, VectorRegister vHighProduct,
 545                                 VectorRegister vReducedLow, VectorRegister vTmp8, VectorRegister vTmp9,
 546                                 VectorRegister vCombinedResult, VectorRegister vSwappedH) {
 547     __ vxor(vH, vH, vState);                                       // Fold the current state into the input block (overwrites vH)
 548     __ vpmsumd(vLowProduct, vLowerH, vH);                          // L : Lower Half of subkey H
 549     __ vpmsumd(vMidProduct, vSwappedH, vH);                        // M : Combined halves of subkey H
 550     __ vpmsumd(vHighProduct, vHigherH, vH);                        // H : Higher Half of subkey H
 551     __ vpmsumd(vReducedLow, vLowProduct, vConstC2);                // Reduction
 552     __ vsldoi(vTmp8, vMidProduct, vZero, 8);                       // mL : Extract the lower 64 bits of M
 553     __ vsldoi(vTmp9, vZero, vMidProduct, 8);                       // mH : Extract the higher 64 bits of M
 554     __ vxor(vLowProduct, vLowProduct, vTmp8);                      // LL + mL : Partial result for lower half
 555     __ vxor(vHighProduct, vHighProduct, vTmp9);                    // HH + mH : Partial result for upper half
 556     __ vsldoi(vLowProduct, vLowProduct, vLowProduct, 8);           // Swap
 557     __ vxor(vLowProduct, vLowProduct, vReducedLow);                // Apply the first reduction result
 558     __ vsldoi(vCombinedResult, vLowProduct, vLowProduct, 8);       // Swap
 559     __ vpmsumd(vLowProduct, vLowProduct, vConstC2);                // Reduction using constant
 560     __ vxor(vCombinedResult, vCombinedResult, vHighProduct);       // Combine reduced Low & High products
 561     __ vxor(vState, vLowProduct, vCombinedResult);                 // Write the new GHASH state into vState
 562   }
 563 
 564   // Generate stub for ghash process blocks.
 565   //
 566   // Arguments for generated stub:
 567   //      state:    R3_ARG1 (long[] state)
 568   //      subkeyH:  R4_ARG2 (long[] subH)
 569   //      data:     R5_ARG3 (byte[] data)
 570   //      blocks:   R6_ARG4 (number of 16-byte blocks to process)
 571   //
 572   // The polynomials are processed in bit-reflected order for efficiency reasons.
 573   // This optimization leverages the structure of the Galois field arithmetic
 574   // to minimize the number of bit manipulations required during multiplication.
 575   // For an explanation of how this works, refer to:
 576   // Vinodh Gopal, Erdinc Ozturk, Wajdi Feghali, Jim Guilford, Gil Wolrich,
 577   // Martin Dixon. "Optimized Galois-Counter-Mode Implementation on Intel®
 578   // Architecture Processor"
 579   // http://web.archive.org/web/20130609111954/http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/communications-ia-galois-counter-mode-paper.pdf
 580   //
 581   //
 582   address generate_ghash_processBlocks() {
 583     StubCodeMark mark(this, "StubRoutines", "ghash");
 584     address start = __ function_entry();
 585 
 586     // Registers for parameters
 587     Register state = R3_ARG1;                     // long[] state
 588     Register subkeyH = R4_ARG2;                   // long[] subH
 589     Register data = R5_ARG3;                      // byte[] data
 590     Register blocks = R6_ARG4;                    // number of 16-byte blocks; becomes the CTR loop count
 591     Register temp1 = R8;
 592     // Vector Registers
 593     VectorRegister vZero = VR0;
 594     VectorRegister vH = VR1;
 595     VectorRegister vLowerH = VR2;
 596     VectorRegister vHigherH = VR3;
 597     VectorRegister vLowProduct = VR4;
 598     VectorRegister vMidProduct = VR5;
 599     VectorRegister vHighProduct = VR6;
 600     VectorRegister vReducedLow = VR7;
 601     VectorRegister vTmp8 = VR8;
 602     VectorRegister vTmp9 = VR9;
 603     VectorRegister vTmp10 = VR10;
 604     VectorRegister vSwappedH = VR11;
 605     VectorRegister vTmp12 = VR12;
 606     VectorRegister loadOrder = VR13;
 607     VectorRegister vHigh = VR14;
 608     VectorRegister vLow = VR15;
 609     VectorRegister vState = VR16;
 610     VectorRegister vPerm = VR17;
 611     VectorRegister vCombinedResult = VR18;
 612     VectorRegister vConstC2 = VR19;
 613 
 614     __ li(temp1, 0xc2);
 615     __ sldi(temp1, temp1, 56);                          // temp1 = 0xc2 << 56
 616     __ vspltisb(vZero, 0);
 617     __ mtvrd(vConstC2, temp1);
 618     __ lxvd2x(vH->to_vsr(), subkeyH);
 619     __ lxvd2x(vState->to_vsr(), state);
 620     // Operations to obtain lower and higher bytes of subkey H.
 621     __ vspltisb(vReducedLow, 1);
 622     __ vspltisb(vTmp10, 7);
 623     __ vsldoi(vTmp8, vZero, vReducedLow, 1);            // 0x1
 624     __ vor(vTmp8, vConstC2, vTmp8);                     // 0xC2...1
 625     __ vsplt(vTmp9, 0, vH);                             // MSB of H
 626     __ vsl(vH, vH, vReducedLow);                        // Shift H left; count comes from vReducedLow (splat of 1 above)
 627     __ vsrab(vTmp9, vTmp9, vTmp10);
 628     __ vand(vTmp9, vTmp9, vTmp8);                       // Carry
 629     __ vxor(vTmp10, vH, vTmp9);
 630     __ vsldoi(vConstC2, vZero, vConstC2, 8);
 631     __ vsldoi(vSwappedH, vTmp10, vTmp10, 8);            // swap Lower and Higher Halves of subkey H
 632     __ vsldoi(vLowerH, vZero, vSwappedH, 8);            // H.L
 633     __ vsldoi(vHigherH, vSwappedH, vZero, 8);           // H.H
 634 #ifdef ASSERT
 635     __ cmpwi(CR0, blocks, 0);                           // Compare 'blocks' (R6_ARG4) with zero
 636     __ asm_assert_ne("blocks should NOT be zero");
 637 #endif
 638     __ clrldi(blocks, blocks, 32);                      // Clear the upper 32 bits of 'blocks'
 639     __ mtctr(blocks);
 640     __ lvsl(loadOrder, temp1);
 641 #ifdef VM_LITTLE_ENDIAN
 642     __ vspltisb(vTmp12, 0xf);
 643     __ vxor(loadOrder, loadOrder, vTmp12);
 644 #define LE_swap_bytes(x) __ vec_perm(x, x, x, loadOrder)
 645 #else
 646 #define LE_swap_bytes(x)
 647 #endif
 648 
 649     // This code performs Karatsuba multiplication in Galois fields to compute the GHASH operation.
 650     //
 651     // The Karatsuba method breaks the multiplication of two 128-bit numbers into smaller parts,
 652     // performing three 128-bit multiplications and combining the results efficiently.
 653     //
 654     // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
 655     // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
 656     //
 657     // Inputs:
 658     // - vH:       The data vector (state), containing both B0 (lower half) and B1 (higher half).
 659     // - vLowerH:  Lower half of the subkey H (A0).
 660     // - vHigherH: Higher half of the subkey H (A1).
 661     // - vConstC2: Constant used for reduction (for final processing).
 662     //
 663     // References:
 664     // Shay Gueron, Michael E. Kounavis.
 665     // "Intel® Carry-Less Multiplication Instruction and its Usage for Computing the GCM Mode"
 666     // https://web.archive.org/web/20110609115824/https://software.intel.com/file/24918
 667     //
 668     Label L_aligned_loop, L_store, L_unaligned_loop, L_initialize_unaligned_loop;
 669     __ andi(temp1, data, 15);                           // temp1 = data & 15 (offset within a 16-byte line)
 670     __ cmpwi(CR0, temp1, 0);
 671     __ bne(CR0, L_initialize_unaligned_loop);
 672 
 673     __ bind(L_aligned_loop);
 674       __ lvx(vH, temp1, data);                          // temp1 is 0 on this path (checked above)
 675       LE_swap_bytes(vH);
 676       computeGCMProduct(_masm, vLowerH, vH, vHigherH, vConstC2, vZero, vState,
 677                     vLowProduct, vMidProduct, vHighProduct, vReducedLow, vTmp8, vTmp9, vCombinedResult, vSwappedH);
 678       __ addi(data, data, 16);
 679     __ bdnz(L_aligned_loop);
 680     __ b(L_store);
 681 
 682     __ bind(L_initialize_unaligned_loop);
 683     __ li(temp1, 0);
 684     __ lvsl(vPerm, temp1, data);
 685     __ lvx(vHigh, temp1, data);
 686 #ifdef VM_LITTLE_ENDIAN
 687     __ vspltisb(vTmp12, -1);
 688     __ vxor(vPerm, vPerm, vTmp12);
 689 #endif
 690     __ bind(L_unaligned_loop);
 691       __ addi(data, data, 16);
 692       __ lvx(vLow, temp1, data);
 693       __ vec_perm(vH, vHigh, vLow, vPerm);              // Assemble one block from the two loaded vectors
 694       computeGCMProduct(_masm, vLowerH, vH, vHigherH, vConstC2, vZero, vState,
 695                     vLowProduct, vMidProduct, vHighProduct, vReducedLow, vTmp8, vTmp9, vCombinedResult, vSwappedH);
 696       __ vmr(vHigh, vLow);                              // Reuse this load as the first vector of the next iteration
 697     __ bdnz(L_unaligned_loop);
 698 
 699     __ bind(L_store);
 700     __ stxvd2x(vState->to_vsr(), state);                // Write the updated state back to the state array
 701     __ blr();
 702 
 703     return start;
 704   }
 705   // -XX:+OptimizeFill : convert fill/copy loops into intrinsic
 706   //
 707   // The code is implemented(ported from sparc) as we believe it benefits JVM98, however
 708   // tracing(-XX:+TraceOptimizeFill) shows the intrinsic replacement doesn't happen at all!
 709   //
 710   // Source code in function is_range_check_if() shows that OptimizeFill relaxed the condition
 711   // for turning on loop predication optimization, and hence the behavior of "array range check"
 712   // and "loop invariant check" could be influenced, which potentially boosted JVM98.
 713   //
 714   // Generate stub for disjoint byte, short or int fill (selected by stub_id). For the
 715   // arrayof_* stub ids ("aligned" is true) the "to" address is assumed to be heapword aligned.
 716   //
 717   // Arguments for generated stub:
 718   //   to:    R3_ARG1
 719   //   value: R4_ARG2
 720   //   count: R5_ARG3 treated as signed
 721   //
 722   address generate_fill(StubId stub_id) {
         // Emits one of the six fill stubs. stub_id selects the element type
         // (byte, short or int) and whether "to" may be assumed aligned.
         // The generated code takes to/value/count in R3_ARG1/R4_ARG2/R5_ARG3,
         // uses R6_ARG4 and CR0 as scratch, and ends with blr.
         // Returns the entry address of the emitted stub.
 723     BasicType t;
 724     bool aligned;
 725
 726     switch (stub_id) {
 727     case StubId::stubgen_jbyte_fill_id:
 728       t = T_BYTE;
 729       aligned = false;
 730       break;
 731     case StubId::stubgen_jshort_fill_id:
 732       t = T_SHORT;
 733       aligned = false;
 734       break;
 735     case StubId::stubgen_jint_fill_id:
 736       t = T_INT;
 737       aligned = false;
 738       break;
 739     case StubId::stubgen_arrayof_jbyte_fill_id:
 740       t = T_BYTE;
 741       aligned = true;
 742       break;
 743     case StubId::stubgen_arrayof_jshort_fill_id:
 744       t = T_SHORT;
 745       aligned = true;
 746       break;
 747     case StubId::stubgen_arrayof_jint_fill_id:
 748       t = T_INT;
 749       aligned = true;
 750       break;
 751     default:
 752       ShouldNotReachHere();
 753     }
 754
 755     StubCodeMark mark(this, stub_id);
 756     address start = __ function_entry();
 757
 758     const Register to    = R3_ARG1;   // destination array address
 759     const Register value = R4_ARG2;   // fill value
 760     const Register count = R5_ARG3;   // elements count
 761     const Register temp  = R6_ARG4;   // temp register
 762
 763     //assert_clean_int(count, O3);    // Make sure 'count' is clean int.
 764
 765     Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte;
 766     Label L_fill_2_bytes, L_fill_4_bytes, L_fill_elements, L_fill_32_bytes;
 767
         // "shift" is log2(elements per 4 bytes): 2 for byte, 1 for short, 0 for int.
         // Hence (1 << shift) elements make up 4 bytes and (2 << shift) make up 8 bytes.
 768     int shift = -1;
 769     switch (t) {
 770        case T_BYTE:
 771         shift = 2;
 772         // Clone bytes (zero extend not needed because store instructions below ignore high order bytes).
 773         __ rldimi(value, value, 8, 48);     // 8 bit -> 16 bit
 774         __ cmpdi(CR0, count, 2<<shift);    // Short arrays (< 8 bytes) fill by element.
 775         __ blt(CR0, L_fill_elements);
 776         __ rldimi(value, value, 16, 32);    // 16 bit -> 32 bit
 777         break;
 778        case T_SHORT:
 779         shift = 1;
 780         // Clone bytes (zero extend not needed because store instructions below ignore high order bytes).
 781         __ rldimi(value, value, 16, 32);    // 16 bit -> 32 bit
 782         __ cmpdi(CR0, count, 2<<shift);    // Short arrays (< 8 bytes) fill by element.
 783         __ blt(CR0, L_fill_elements);
 784         break;
 785       case T_INT:
 786         shift = 0;
 787         __ cmpdi(CR0, count, 2<<shift);    // Short arrays (< 8 bytes) fill by element.
 788         __ blt(CR0, L_fill_4_bytes);
 789         break;
 790       default: ShouldNotReachHere();
 791     }
 792
 793     if (!aligned && (t == T_BYTE || t == T_SHORT)) {
 794       // Align destination address to a 4 byte boundary.
 795       if (t == T_BYTE) {
 796         // One byte misalignment happens only for byte arrays.
 797         __ andi_(temp, to, 1);
 798         __ beq(CR0, L_skip_align1);
 799         __ stb(value, 0, to);
 800         __ addi(to, to, 1);
 801         __ addi(count, count, -1);
 802         __ bind(L_skip_align1);
 803       }
 804       // Two bytes misalignment happens only for byte and short (char) arrays.
 805       __ andi_(temp, to, 2);
 806       __ beq(CR0, L_skip_align2);
 807       __ sth(value, 0, to);
 808       __ addi(to, to, 2);
 809       __ addi(count, count, -(1 << (shift - 1)));   // 2 bytes worth of elements
 810       __ bind(L_skip_align2);
 811     }
 812
 813     if (!aligned) {
 814       // Align to 8 bytes, we know we are 4 byte aligned to start.
 815       __ andi_(temp, to, 7);
 816       __ beq(CR0, L_fill_32_bytes);
 817       __ stw(value, 0, to);
 818       __ addi(to, to, 4);
 819       __ addi(count, count, -(1 << shift));         // 4 bytes worth of elements
 820       __ bind(L_fill_32_bytes);
 821     }
 822
 823     __ li(temp, 8<<shift);                  // Prepare for 32 byte loop.
 824     // Clone bytes int->long as above.
 825     __ rldimi(value, value, 32, 0);         // 32 bit -> 64 bit
 826
 827     Label L_check_fill_8_bytes;
 828     // Fill 32-byte chunks.
           // count is pre-decremented by one chunk; the loop runs while it stays >= 0.
 829     __ subf_(count, temp, count);
 830     __ blt(CR0, L_check_fill_8_bytes);
 831
 832     Label L_fill_32_bytes_loop;
 833     __ align(32);
 834     __ bind(L_fill_32_bytes_loop);
 835
 836     __ std(value, 0, to);
 837     __ std(value, 8, to);
 838     __ subf_(count, temp, count);           // Update count.
 839     __ std(value, 16, to);
 840     __ std(value, 24, to);
 841
 842     __ addi(to, to, 32);
 843     __ bge(CR0, L_fill_32_bytes_loop);
 844
 845     __ bind(L_check_fill_8_bytes);
           // Undo the pre-decrement; done if nothing is left.
 846     __ add_(count, temp, count);
 847     __ beq(CR0, L_exit);
 848     __ addic_(count, count, -(2 << shift));
 849     __ blt(CR0, L_fill_4_bytes);
 850
 851     //
 852     // Length is too short, just fill 8 bytes at a time.
 853     //
 854     Label L_fill_8_bytes_loop;
 855     __ bind(L_fill_8_bytes_loop);
 856     __ std(value, 0, to);
 857     __ addic_(count, count, -(2 << shift));
 858     __ addi(to, to, 8);
 859     __ bge(CR0, L_fill_8_bytes_loop);
 860
 861     // Fill trailing 4 bytes.
           // Only single bits of count are tested from here on; the subtractions above
           // were multiples of (2 << shift) and therefore left the low bits intact.
 862     __ bind(L_fill_4_bytes);
 863     __ andi_(temp, count, 1<<shift);
 864     __ beq(CR0, L_fill_2_bytes);
 865
 866     __ stw(value, 0, to);
 867     if (t == T_BYTE || t == T_SHORT) {
 868       __ addi(to, to, 4);
 869       // Fill trailing 2 bytes.
 870       __ bind(L_fill_2_bytes);
 871       __ andi_(temp, count, 1<<(shift-1));
 872       __ beq(CR0, L_fill_byte);
 873       __ sth(value, 0, to);
 874       if (t == T_BYTE) {
 875         __ addi(to, to, 2);
 876         // Fill trailing byte.
 877         __ bind(L_fill_byte);
 878         __ andi_(count, count, 1);
 879         __ beq(CR0, L_exit);
 880         __ stb(value, 0, to);
 881       } else {
 882         __ bind(L_fill_byte);
 883       }
 884     } else {
 885       __ bind(L_fill_2_bytes);
 886     }
 887     __ bind(L_exit);
 888     __ blr();
 889
 890     // Handle fills of less than 8 bytes. Int is handled elsewhere.
 891     if (t == T_BYTE) {
             // count < 8 here: store 1, 2 and 4 bytes according to bits 0, 1 and 2 of count.
 892       __ bind(L_fill_elements);
 893       Label L_fill_2, L_fill_4;
 894       __ andi_(temp, count, 1);
 895       __ beq(CR0, L_fill_2);
 896       __ stb(value, 0, to);
 897       __ addi(to, to, 1);
 898       __ bind(L_fill_2);
 899       __ andi_(temp, count, 2);
 900       __ beq(CR0, L_fill_4);
 901       __ stb(value, 0, to);
 902       __ stb(value, 1, to);               // Second byte of the pair goes to offset 1.
 903       __ addi(to, to, 2);
 904       __ bind(L_fill_4);
 905       __ andi_(temp, count, 4);
 906       __ beq(CR0, L_exit);
 907       __ stb(value, 0, to);
 908       __ stb(value, 1, to);
 909       __ stb(value, 2, to);
 910       __ stb(value, 3, to);
 911       __ blr();
 912     }
 913
 914     if (t == T_SHORT) {
             // count < 4 here: store 1 and 2 halfwords according to bits 0 and 1 of count.
 915       Label L_fill_2;
 916       __ bind(L_fill_elements);
 917       __ andi_(temp, count, 1);
 918       __ beq(CR0, L_fill_2);
 919       __ sth(value, 0, to);
 920       __ addi(to, to, 2);
 921       __ bind(L_fill_2);
 922       __ andi_(temp, count, 2);
 923       __ beq(CR0, L_exit);
 924       __ sth(value, 0, to);
 925       __ sth(value, 2, to);
 926       __ blr();
 927     }
 928     return start;
 929   }
 930 
 931   inline void assert_positive_int(Register count) {
         // Debug-only check (emits nothing unless ASSERT is defined): shifts
         // "count" right by 31 bits into R0 and asserts on the result with the
         // message "missing zero extend". The shifted value is zero only when
         // bit 31 and all higher bits of "count" are clear. Clobbers R0.
         // NOTE(review): asm_assert_eq presumably tests the CR0 "equal" bit set
         // by the record-form srdi_ -- confirm against MacroAssembler.
 932 #ifdef ASSERT
 933     __ srdi_(R0, count, 31);
 934     __ asm_assert_eq("missing zero extend");
 935 #endif
 936   }
 937 
 938   // Generate overlap test for array copy stubs.
 939   //
 940   // Input:
 941   //   R3_ARG1    -  from
 942   //   R4_ARG2    -  to
 943   //   R5_ARG3    -  element count
 944   //
 945   void array_overlap_test(address no_overlap_target, int log2_elem_size) {
         // Emits a test that branches to no_overlap_target when a forward copy
         // is safe; the generated code falls through when the copy has to be
         // done backwards.
         //   no_overlap_target: address the emitted branch jumps to (callers in
         //                      this file pass the matching disjoint copy stub)
         //   log2_elem_size:    shift amount turning the element count into bytes
         // Scratch: R6_ARG4, R7_ARG5, CR0, CR1 (and R0 in ASSERT builds).
 946     Register tmp1 = R6_ARG4;
 947     Register tmp2 = R7_ARG5;
 948
 949     assert_positive_int(R5_ARG3);
 950
 951     __ subf(tmp1, R3_ARG1, R4_ARG2); // distance in bytes (to - from)
 952     __ sldi(tmp2, R5_ARG3, log2_elem_size); // size in bytes
 953     __ cmpld(CR0, R3_ARG1, R4_ARG2); // Use unsigned comparison!
 954     __ cmpld(CR1, tmp1, tmp2);
         // CR0.less := !(from < to && distance < size), i.e. "no overlap".
 955     __ crnand(CR0, Assembler::less, CR1, Assembler::less);
 956     // Overlaps if Src before dst and distance smaller than size.
 957     // Branch to forward copy routine otherwise (within range of 32kB).
 958     __ bc(Assembler::bcondCRbiIs1, Assembler::bi0(CR0, Assembler::less), no_overlap_target);
 959
 960     // need to copy backwards
 961   }
 962 
 963   // This is the common error-exit stub for UnsafeMemoryAccess.
 964   address generate_unsafecopy_common_error_exit() {
         // Emits the exit sequence: if the CPU has mfdscr, reload DSCR with
         // VM_Version::_dscr_val (R6_ARG4 is used as scratch), then return 0 in
         // R3_RET. Returns the address of the first emitted instruction.
 965     address start_pc = __ pc();
 966     Register tmp1 = R6_ARG4;
 967     // The copy stubs change DSCR while their bulk loop runs, so the
         // interrupted stub may have left a modified value behind; reset it.
 968     if (VM_Version::has_mfdscr()) {
 969       __ load_const_optimized(tmp1, VM_Version::_dscr_val);
 970       __ mtdscr(tmp1);
 971     }
 972     __ li(R3_RET, 0); // return 0
 973     __ blr();
 974     return start_pc;
 975   }
 976 
 977   // The guideline in the implementations of generate_disjoint_xxx_copy
 978   // (xxx=byte,short,int,long,oop) is to copy as many elements as possible with
 979   // single instructions, but to avoid alignment interrupts (see subsequent
 980   // comment). Furthermore, we try to minimize misaligned access, even
 981   // though they cause no alignment interrupt.
 982   //
 983   // In Big-Endian mode, the PowerPC architecture requires implementations to
 984   // handle automatically misaligned integer halfword and word accesses,
 985   // word-aligned integer doubleword accesses, and word-aligned floating-point
 986   // accesses. Other accesses may or may not generate an Alignment interrupt
 987   // depending on the implementation.
 988   // Alignment interrupt handling may require on the order of hundreds of cycles,
 989   // so every effort should be made to avoid misaligned memory values.
 990   //
 991   //
 992   // Generate stub for disjoint byte copy.  If "aligned" is true, the
 993   // "from" and "to" addresses are assumed to be heapword aligned.
 994   //
 995   // Arguments for generated stub:
 996   //      from:  R3_ARG1
 997   //      to:    R4_ARG2
 998   //      count: R5_ARG3 treated as signed
 999   //
1000   address generate_disjoint_byte_copy(StubId stub_id) {
         // Emits a forward byte copy stub and returns its entry address.
         // stub_id only selects whether "from"/"to" may be assumed aligned.
         // Phases of the generated code: optional byte-wise alignment to 4 bytes,
         // optional 4 byte step to reach 8 byte alignment, 32-bytes-per-iteration
         // VSX loop, 4-bytes-per-iteration loop, byte loop. The stub returns 0.
1001     bool aligned;
1002     switch (stub_id) {
1003     case StubId::stubgen_jbyte_disjoint_arraycopy_id:
1004       aligned = false;
1005       break;
1006     case StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id:
1007       aligned = true;
1008       break;
1009     default:
1010       ShouldNotReachHere();
1011     }
1012
1013     StubCodeMark mark(this, stub_id);
1014     address start = __ function_entry();
1015     assert_positive_int(R5_ARG3);
1016
1017     Register tmp1 = R6_ARG4;
1018     Register tmp2 = R7_ARG5;
1019     Register tmp3 = R8_ARG6;   // holds 0, used as index register for lwzx/stwx
1020     Register tmp4 = R9_ARG7;   // not used in this function
1021
1022     VectorSRegister tmp_vsr1  = VSR1;
1023     VectorSRegister tmp_vsr2  = VSR2;
1024
         // l_8 is declared but never bound or branched to in this function.
1025     Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9, l_10;
1026     {
1027       // UnsafeMemoryAccess page error: continue at UnsafeMemoryAccess common_error_exit
1028       UnsafeMemoryAccessMark umam(this, !aligned, false);
1029
1030       // Don't try anything fancy if arrays don't have many elements.
1031       __ li(tmp3, 0);
1032       __ cmpwi(CR0, R5_ARG3, 17);
1033       __ ble(CR0, l_6); // copy 4 at a time
1034
1035       if (!aligned) {
1036         __ xorr(tmp1, R3_ARG1, R4_ARG2);
1037         __ andi_(tmp1, tmp1, 3);
1038         __ bne(CR0, l_6); // If arrays don't have the same alignment mod 4, do 4 element copy.
1039
1040         // Copy elements if necessary to align to 4 bytes.
1041         __ neg(tmp1, R3_ARG1); // Compute distance to alignment boundary.
1042         __ andi_(tmp1, tmp1, 3);
1043         __ beq(CR0, l_2);
1044
             // tmp1 (1..3) bytes are copied one by one; take them off the count up front.
1045         __ subf(R5_ARG3, tmp1, R5_ARG3);
1046         __ bind(l_9);
1047         __ lbz(tmp2, 0, R3_ARG1);
1048         __ addic_(tmp1, tmp1, -1);
1049         __ stb(tmp2, 0, R4_ARG2);
1050         __ addi(R3_ARG1, R3_ARG1, 1);
1051         __ addi(R4_ARG2, R4_ARG2, 1);
1052         __ bne(CR0, l_9);
1053
1054         __ bind(l_2);
1055       }
1056
1057       // Try to reach 8 byte alignment before the bulk loop.
1058       __ xorr(tmp2, R3_ARG1, R4_ARG2); // skip if src & dest have differing alignment mod 8
1059       __ andi_(tmp1, tmp2, 7);
1060       __ bne(CR0, l_7); // not same alignment -> to or from is aligned -> copy 8
1061
1062       // copy a 4-element word if necessary to align to 8 bytes
1063       __ andi_(R0, R3_ARG1, 7);
1064       __ beq(CR0, l_7);
1065
1066       __ lwzx(tmp2, R3_ARG1, tmp3);
1067       __ addi(R5_ARG3, R5_ARG3, -4);
1068       __ stwx(tmp2, R4_ARG2, tmp3);
1069       { // FasterArrayCopy
1070         __ addi(R3_ARG1, R3_ARG1, 4);
1071         __ addi(R4_ARG2, R4_ARG2, 4);
1072       }
1073       __ bind(l_7);
1074
1075       { // FasterArrayCopy
1076         __ cmpwi(CR0, R5_ARG3, 31);
1077         __ ble(CR0, l_6); // copy 4 at a time if less than 32 elements remain
1078
             // CTR = number of 32-byte iterations; R5_ARG3 keeps the remainder.
1079         __ srdi(tmp1, R5_ARG3, 5);
1080         __ andi_(R5_ARG3, R5_ARG3, 31);
1081         __ mtctr(tmp1);
1082
1083
1084         // Prefetch the data into the L2 cache.
1085         __ dcbt(R3_ARG1, 0);
1086
1087         // If supported set DSCR pre-fetch to deepest.
1088         if (VM_Version::has_mfdscr()) {
1089           __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
1090           __ mtdscr(tmp2);
1091         }
1092         __ li(tmp1, 16);   // index for the second 16-byte half
1093
1094         // Backbranch target aligned to 32-byte. Not 16-byte align as
1095         // loop contains < 8 instructions that fit inside a single
1096         // i-cache sector.
1097         __ align(32);
1098
1099         __ bind(l_10);
1100         // Use loop with VSX load/store instructions to
1101         // copy 32 elements a time.
1102         __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load src
1103         __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst
1104         __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src + 16
1105         __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
1106         __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32
1107         __ addi(R4_ARG2, R4_ARG2, 32);       // Update dsc+=32
1108         __ bdnz(l_10);                       // Dec CTR and loop if not zero.
1109
1110         // Restore DSCR pre-fetch value.
1111         if (VM_Version::has_mfdscr()) {
1112           __ load_const_optimized(tmp2, VM_Version::_dscr_val);
1113           __ mtdscr(tmp2);
1114         }
1115
1116      } // FasterArrayCopy
1117
1118       __ bind(l_6);
1119
1120       // copy 4 elements at a time
1121       __ cmpwi(CR0, R5_ARG3, 4);
1122       __ blt(CR0, l_1);
1123       __ srdi(tmp1, R5_ARG3, 2);
1124       __ mtctr(tmp1); // is > 0
1125       __ andi_(R5_ARG3, R5_ARG3, 3);
1126
1127       { // FasterArrayCopy
             // Bias both pointers by -4 for the update-form lwzu/stwu, undo it after the loop.
1128         __ addi(R3_ARG1, R3_ARG1, -4);
1129         __ addi(R4_ARG2, R4_ARG2, -4);
1130         __ bind(l_3);
1131         __ lwzu(tmp2, 4, R3_ARG1);
1132         __ stwu(tmp2, 4, R4_ARG2);
1133         __ bdnz(l_3);
1134         __ addi(R3_ARG1, R3_ARG1, 4);
1135         __ addi(R4_ARG2, R4_ARG2, 4);
1136       }
1137
1138       // do single element copy
1139       __ bind(l_1);
1140       __ cmpwi(CR0, R5_ARG3, 0);
1141       __ beq(CR0, l_4);
1142
1143       { // FasterArrayCopy
1144         __ mtctr(R5_ARG3);
             // Bias both pointers by -1 for the update-form lbzu/stbu.
1145         __ addi(R3_ARG1, R3_ARG1, -1);
1146         __ addi(R4_ARG2, R4_ARG2, -1);
1147
1148         __ bind(l_5);
1149         __ lbzu(tmp2, 1, R3_ARG1);
1150         __ stbu(tmp2, 1, R4_ARG2);
1151         __ bdnz(l_5);
1152       }
1153     }
1154
1155     __ bind(l_4);
1156     __ li(R3_RET, 0); // return 0
1157     __ blr();
1158
1159     return start;
1160   }
1161 
1162   // Generate stub for conjoint byte copy.  If "aligned" is true, the
1163   // "from" and "to" addresses are assumed to be heapword aligned.
1164   //
1165   // Arguments for generated stub:
1166   //      from:  R3_ARG1
1167   //      to:    R4_ARG2
1168   //      count: R5_ARG3 treated as signed
1169   //
1170   address generate_conjoint_byte_copy(StubId stub_id) {
         // Emits a byte copy stub that tolerates overlapping ranges and returns
         // its entry address. The generated code first runs the overlap test,
         // which branches to the matching disjoint stub when a forward copy is
         // safe; otherwise it copies byte by byte from the highest index down.
         // The stub returns 0.
1171     bool aligned;
1172     switch (stub_id) {
1173     case StubId::stubgen_jbyte_arraycopy_id:
1174       aligned = false;
1175       break;
1176     case StubId::stubgen_arrayof_jbyte_arraycopy_id:
1177       aligned = true;
1178       break;
1179     default:
1180       ShouldNotReachHere();
1181     }
1182
1183     StubCodeMark mark(this, stub_id);
1184     address start = __ function_entry();
1185     assert_positive_int(R5_ARG3);
1186
1187     Register tmp1 = R6_ARG4;
1188     Register tmp2 = R7_ARG5;   // not used in this function
1189     Register tmp3 = R8_ARG6;   // not used in this function
1190
1191     address nooverlap_target = aligned ?
1192       STUB_ENTRY(arrayof_jbyte_disjoint_arraycopy()) :
1193       STUB_ENTRY(jbyte_disjoint_arraycopy());
1194
1195     array_overlap_test(nooverlap_target, 0);
1196     // Do reverse copy. We assume the case of actual overlap is rare enough
1197     // that we don't have to optimize it.
1198     Label l_1, l_2;
1199     {
1200       // UnsafeMemoryAccess page error: continue at UnsafeMemoryAccess common_error_exit
1201       UnsafeMemoryAccessMark umam(this, !aligned, false);
           // R5_ARG3 doubles as the byte index. The loop is entered at l_2, so
           // the index is decremented and tested before each load; the store of
           // the loaded byte happens at the top of the next iteration (l_1).
1202       __ b(l_2);
1203       __ bind(l_1);
1204       __ stbx(tmp1, R4_ARG2, R5_ARG3);
1205       __ bind(l_2);
1206       __ addic_(R5_ARG3, R5_ARG3, -1);
1207       __ lbzx(tmp1, R3_ARG1, R5_ARG3);
1208       __ bge(CR0, l_1);
1209     }
1210     __ li(R3_RET, 0); // return 0
1211     __ blr();
1212
1213     return start;
1214   }
1215 
1216   // Generate stub for disjoint short copy.  If "aligned" is true, the
1217   // "from" and "to" addresses are assumed to be heapword aligned.
1218   //
1219   // Arguments for generated stub:
1220   //      from:  R3_ARG1
1221   //      to:    R4_ARG2
1222   //  elm.count: R5_ARG3 treated as signed
1223   //
1224   // Strategy for aligned==true:
1225   //
1226   //  If length <= 9:
1227   //     1. copy 2 elements at a time (l_6)
1228   //     2. copy last element if original element count was odd (l_1)
1229   //
1230   //  If length > 9:
1231   //     1. copy 16 elements at a time with VSX until less than 16 elements
       //        are left (l_7)
1232   //     2. copy 2 elements at a time until less than 2 elements are left (l_6)
1233   //     3. copy last element if one was left in step 2. (l_1)
1234   //
1235   //
1236   // Strategy for aligned==false:
1237   //
1238   //  If length <= 9: same as aligned==true case, but NOTE: load/stores
1239   //                  can be unaligned (see comment below)
1240   //
1241   //  If length > 9:
1242   //     1. continue with step 6. if the alignment of from and to mod 4
1243   //        is different.
1244   //     2. align from and to to 4 bytes by copying 1 element if necessary
1245   //     3. at l_2 from and to are 4 byte aligned; continue with
1246   //        5. if they cannot be aligned to 8 bytes because they have
1247   //        got different alignment mod 8.
1248   //     4. at this point we know that both, from and to, have the same
1249   //        alignment mod 8, now copy one element if necessary to get
1250   //        8 byte alignment of from and to.
1251   //     5. copy 16 elements at a time with VSX until less than 16 elements
1252   //        are left (l_7); depending on step 3. all load/stores are aligned
1253   //        or either all loads or all stores are unaligned.
1254   //     6. copy 2 elements at a time until less than 2 elements are
1255   //        left (l_6); arriving here from step 1., there is a chance
1256   //        that all accesses are unaligned.
1257   //     7. copy last element if one was left in step 6. (l_1)
1258   //
1259   //  There are unaligned data accesses using integer load/store
1260   //  instructions in this stub. POWER allows such accesses.
1261   //
1262   //  According to the manuals (PowerISA_V2.06_PUBLIC, Book II,
1263   //  Chapter 2: Effect of Operand Placement on Performance) unaligned
1264   //  integer load/stores have good performance. Only unaligned
1265   //  floating point load/stores can have poor performance.
1266   //
1267   //  TODO:
1268   //
1269   //  1. check if aligning the backbranch target of loops is beneficial
1270   //
1271   address generate_disjoint_short_copy(StubId stub_id) {
         // Emits a forward short (2 byte element) copy stub and returns its
         // entry address. stub_id only selects whether "from"/"to" may be
         // assumed aligned. The bulk loop at l_7 moves 32 bytes (16 elements)
         // per iteration with VSX; l_6 copies 2 elements per iteration and l_1
         // copies the last element. The stub returns 0.
1272     bool aligned;
1273     switch (stub_id) {
1274     case StubId::stubgen_jshort_disjoint_arraycopy_id:
1275       aligned = false;
1276       break;
1277     case StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id:
1278       aligned = true;
1279       break;
1280     default:
1281       ShouldNotReachHere();
1282     }
1283
1284     StubCodeMark mark(this, stub_id);
1285
1286     Register tmp1 = R6_ARG4;
1287     Register tmp2 = R7_ARG5;
1288     Register tmp3 = R8_ARG6;   // holds 0, used as index register for lwzx/stwx
1289     Register tmp4 = R9_ARG7;   // not used in this function
1290
1291     VectorSRegister tmp_vsr1  = VSR1;
1292     VectorSRegister tmp_vsr2  = VSR2;
1293
1294     address start = __ function_entry();
1295     assert_positive_int(R5_ARG3);
1296
         // l_8 is declared but never bound or branched to in this function.
1297     Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9;
1298     {
1299       // UnsafeMemoryAccess page error: continue at UnsafeMemoryAccess common_error_exit
1300       UnsafeMemoryAccessMark umam(this, !aligned, false);
1301       // don't try anything fancy if arrays don't have many elements
1302       __ li(tmp3, 0);
1303       __ cmpwi(CR0, R5_ARG3, 9);
1304       __ ble(CR0, l_6); // copy 2 at a time
1305
1306       if (!aligned) {
1307         __ xorr(tmp1, R3_ARG1, R4_ARG2);
1308         __ andi_(tmp1, tmp1, 3);
1309         __ bne(CR0, l_6); // if arrays don't have the same alignment mod 4, do 2 element copy
1310
1311         // At this point it is guaranteed that both, from and to have the same alignment mod 4.
1312
1313         // Copy 1 element if necessary to align to 4 bytes.
1314         __ andi_(tmp1, R3_ARG1, 3);
1315         __ beq(CR0, l_2);
1316
1317         __ lhz(tmp2, 0, R3_ARG1);
1318         __ addi(R3_ARG1, R3_ARG1, 2);
1319         __ sth(tmp2, 0, R4_ARG2);
1320         __ addi(R4_ARG2, R4_ARG2, 2);
1321         __ addi(R5_ARG3, R5_ARG3, -1);
1322         __ bind(l_2);
1323
1324         // At this point the positions of both, from and to, are at least 4 byte aligned.
1325
1326         // Prepare for the bulk copy loop.
1327         // Align to 8 bytes, but only if both, from and to, have same alignment mod 8.
1328         __ xorr(tmp2, R3_ARG1, R4_ARG2);
1329         __ andi_(tmp1, tmp2, 7);
1330         __ bne(CR0, l_7); // not same alignment mod 8 -> bulk copy, either from or to will be unaligned
1331
1332         // Copy a 2-element word if necessary to align to 8 bytes.
1333         __ andi_(R0, R3_ARG1, 7);
1334         __ beq(CR0, l_7);
1335
1336         __ lwzx(tmp2, R3_ARG1, tmp3);
1337         __ addi(R5_ARG3, R5_ARG3, -2);
1338         __ stwx(tmp2, R4_ARG2, tmp3);
1339         { // FasterArrayCopy
1340           __ addi(R3_ARG1, R3_ARG1, 4);
1341           __ addi(R4_ARG2, R4_ARG2, 4);
1342         }
1343       }
1344
1345       __ bind(l_7);
1346
1347       // Bulk copy of 16 elements at a time; either the loads or the stores can
1348       // be unaligned if aligned == false.
1349
1350       { // FasterArrayCopy
1351         __ cmpwi(CR0, R5_ARG3, 15);
1352         __ ble(CR0, l_6); // copy 2 at a time if less than 16 elements remain
1353
             // CTR = number of 32-byte iterations; R5_ARG3 keeps the remainder.
1354         __ srdi(tmp1, R5_ARG3, 4);
1355         __ andi_(R5_ARG3, R5_ARG3, 15);
1356         __ mtctr(tmp1);
1357
1358
1359         // Processor supports VSX, so use it to mass copy.
1360
1361           // Prefetch src data into L2 cache.
1362           __ dcbt(R3_ARG1, 0);
1363
1364           // If supported set DSCR pre-fetch to deepest.
1365           if (VM_Version::has_mfdscr()) {
1366             __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
1367             __ mtdscr(tmp2);
1368           }
1369           __ li(tmp1, 16);   // index for the second 16-byte half
1370
1371           // Backbranch target aligned to 32-byte. It's not aligned 16-byte
1372           // as loop contains < 8 instructions that fit inside a single
1373           // i-cache sector.
1374           __ align(32);
1375
1376           __ bind(l_9);
1377           // Use loop with VSX load/store instructions to
1378           // copy 16 elements a time.
1379           __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load from src.
1380           __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst.
1381           __ lxvd2x(tmp_vsr2, R3_ARG1, tmp1);  // Load from src + 16.
1382           __ stxvd2x(tmp_vsr2, R4_ARG2, tmp1); // Store to dst + 16.
1383           __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32.
1384           __ addi(R4_ARG2, R4_ARG2, 32);       // Update dsc+=32.
1385           __ bdnz(l_9);                        // Dec CTR and loop if not zero.
1386
1387           // Restore DSCR pre-fetch value.
1388           if (VM_Version::has_mfdscr()) {
1389             __ load_const_optimized(tmp2, VM_Version::_dscr_val);
1390             __ mtdscr(tmp2);
1391           }
1392
1393       } // FasterArrayCopy
1394       __ bind(l_6);
1395
1396       // copy 2 elements at a time
1397       { // FasterArrayCopy
1398         __ cmpwi(CR0, R5_ARG3, 2);
1399         __ blt(CR0, l_1);
1400         __ srdi(tmp1, R5_ARG3, 1);
1401         __ andi_(R5_ARG3, R5_ARG3, 1);
1402
             // Bias both pointers by -4 for the update-form lwzu/stwu, undo it after the loop.
1403         __ addi(R3_ARG1, R3_ARG1, -4);
1404         __ addi(R4_ARG2, R4_ARG2, -4);
1405         __ mtctr(tmp1);
1406
1407         __ bind(l_3);
1408         __ lwzu(tmp2, 4, R3_ARG1);
1409         __ stwu(tmp2, 4, R4_ARG2);
1410         __ bdnz(l_3);
1411
1412         __ addi(R3_ARG1, R3_ARG1, 4);
1413         __ addi(R4_ARG2, R4_ARG2, 4);
1414       }
1415
1416       // do single element copy
1417       __ bind(l_1);
1418       __ cmpwi(CR0, R5_ARG3, 0);
1419       __ beq(CR0, l_4);
1420
1421       { // FasterArrayCopy
1422         __ mtctr(R5_ARG3);
             // Bias both pointers by -2 for the update-form lhzu/sthu.
1423         __ addi(R3_ARG1, R3_ARG1, -2);
1424         __ addi(R4_ARG2, R4_ARG2, -2);
1425
1426         __ bind(l_5);
1427         __ lhzu(tmp2, 2, R3_ARG1);
1428         __ sthu(tmp2, 2, R4_ARG2);
1429         __ bdnz(l_5);
1430       }
1431     }
1432
1433     __ bind(l_4);
1434     __ li(R3_RET, 0); // return 0
1435     __ blr();
1436
1437     return start;
1438   }
1439 
1440   // Generate stub for conjoint short copy.  If "aligned" is true, the
1441   // "from" and "to" addresses are assumed to be heapword aligned.
1442   //
1443   // Arguments for generated stub:
1444   //      from:  R3_ARG1
1445   //      to:    R4_ARG2
1446   //      count: R5_ARG3 treated as signed
1447   //
1448   address generate_conjoint_short_copy(StubId stub_id) {
         // Emits a short (2 byte element) copy stub that tolerates overlapping
         // ranges and returns its entry address. The generated code first runs
         // the overlap test, which branches to the matching disjoint stub when a
         // forward copy is safe; otherwise it copies element by element from the
         // highest index down. The stub returns 0.
1449     bool aligned;
1450     switch (stub_id) {
1451     case StubId::stubgen_jshort_arraycopy_id:
1452       aligned = false;
1453       break;
1454     case StubId::stubgen_arrayof_jshort_arraycopy_id:
1455       aligned = true;
1456       break;
1457     default:
1458       ShouldNotReachHere();
1459     }
1460
1461     StubCodeMark mark(this, stub_id);
1462     address start = __ function_entry();
1463     assert_positive_int(R5_ARG3);
1464
1465     Register tmp1 = R6_ARG4;   // byte offset of the element being copied
1466     Register tmp2 = R7_ARG5;   // element in flight
1467     Register tmp3 = R8_ARG6;   // not used in this function
1468
1469     address nooverlap_target = aligned ?
1470       STUB_ENTRY(arrayof_jshort_disjoint_arraycopy()) :
1471       STUB_ENTRY(jshort_disjoint_arraycopy());
1472
1473     array_overlap_test(nooverlap_target, 1);
1474
1475     Label l_1, l_2;
1476     {
1477       // UnsafeMemoryAccess page error: continue at UnsafeMemoryAccess common_error_exit
1478       UnsafeMemoryAccessMark umam(this, !aligned, false);
           // tmp1 = count * 2 (size in bytes). The loop is entered at l_2, so the
           // offset is decremented and tested before each load; the store of the
           // loaded element happens at the top of the next iteration (l_1).
1479       __ sldi(tmp1, R5_ARG3, 1);
1480       __ b(l_2);
1481       __ bind(l_1);
1482       __ sthx(tmp2, R4_ARG2, tmp1);
1483       __ bind(l_2);
1484       __ addic_(tmp1, tmp1, -2);
1485       __ lhzx(tmp2, R3_ARG1, tmp1);
1486       __ bge(CR0, l_1);
1487     }
1488     __ li(R3_RET, 0); // return 0
1489     __ blr();
1490
1491     return start;
1492   }
1493 
1494   // Generate core code for disjoint int copy (and oop copy on 32-bit).  If "aligned"
1495   // is true, the "from" and "to" addresses are assumed to be heapword aligned.
1496   //
1497   // Arguments:
1498   //      from:  R3_ARG1
1499   //      to:    R4_ARG2
1500   //      count: R5_ARG3 treated as signed
1501   //
1502   void generate_disjoint_int_copy_core(bool aligned) {
         // Emits the body of a forward int (4 byte element) copy; no entry,
         // return value or blr is generated here. Phases of the generated code:
         // optional single element step to reach 8 byte alignment (only when
         // !aligned), 32-bytes-per-iteration VSX loop (8 elements), then a
         // one-element-per-iteration loop for the rest.
         // Scratch: R6_ARG4, R7_ARG5, R8_ARG6, R0, CR0, CTR, VSR1, VSR2.
1503     Register tmp1 = R6_ARG4;
1504     Register tmp2 = R7_ARG5;
1505     Register tmp3 = R8_ARG6;   // holds 0, used as index register for lwzx/stwx
1506     Register tmp4 = R0;        // declared but not referenced by name below
1507
1508     VectorSRegister tmp_vsr1  = VSR1;
1509     VectorSRegister tmp_vsr2  = VSR2;
1510
         // l_5 and l_6 are declared but never bound or branched to in this function.
1511     Label l_1, l_2, l_3, l_4, l_5, l_6, l_7;
1512
1513     // for short arrays, just do single element copy
1514     __ li(tmp3, 0);
1515     __ cmpwi(CR0, R5_ARG3, 5);
1516     __ ble(CR0, l_2);
1517
1518     if (!aligned) {
1519         // check if arrays have same alignment mod 8.
1520         __ xorr(tmp1, R3_ARG1, R4_ARG2);
1521         __ andi_(R0, tmp1, 7);
1522         // Not the same alignment, but ld and std just need to be 4 byte aligned.
1523         __ bne(CR0, l_4); // to OR from is 8 byte aligned -> copy 2 at a time
1524
1525         // copy 1 element to align to and from on an 8 byte boundary
1526         __ andi_(R0, R3_ARG1, 7);
1527         __ beq(CR0, l_4);
1528
1529         __ lwzx(tmp2, R3_ARG1, tmp3);
1530         __ addi(R5_ARG3, R5_ARG3, -1);
1531         __ stwx(tmp2, R4_ARG2, tmp3);
1532         { // FasterArrayCopy
1533           __ addi(R3_ARG1, R3_ARG1, 4);
1534           __ addi(R4_ARG2, R4_ARG2, 4);
1535         }
1536         __ bind(l_4);
1537       }
1538
1539     { // FasterArrayCopy
1540       __ cmpwi(CR0, R5_ARG3, 7);
1541       __ ble(CR0, l_2); // copy 1 at a time if less than 8 elements remain
1542
           // CTR = number of 32-byte iterations; R5_ARG3 keeps the remainder.
1543       __ srdi(tmp1, R5_ARG3, 3);
1544       __ andi_(R5_ARG3, R5_ARG3, 7);
1545       __ mtctr(tmp1);
1546
1547     // Processor supports VSX, so use it to mass copy.
1548
1549     // Prefetch the data into the L2 cache.
1550     __ dcbt(R3_ARG1, 0);
1551
1552     // Set DSCR pre-fetch to deepest.
1553     if (VM_Version::has_mfdscr()) {
1554       __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
1555       __ mtdscr(tmp2);
1556     }
1557     __ li(tmp1, 16);   // index for the second 16-byte half
1558
1559     // Backbranch target aligned to 32-byte. Not 16-byte align as
1560     // loop contains < 8 instructions that fit inside a single
1561     // i-cache sector.
1562     __ align(32);
1563
1564     __ bind(l_7);
1565     // Use loop with VSX load/store instructions to
1566     // copy 8 elements a time.
1567     __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load src
1568     __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst
1569     __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src + 16
1570     __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
1571     __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32
1572     __ addi(R4_ARG2, R4_ARG2, 32);       // Update dsc+=32
1573     __ bdnz(l_7);                        // Dec CTR and loop if not zero.
1574
1575     // Restore DSCR pre-fetch value.
1576     if (VM_Version::has_mfdscr()) {
1577       __ load_const_optimized(tmp2, VM_Version::_dscr_val);
1578       __ mtdscr(tmp2);
1579     }
1580
1581    } // FasterArrayCopy
1582
1583     // copy 1 element at a time
1584     __ bind(l_2);
1585     __ cmpwi(CR0, R5_ARG3, 0);
1586     __ beq(CR0, l_1);
1587
1588     { // FasterArrayCopy
1589       __ mtctr(R5_ARG3);
           // Bias both pointers by -4 for the update-form lwzu/stwu.
1590       __ addi(R3_ARG1, R3_ARG1, -4);
1591       __ addi(R4_ARG2, R4_ARG2, -4);
1592
1593       __ bind(l_3);
1594       __ lwzu(tmp2, 4, R3_ARG1);
1595       __ stwu(tmp2, 4, R4_ARG2);
1596       __ bdnz(l_3);
1597     }
1598
1599     __ bind(l_1);
1600     return;
1601   }
1602 
1603   // Generate stub for disjoint int copy.  If "aligned" is true, the
1604   // "from" and "to" addresses are assumed to be heapword aligned.
1605   //
1606   // Arguments for generated stub:
1607   //      from:  R3_ARG1
1608   //      to:    R4_ARG2
1609   //      count: R5_ARG3 treated as signed
1610   //
1611   address generate_disjoint_int_copy(StubId stub_id) {
         // Emits a forward int copy stub and returns its entry address.
         // stub_id only selects whether "from"/"to" may be assumed aligned.
         // The copy itself is produced by generate_disjoint_int_copy_core();
         // this wrapper adds the entry, the count assertion, the
         // UnsafeMemoryAccessMark scope around the copy and the "return 0" exit.
1612     bool aligned;
1613     switch (stub_id) {
1614     case StubId::stubgen_jint_disjoint_arraycopy_id:
1615       aligned = false;
1616       break;
1617     case StubId::stubgen_arrayof_jint_disjoint_arraycopy_id:
1618       aligned = true;
1619       break;
1620     default:
1621       ShouldNotReachHere();
1622     }
1623
1624     StubCodeMark mark(this, stub_id);
1625     address start = __ function_entry();
1626     assert_positive_int(R5_ARG3);
1627     {
1628       // UnsafeMemoryAccess page error: continue at UnsafeMemoryAccess common_error_exit
1629       UnsafeMemoryAccessMark umam(this, !aligned, false);
1630       generate_disjoint_int_copy_core(aligned);
1631     }
1632     __ li(R3_RET, 0); // return 0
1633     __ blr();
1634     return start;
1635   }
1636 
1637   // Generate core code for conjoint int copy (also emitted by the conjoint
1638   // oop copy stub when UseCompressedOops is set).  If "aligned" is true, the
1639   // "from" and "to" addresses are assumed to be heapword aligned.
1640   //
1641   // Arguments:
1642   //      from:  R3_ARG1
1643   //      to:    R4_ARG2
1644   //      count: R5_ARG3 treated as signed
1645   // Scratch: R0, R6_ARG4, R7_ARG5, VSR1, VSR2, CR0, CTR; the argument registers are modified. Emits no return.
1646   void generate_conjoint_int_copy_core(bool aligned) {
1647     // Do reverse copy.  We assume the case of actual overlap is rare enough
1648     // that we don't have to optimize it.
1649 
1650     Label l_1, l_2, l_3, l_4, l_5, l_6, l_7; // l_1 and l_2 are not used below
1651 
1652     Register tmp1 = R6_ARG4;
1653     Register tmp2 = R7_ARG5;
1654     Register tmp3 = R8_ARG6; // not used below
1655     Register tmp4 = R0;      // not used below (R0 is named directly instead)
1656 
1657     VectorSRegister tmp_vsr1  = VSR1;
1658     VectorSRegister tmp_vsr2  = VSR2;
1659 
1660     { // FasterArrayCopy
1661       __ cmpwi(CR0, R5_ARG3, 0);
1662       __ beq(CR0, l_6); // count == 0: nothing to copy
1663 
1664       __ sldi(R5_ARG3, R5_ARG3, 2);       // count -> size in bytes
1665       __ add(R3_ARG1, R3_ARG1, R5_ARG3);  // from = end of source range
1666       __ add(R4_ARG2, R4_ARG2, R5_ARG3);  // to   = end of destination range
1667       __ srdi(R5_ARG3, R5_ARG3, 2);       // back to element count
1668 
1669       if (!aligned) {
1670         // check if arrays have same alignment mod 8.
1671         __ xorr(tmp1, R3_ARG1, R4_ARG2);
1672         __ andi_(R0, tmp1, 7);
1673         // Alignment differs mod 8: the one-element pre-copy below cannot align both pointers.
1674         __ bne(CR0, l_7); // skip the pre-copy and go straight to the bulk/tail copy
1675 
1676         // copy 1 element to align to and from on an 8 byte boundary
1677         __ andi_(R0, R3_ARG1, 7);
1678         __ beq(CR0, l_7); // already 8 byte aligned: no pre-copy needed
1679 
1680         __ addi(R3_ARG1, R3_ARG1, -4);
1681         __ addi(R4_ARG2, R4_ARG2, -4);
1682         __ addi(R5_ARG3, R5_ARG3, -1); // one element less to copy afterwards
1683         __ lwzx(tmp2, R3_ARG1);
1684         __ stwx(tmp2, R4_ARG2);
1685         __ bind(l_7);
1686       }
1687 
1688       __ cmpwi(CR0, R5_ARG3, 7);
1689       __ ble(CR0, l_5); // copy 1 at a time if less than 8 elements remain
1690 
1691       __ srdi(tmp1, R5_ARG3, 3);     // number of 8-element iterations
1692       __ andi(R5_ARG3, R5_ARG3, 7);  // elements left over for the tail loop
1693       __ mtctr(tmp1);
1694 
1695       // Processor supports VSX, so use it to mass copy.
1696       // Prefetch the data into the L2 cache.
1697       __ dcbt(R3_ARG1, 0);
1698 
1699       // Set DSCR pre-fetch to deepest.
1700       if (VM_Version::has_mfdscr()) {
1701         __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
1702         __ mtdscr(tmp2);
1703       }
1704       __ li(tmp1, 16); // index register for the "+16" half of each 32 byte step
1705 
1706       // Backbranch target aligned to 32-byte. Not 16-byte align as
1707       // loop contains < 8 instructions that fit inside a single
1708       // i-cache sector.
1709       __ align(32);
1710 
1711       __ bind(l_4);
1712       // Use loop with VSX load/store instructions to
1713       // copy 8 elements a time.
1714       __ addi(R3_ARG1, R3_ARG1, -32);      // Update src-=32
1715       __ addi(R4_ARG2, R4_ARG2, -32);      // Update dsc-=32
1716       __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src+16
1717       __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load src
1718       __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst+16
1719       __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst
1720       __ bdnz(l_4);
1721 
1722       // Restore DSCR pre-fetch value.
1723       if (VM_Version::has_mfdscr()) {
1724         __ load_const_optimized(tmp2, VM_Version::_dscr_val);
1725         __ mtdscr(tmp2);
1726       }
1727 
1728       __ cmpwi(CR0, R5_ARG3, 0);
1729       __ beq(CR0, l_6); // no leftover elements: done
1730 
1731       __ bind(l_5);
1732       __ mtctr(R5_ARG3); // tail loop: one element per iteration, walking downwards
1733       __ bind(l_3);
1734       __ lwz(R0, -4, R3_ARG1);
1735       __ stw(R0, -4, R4_ARG2);
1736       __ addi(R3_ARG1, R3_ARG1, -4);
1737       __ addi(R4_ARG2, R4_ARG2, -4);
1738       __ bdnz(l_3);
1739 
1740       __ bind(l_6);
1741     }
1742   }
1743 
1744   // Generate stub for conjoint int copy.  If "aligned" is true, the
1745   // "from" and "to" addresses are assumed to be heapword aligned.
1746   //
1747   // Arguments for generated stub:
1748   //      from:  R3_ARG1
1749   //      to:    R4_ARG2
1750   //      count: R5_ARG3 treated as signed
1751   // Returns the entry address of the generated stub; the stub itself returns 0 in R3_RET.
1752   address generate_conjoint_int_copy(StubId stub_id) {
1753     bool aligned; // derived from stub_id: true only for the arrayof_ variant
1754     switch (stub_id) {
1755     case StubId::stubgen_jint_arraycopy_id:
1756       aligned = false;
1757       break;
1758     case StubId::stubgen_arrayof_jint_arraycopy_id:
1759       aligned = true;
1760       break;
1761     default: // any other stub id is not handled by this generator
1762       ShouldNotReachHere();
1763     }
1764 
1765     StubCodeMark mark(this, stub_id);
1766     address start = __ function_entry();
1767     assert_positive_int(R5_ARG3);
1768     address nooverlap_target = aligned ? // matching disjoint stub, used when the ranges do not overlap
1769       STUB_ENTRY(arrayof_jint_disjoint_arraycopy()) :
1770       STUB_ENTRY(jint_disjoint_arraycopy());
1771 
1772     array_overlap_test(nooverlap_target, 2); // 2: presumably log2 of the 4 byte element size - confirm against array_overlap_test
1773     {
1774       // UnsafeMemoryAccess page error: continue at UnsafeMemoryAccess common_error_exit
1775       UnsafeMemoryAccessMark umam(this, !aligned, false); // 2nd argument is true only for the non-aligned variant
1776       generate_conjoint_int_copy_core(aligned);
1777     }
1778 
1779     __ li(R3_RET, 0); // return 0
1780     __ blr();
1781 
1782     return start;
1783   }
1784 
1785   // Generate core code for disjoint long copy (also emitted by the disjoint
1786   // oop copy stub for uncompressed oops).  If "aligned" is true, the "from"
1787   // and "to" addresses are assumed to be heapword aligned.
1788   //
1789   // Arguments:
1790   //      from:  R3_ARG1
1791   //      to:    R4_ARG2
1792   //      count: R5_ARG3 treated as signed
1793   // Scratch: R0, R6_ARG4, R7_ARG5, VSR1, VSR2, CR0, CTR; the argument registers are modified. Emits no return.
1794   void generate_disjoint_long_copy_core(bool aligned) {
1795     Register tmp1 = R6_ARG4;
1796     Register tmp2 = R7_ARG5;
1797     Register tmp3 = R8_ARG6; // not used below
1798     Register tmp4 = R0;      // not used below (R0 is named directly instead)
1799 
1800     Label l_1, l_2, l_3, l_4, l_5; // l_4 is not used below
1801 
1802     VectorSRegister tmp_vsr1  = VSR1;
1803     VectorSRegister tmp_vsr2  = VSR2;
1804 
1805     { // FasterArrayCopy
1806       __ cmpwi(CR0, R5_ARG3, 3);
1807       __ ble(CR0, l_3); // copy 1 at a time if less than 4 elements remain
1808 
1809       __ srdi(tmp1, R5_ARG3, 2);      // number of 4-element iterations
1810       __ andi_(R5_ARG3, R5_ARG3, 3);  // elements left over for the tail loop
1811       __ mtctr(tmp1);
1812 
1813       // Processor supports VSX, so use it to mass copy.
1814 
1815       // Prefetch the data into the L2 cache.
1816       __ dcbt(R3_ARG1, 0);
1817 
1818       // Set DSCR pre-fetch to deepest.
1819       if (VM_Version::has_mfdscr()) {
1820         __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
1821         __ mtdscr(tmp2);
1822       }
1823       __ li(tmp1, 16); // index register for the "+16" half of each 32 byte step
1824 
1825       // Backbranch target aligned to 32-byte. Not 16-byte align as
1826       // loop contains < 8 instructions that fit inside a single
1827       // i-cache sector.
1828       __ align(32);
1829 
1830       __ bind(l_5);
1831       // Use loop with VSX load/store instructions to
1832       // copy 4 elements a time.
1833       __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load src
1834       __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst
1835       __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src + 16
1836       __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
1837       __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32
1838       __ addi(R4_ARG2, R4_ARG2, 32);       // Update dsc+=32
1839       __ bdnz(l_5);                        // Dec CTR and loop if not zero.
1840 
1841       // Restore DSCR pre-fetch value.
1842       if (VM_Version::has_mfdscr()) {
1843         __ load_const_optimized(tmp2, VM_Version::_dscr_val);
1844         __ mtdscr(tmp2);
1845       }
1846 
1847    } // FasterArrayCopy
1848 
1849     // copy 1 element at a time
1850     __ bind(l_3);
1851     __ cmpwi(CR0, R5_ARG3, 0);
1852     __ beq(CR0, l_1); // nothing (left) to copy
1853 
1854     { // FasterArrayCopy
1855       __ mtctr(R5_ARG3);
1856       __ addi(R3_ARG1, R3_ARG1, -8); // bias both pointers by -8 so the update-form
1857       __ addi(R4_ARG2, R4_ARG2, -8); // accesses below (offset 8) start at the current element
1858 
1859       __ bind(l_2);
1860       __ ldu(R0, 8, R3_ARG1);
1861       __ stdu(R0, 8, R4_ARG2);
1862       __ bdnz(l_2);
1863 
1864     }
1865     __ bind(l_1);
1866   }
1867 
1868   // Generate stub for disjoint long copy.  If "aligned" is true, the
1869   // "from" and "to" addresses are assumed to be heapword aligned.
1870   //
1871   // Arguments for generated stub:
1872   //      from:  R3_ARG1
1873   //      to:    R4_ARG2
1874   //      count: R5_ARG3 treated as signed
1875   // Returns the entry address of the generated stub; the stub itself returns 0 in R3_RET.
1876   address generate_disjoint_long_copy(StubId stub_id) {
1877     bool aligned; // derived from stub_id: true only for the arrayof_ variant
1878     switch (stub_id) {
1879     case StubId::stubgen_jlong_disjoint_arraycopy_id:
1880       aligned = false;
1881       break;
1882     case StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id:
1883       aligned = true;
1884       break;
1885     default: // any other stub id is not handled by this generator
1886       ShouldNotReachHere();
1887     }
1888 
1889     StubCodeMark mark(this, stub_id);
1890     address start = __ function_entry();
1891     assert_positive_int(R5_ARG3);
1892     {
1893       // UnsafeMemoryAccess page error: continue at UnsafeMemoryAccess common_error_exit
1894       UnsafeMemoryAccessMark umam(this, !aligned, false); // 2nd argument is true only for the non-aligned variant
1895       generate_disjoint_long_copy_core(aligned);
1896     }
1897     __ li(R3_RET, 0); // return 0
1898     __ blr();
1899 
1900   return start;
1901   }
1902 
1903   // Generate core code for conjoint long copy (also emitted by the conjoint
1904   // oop copy stub for uncompressed oops).  If "aligned" is true, the "from"
1905   // and "to" addresses are assumed to be heapword aligned.
1906   //
1907   // Arguments:
1908   //      from:  R3_ARG1
1909   //      to:    R4_ARG2
1910   //      count: R5_ARG3 treated as signed
1911   // Scratch: R0, R6_ARG4, R7_ARG5, VSR1, VSR2, CR0, CTR; the argument registers are modified. Emits no return.
1912   void generate_conjoint_long_copy_core(bool aligned) {
1913     Register tmp1 = R6_ARG4;
1914     Register tmp2 = R7_ARG5;
1915     Register tmp3 = R8_ARG6; // not used below
1916     Register tmp4 = R0;      // not used below (R0 is named directly instead)
1917 
1918     VectorSRegister tmp_vsr1  = VSR1;
1919     VectorSRegister tmp_vsr2  = VSR2;
1920 
1921     Label l_1, l_2, l_3, l_4, l_5; // l_2 is not used below
1922 
1923     __ cmpwi(CR0, R5_ARG3, 0);
1924     __ beq(CR0, l_1); // count == 0: nothing to copy
1925 
1926     { // FasterArrayCopy
1927       __ sldi(R5_ARG3, R5_ARG3, 3);       // count -> size in bytes
1928       __ add(R3_ARG1, R3_ARG1, R5_ARG3);  // from = end of source range
1929       __ add(R4_ARG2, R4_ARG2, R5_ARG3);  // to   = end of destination range
1930       __ srdi(R5_ARG3, R5_ARG3, 3);       // back to element count
1931 
1932       __ cmpwi(CR0, R5_ARG3, 3);
1933       __ ble(CR0, l_5); // copy 1 at a time if less than 4 elements remain
1934 
1935       __ srdi(tmp1, R5_ARG3, 2);     // number of 4-element iterations
1936       __ andi(R5_ARG3, R5_ARG3, 3);  // elements left over for the tail loop
1937       __ mtctr(tmp1);
1938 
1939       // Processor supports VSX, so use it to mass copy.
1940       // Prefetch the data into the L2 cache.
1941       __ dcbt(R3_ARG1, 0);
1942 
1943       // Set DSCR pre-fetch to deepest.
1944       if (VM_Version::has_mfdscr()) {
1945         __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
1946         __ mtdscr(tmp2);
1947       }
1948       __ li(tmp1, 16); // index register for the "+16" half of each 32 byte step
1949 
1950       // Backbranch target aligned to 32-byte. Not 16-byte align as
1951       // loop contains < 8 instructions that fit inside a single
1952       // i-cache sector.
1953       __ align(32);
1954 
1955       __ bind(l_4);
1956       // Use loop with VSX load/store instructions to
1957       // copy 4 elements a time.
1958       __ addi(R3_ARG1, R3_ARG1, -32);      // Update src-=32
1959       __ addi(R4_ARG2, R4_ARG2, -32);      // Update dsc-=32
1960       __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src+16
1961       __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load src
1962       __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst+16
1963       __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst
1964       __ bdnz(l_4);
1965 
1966       // Restore DSCR pre-fetch value.
1967       if (VM_Version::has_mfdscr()) {
1968         __ load_const_optimized(tmp2, VM_Version::_dscr_val);
1969         __ mtdscr(tmp2);
1970       }
1971 
1972       __ cmpwi(CR0, R5_ARG3, 0);
1973       __ beq(CR0, l_1); // no leftover elements: done
1974 
1975       __ bind(l_5);
1976       __ mtctr(R5_ARG3); // tail loop: one element per iteration, walking downwards
1977       __ bind(l_3);
1978       __ ld(R0, -8, R3_ARG1);
1979       __ std(R0, -8, R4_ARG2);
1980       __ addi(R3_ARG1, R3_ARG1, -8);
1981       __ addi(R4_ARG2, R4_ARG2, -8);
1982       __ bdnz(l_3);
1983 
1984     }
1985     __ bind(l_1);
1986   }
1987 
1988   // Generate stub for conjoint long copy.  If "aligned" is true, the
1989   // "from" and "to" addresses are assumed to be heapword aligned.
1990   //
1991   // Arguments for generated stub:
1992   //      from:  R3_ARG1
1993   //      to:    R4_ARG2
1994   //      count: R5_ARG3 treated as signed
1995   // Returns the entry address of the generated stub; the stub itself returns 0 in R3_RET.
1996   address generate_conjoint_long_copy(StubId stub_id) {
1997     bool aligned; // derived from stub_id: true only for the arrayof_ variant
1998     switch (stub_id) {
1999     case StubId::stubgen_jlong_arraycopy_id:
2000       aligned = false;
2001       break;
2002     case StubId::stubgen_arrayof_jlong_arraycopy_id:
2003       aligned = true;
2004       break;
2005     default: // any other stub id is not handled by this generator
2006       ShouldNotReachHere();
2007     }
2008 
2009     StubCodeMark mark(this, stub_id);
2010     address start = __ function_entry();
2011     assert_positive_int(R5_ARG3);
2012     address nooverlap_target = aligned ? // matching disjoint stub, used when the ranges do not overlap
2013       STUB_ENTRY(arrayof_jlong_disjoint_arraycopy()) :
2014       STUB_ENTRY(jlong_disjoint_arraycopy());
2015 
2016     array_overlap_test(nooverlap_target, 3); // 3: presumably log2 of the 8 byte element size - confirm against array_overlap_test
2017     {
2018       // UnsafeMemoryAccess page error: continue at UnsafeMemoryAccess common_error_exit
2019       UnsafeMemoryAccessMark umam(this, !aligned, false); // 2nd argument is true only for the non-aligned variant
2020       generate_conjoint_long_copy_core(aligned);
2021     }
2022     __ li(R3_RET, 0); // return 0
2023     __ blr();
2024 
2025     return start;
2026   }
2027 
2028   // Generate stub for conjoint oop copy.  If "aligned" is true, the
2029   // "from" and "to" addresses are assumed to be heapword aligned.
2030   // Both "aligned" and "dest_uninitialized" are derived from stub_id.
2031   // Arguments for generated stub:
2032   //      from:  R3_ARG1
2033   //      to:    R4_ARG2
2034   //      count: R5_ARG3 treated as signed
2035   //      dest_uninitialized: G1 support
2036   // Returns the entry address of the generated stub; the stub itself returns 0 in R3_RET.
2037   address generate_conjoint_oop_copy(StubId stub_id) {
2038     bool aligned;
2039     bool dest_uninitialized;
2040     switch (stub_id) {
2041     case StubId::stubgen_oop_arraycopy_id:
2042       aligned = false;
2043       dest_uninitialized = false;
2044       break;
2045     case StubId::stubgen_arrayof_oop_arraycopy_id:
2046       aligned = true;
2047       dest_uninitialized = false;
2048       break;
2049     case StubId::stubgen_oop_arraycopy_uninit_id:
2050       aligned = false;
2051       dest_uninitialized = true;
2052       break;
2053     case StubId::stubgen_arrayof_oop_arraycopy_uninit_id:
2054       aligned = true;
2055       dest_uninitialized = true;
2056       break;
2057     default: // any other stub id is not handled by this generator
2058       ShouldNotReachHere();
2059     }
2060 
2061     StubCodeMark mark(this, stub_id);
2062     address start = __ function_entry();
2063     assert_positive_int(R5_ARG3);
2064     address nooverlap_target = aligned ? // matching disjoint oop stub, used when the ranges do not overlap
2065       STUB_ENTRY(arrayof_oop_disjoint_arraycopy(dest_uninitialized)) :
2066       STUB_ENTRY(oop_disjoint_arraycopy(dest_uninitialized));
2067 
2068     array_overlap_test(nooverlap_target, UseCompressedOops ? 2 : 3); // same shift values as the int (2) and long (3) stubs
2069 
2070     DecoratorSet decorators = IN_HEAP | IS_ARRAY; // passed to the barrier prologue/epilogue below
2071     if (dest_uninitialized) {
2072       decorators |= IS_DEST_UNINITIALIZED;
2073     }
2074     if (aligned) {
2075       decorators |= ARRAYCOPY_ALIGNED;
2076     }
2077 
2078     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2079     bs->arraycopy_prologue(_masm, decorators, T_OBJECT, R3_ARG1, R4_ARG2, R5_ARG3, noreg, noreg);
2080 
2081     if (UseCompressedOops) {
2082       generate_conjoint_int_copy_core(aligned); // compressed oops are copied with the int core
2083     } else {
2084 #if INCLUDE_ZGC
2085       if (UseZGC) {
2086         ZBarrierSetAssembler *zbs = (ZBarrierSetAssembler*)bs; // with ZGC the copy loop comes from the ZGC barrier assembler
2087         zbs->generate_conjoint_oop_copy(_masm, dest_uninitialized);
2088       } else
2089 #endif
2090       generate_conjoint_long_copy_core(aligned); // uncompressed oops are copied with the long core
2091     }
2092 
2093     bs->arraycopy_epilogue(_masm, decorators, T_OBJECT, R4_ARG2, R5_ARG3, noreg);
2094     __ li(R3_RET, 0); // return 0
2095     __ blr();
2096     return start;
2097   }
2098 
2099   // Generate stub for disjoint oop copy.  If "aligned" is true, the
2100   // "from" and "to" addresses are assumed to be heapword aligned.
2101   // Both "aligned" and "dest_uninitialized" are derived from stub_id.
2102   // Arguments for generated stub:
2103   //      from:  R3_ARG1
2104   //      to:    R4_ARG2
2105   //      count: R5_ARG3 treated as signed
2106   //      dest_uninitialized: G1 support
2107   // Returns the entry address of the generated stub; the stub itself returns 0 in R3_RET.
2108   address generate_disjoint_oop_copy(StubId stub_id) {
2109     bool aligned;
2110     bool dest_uninitialized;
2111     switch (stub_id) {
2112     case StubId::stubgen_oop_disjoint_arraycopy_id:
2113       aligned = false;
2114       dest_uninitialized = false;
2115       break;
2116     case StubId::stubgen_arrayof_oop_disjoint_arraycopy_id:
2117       aligned = true;
2118       dest_uninitialized = false;
2119       break;
2120     case StubId::stubgen_oop_disjoint_arraycopy_uninit_id:
2121       aligned = false;
2122       dest_uninitialized = true;
2123       break;
2124     case StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id:
2125       aligned = true;
2126       dest_uninitialized = true;
2127       break;
2128     default: // any other stub id is not handled by this generator
2129       ShouldNotReachHere();
2130     }
2131 
2132     StubCodeMark mark(this, stub_id);
2133     address start = __ function_entry();
2134     assert_positive_int(R5_ARG3);
2135 
2136     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT; // passed to the barrier prologue/epilogue below
2137     if (dest_uninitialized) {
2138       decorators |= IS_DEST_UNINITIALIZED;
2139     }
2140     if (aligned) {
2141       decorators |= ARRAYCOPY_ALIGNED;
2142     }
2143 
2144     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2145     bs->arraycopy_prologue(_masm, decorators, T_OBJECT, R3_ARG1, R4_ARG2, R5_ARG3, noreg, noreg);
2146 
2147     if (UseCompressedOops) {
2148       generate_disjoint_int_copy_core(aligned); // compressed oops are copied with the int core
2149     } else {
2150 #if INCLUDE_ZGC
2151       if (UseZGC) {
2152         ZBarrierSetAssembler *zbs = (ZBarrierSetAssembler*)bs; // with ZGC the copy loop comes from the ZGC barrier assembler
2153         zbs->generate_disjoint_oop_copy(_masm, dest_uninitialized);
2154       } else
2155 #endif
2156       generate_disjoint_long_copy_core(aligned); // uncompressed oops are copied with the long core
2157     }
2158 
2159     bs->arraycopy_epilogue(_masm, decorators, T_OBJECT, R4_ARG2, R5_ARG3, noreg);
2160     __ li(R3_RET, 0); // return 0
2161     __ blr();
2162 
2163     return start;
2164   }
2165 
2166 
2167   // Helper for generating a dynamic type check: the emitted code branches to
2168   // L_success when the check passes and falls through on failure.  Smashes only the given temp registers.
2169   void generate_type_check(Register sub_klass,
2170                            Register super_check_offset,
2171                            Register super_klass,
2172                            Register temp1,
2173                            Register temp2,
2174                            Label& L_success) {
2175     assert_different_registers(sub_klass, super_check_offset, super_klass);
2176 
2177     BLOCK_COMMENT("type_check:");
2178 
2179     Label L_miss; // target handed to the fast path; bound at the end so it falls through
2180 
2181     __ check_klass_subtype_fast_path(sub_klass, super_klass, temp1, temp2, &L_success, &L_miss, nullptr,
2182                                      super_check_offset);
2183     __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success); // emitted directly after the fast path
2184 
2185     // Fall through on failure!
2186     __ bind(L_miss);
2187   }
2188 
2189 
2190   //  Generate stub for checked oop copy.
2191   //  Elements are copied one at a time; each non-null element is type checked before it is stored.
2192   // Arguments for generated stub:
2193   //      from:  R3
2194   //      to:    R4
2195   //      count: R5 treated as signed
2196   //      ckoff: R6 (super_check_offset)
2197   //      ckval: R7 (super_klass)
2198   //      ret:   R3 zero for success; (-1^K) where K is partial transfer count
2199   // Returns the entry address of the generated stub.
2200   address generate_checkcast_copy(StubId stub_id) {
2201     const Register R3_from   = R3_ARG1;      // source array address
2202     const Register R4_to     = R4_ARG2;      // destination array address
2203     const Register R5_count  = R5_ARG3;      // elements count
2204     const Register R6_ckoff  = R6_ARG4;      // super_check_offset
2205     const Register R7_ckval  = R7_ARG5;      // super_klass
2206 
2207     const Register R8_offset = R8_ARG6;      // loop var, with stride wordSize
2208     const Register R9_remain = R9_ARG7;      // loop var, with stride -1
2209     const Register R10_oop   = R10_ARG8;     // actual oop copied
2210     const Register R11_klass = R11_scratch1; // oop._klass
2211     const Register R12_tmp   = R12_scratch2;
2212     const Register R2_tmp    = R2;           // second temp for the type check
2213 
2214     bool dest_uninitialized; // derived from stub_id: true only for the _uninit variant
2215     switch (stub_id) {
2216     case StubId::stubgen_checkcast_arraycopy_id:
2217       dest_uninitialized = false;
2218       break;
2219     case StubId::stubgen_checkcast_arraycopy_uninit_id:
2220       dest_uninitialized = true;
2221       break;
2222     default: // any other stub id is not handled by this generator
2223       ShouldNotReachHere();
2224     }
2225     //__ align(CodeEntryAlignment);
2226     StubCodeMark mark(this, stub_id);
2227     address start = __ function_entry();
2228 
2229     // Assert that int is 64 bit sign extended and arrays are not conjoint.
2230 #ifdef ASSERT
2231     {
2232     assert_positive_int(R5_ARG3);
2233     const Register tmp1 = R11_scratch1, tmp2 = R12_scratch2;
2234     Label no_overlap;
2235     __ subf(tmp1, R3_ARG1, R4_ARG2); // distance in bytes
2236     __ sldi(tmp2, R5_ARG3, LogBytesPerHeapOop); // size in bytes
2237     __ cmpld(CR0, R3_ARG1, R4_ARG2); // Use unsigned comparison!
2238     __ cmpld(CR1, tmp1, tmp2);
2239     __ crnand(CR0, Assembler::less, CR1, Assembler::less);
2240     // Overlaps if Src before dst and distance smaller than size.
2241     // Skip the stop() below when that is not the case.
2242     __ blt(CR0, no_overlap);
2243     __ stop("overlap in checkcast_copy");
2244     __ bind(no_overlap);
2245     }
2246 #endif
2247 
2248     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST; // passed to the barrier prologue/epilogue below
2249     if (dest_uninitialized) {
2250       decorators |= IS_DEST_UNINITIALIZED;
2251     }
2252 
2253     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2254     bs->arraycopy_prologue(_masm, decorators, T_OBJECT, R3_from, R4_to, R5_count, /* preserve: */ R6_ckoff, R7_ckval);
2255 
2256     //inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr, R12_tmp, R3_RET);
2257 
2258     Label load_element, store_element, store_null, success, do_epilogue;
2259     __ or_(R9_remain, R5_count, R5_count); // Initialize loop index, and test it.
2260     __ li(R8_offset, 0);                   // Offset from start of arrays.
2261     __ bne(CR0, load_element);             // count != 0: enter the loop at its load step
2262 
2263     // Empty array: Nothing to do.
2264     __ li(R3_RET, 0);           // Return 0 on (trivial) success.
2265     __ blr();                   // returns without emitting the barrier epilogue
2266 
2267     // ======== begin loop ========
2268     // (Entry is load_element.)
2269     __ align(OptoLoopAlignment);
2270     __ bind(store_element); // reached from the type check on success
2271     if (UseCompressedOops) {
2272       __ encode_heap_oop_not_null(R10_oop);
2273       __ bind(store_null); // target handed to load_heap_oop below; bound after the encode step
2274       __ stw(R10_oop, R8_offset, R4_to);
2275     } else {
2276       __ bind(store_null); // target handed to load_heap_oop below
2277 #if INCLUDE_ZGC
2278       if (UseZGC) {
2279         __ store_heap_oop(R10_oop, R8_offset, R4_to, R11_scratch1, R12_tmp, noreg,
2280                           MacroAssembler::PRESERVATION_FRAME_LR_GP_REGS,
2281                           dest_uninitialized ? IS_DEST_UNINITIALIZED : 0);
2282       } else
2283 #endif
2284       __ std(R10_oop, R8_offset, R4_to);
2285     }
2286 
2287     __ addi(R8_offset, R8_offset, heapOopSize);   // Step to next offset.
2288     __ addic_(R9_remain, R9_remain, -1);          // Decrement the count.
2289     __ beq(CR0, success);                         // all elements copied
2290 
2291     // ======== loop entry is here ========
2292     __ bind(load_element);
2293 #if INCLUDE_ZGC
2294     if (UseZGC) {
2295       __ load_heap_oop(R10_oop, R8_offset, R3_from,
2296                        R11_scratch1, R12_tmp,
2297                        MacroAssembler::PRESERVATION_FRAME_LR_GP_REGS,
2298                        0, &store_null);
2299     } else
2300 #endif
2301     __ load_heap_oop(R10_oop, R8_offset, R3_from,
2302                      R11_scratch1, R12_tmp,
2303                      MacroAssembler::PRESERVATION_FRAME_LR_GP_REGS,
2304                      AS_RAW, &store_null);
2305 
2306     __ load_klass(R11_klass, R10_oop); // Query the object klass.
2307 
2308     generate_type_check(R11_klass, R6_ckoff, R7_ckval, R12_tmp, R2_tmp,
2309                         // Branch to this on success:
2310                         store_element);
2311     // ======== end loop ========
2312 
2313     // It was a real error; we must depend on the caller to finish the job.
2314     // Register R9_remain has number of *remaining* oops, R5_count number of *total* oops.
2315     // Emit GC store barriers for the oops we have copied (R5_count minus R9_remain),
2316     // and report their number to the caller.
2317     __ subf_(R5_count, R9_remain, R5_count); // K = copied count; also sets CR0 for the bne below
2318     __ nand(R3_RET, R5_count, R5_count);   // report (-1^K) to caller
2319     __ bne(CR0, do_epilogue);              // K != 0: run the barrier epilogue for the copied part
2320     __ blr();                              // K == 0: nothing was stored, return directly
2321 
2322     __ bind(success);
2323     __ li(R3_RET, 0);
2324 
2325     __ bind(do_epilogue);
2326     bs->arraycopy_epilogue(_masm, decorators, T_OBJECT, R4_to, R5_count, /* preserve */ R3_RET);
2327 
2328     __ blr();
2329     return start;
2330   }
2331 
2332 
2333   //  Generate 'unsafe' array copy stub.
2334   //  Though just as safe as the other stubs, it takes an unscaled
2335   //  size_t argument instead of an element count.
2336   //
2337   // Arguments for generated stub:
2338   //      from:  R3
2339   //      to:    R4
2340   //      count: R5 byte count, treated as ssize_t, can be zero
2341   //
2342   // Examines the alignment of the operands and dispatches
2343   // to a long, int, short, or byte copy loop.
2344   // The four parameters are the branch targets for those loops; returns the stub's entry address.
2345   address generate_unsafe_copy(address byte_copy_entry,
2346                                address short_copy_entry,
2347                                address int_copy_entry,
2348                                address long_copy_entry) {
2349 
2350     const Register R3_from   = R3_ARG1;      // source array address
2351     const Register R4_to     = R4_ARG2;      // destination array address
2352     const Register R5_count  = R5_ARG3;      // elements count (as long on PPC64)
2353 
2354     const Register R6_bits   = R6_ARG4;      // test copy of low bits
2355     const Register R7_tmp    = R7_ARG5;      // only referenced by the commented-out counter code below
2356 
2357     //__ align(CodeEntryAlignment);
2358     StubId stub_id = StubId::stubgen_unsafe_arraycopy_id;
2359     StubCodeMark mark(this, stub_id);
2360     address start = __ function_entry();
2361 
2362     // Bump this on entry, not on exit:
2363     //inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr, R6_bits, R7_tmp);
2364 
2365     Label short_copy, int_copy, long_copy;
2366 
2367     __ orr(R6_bits, R3_from, R4_to);        // OR of from, to and count: a low bit is clear
2368     __ orr(R6_bits, R6_bits, R5_count);     // only if it is clear in all three operands
2369     __ andi_(R0, R6_bits, (BytesPerLong-1));
2370     __ beq(CR0, long_copy);
2371 
2372     __ andi_(R0, R6_bits, (BytesPerInt-1));
2373     __ beq(CR0, int_copy);
2374 
2375     __ andi_(R0, R6_bits, (BytesPerShort-1));
2376     __ beq(CR0, short_copy);
2377 
2378     // byte_copy:
2379     __ b(byte_copy_entry); // byte count is already the element count
2380 
2381     // NOTE(review): srwi below shifts only the low 32 bits while count is documented as ssize_t - confirm larger counts cannot reach here.
2382     __ bind(short_copy);
2383     __ srwi(R5_count, R5_count, LogBytesPerShort); // bytes -> element count
2384     __ b(short_copy_entry);
2385 
2386     __ bind(int_copy);
2387     __ srwi(R5_count, R5_count, LogBytesPerInt);   // bytes -> element count
2388     __ b(int_copy_entry);
2389 
2390     __ bind(long_copy);
2391     __ srwi(R5_count, R5_count, LogBytesPerLong);  // bytes -> element count
2392     __ b(long_copy_entry);
2393 
2394     return start;
2395   }
2395 
2396 
2397   // Perform range checks on the proposed arraycopy; emitted code branches to L_failed if either check fails.
2398   // Kills the two temps, but nothing else.
2399   // Also, clean the sign bits of src_pos and dst_pos. (NOTE(review): nothing emitted below writes src_pos or dst_pos - confirm.)
2400   void arraycopy_range_checks(Register src,     // source array oop
2401                               Register src_pos, // source position
2402                               Register dst,     // destination array oop
2403                               Register dst_pos, // destination position
2404                               Register length,  // length of copy
2405                               Register temp1, Register temp2,
2406                               Label& L_failed) {
2407     BLOCK_COMMENT("arraycopy_range_checks:");
2408 
2409     const Register array_length = temp1;  // scratch
2410     const Register end_pos      = temp2;  // scratch
2411 
2412     //  if (src_pos + length > arrayOop(src)->length() ) FAIL;
2413     __ lwa(array_length, arrayOopDesc::length_offset_in_bytes(), src);
2414     __ add(end_pos, src_pos, length);  // src_pos + length
2415     __ cmpd(CR0, end_pos, array_length);
2416     __ bgt(CR0, L_failed);
2417 
2418     //  if (dst_pos + length > arrayOop(dst)->length() ) FAIL;
2419     __ lwa(array_length, arrayOopDesc::length_offset_in_bytes(), dst);
2420     __ add(end_pos, dst_pos, length);  // dst_pos + length
2421     __ cmpd(CR0, end_pos, array_length);
2422     __ bgt(CR0, L_failed);
2423 
2424     BLOCK_COMMENT("arraycopy_range_checks done");
2425   }
2426 
2427 
2428   // Helper for generate_unsafe_setmemory
2429   // Emits a fill loop for one element size; the emitted code ends with a return (blr/bclr), so it never falls through.
2430   // Atomically fill an array of memory using 1-, 2-, 4-, or 8-byte chunks and return.
2431   static void do_setmemory_atomic_loop(int elem_size, Register dest, Register size, Register byteVal,
2432                                        MacroAssembler *_masm) {
2433 
2434     Label L_Loop, L_Tail; // 2x unrolled loop
2435 
2436     // Propagate byte to required width
2437     if (elem_size > 1) __ rldimi(byteVal, byteVal,  8, 64 - 2 *  8); // 1 -> 2 bytes
2438     if (elem_size > 2) __ rldimi(byteVal, byteVal, 16, 64 - 2 * 16); // 2 -> 4 bytes
2439     if (elem_size > 4) __ rldimi(byteVal, byteVal, 32, 64 - 2 * 32); // 4 -> 8 bytes
2440 
2441     __ srwi_(R0, size, exact_log2(2 * elem_size)); // size is a 32 bit value
2442     __ beq(CR0, L_Tail); // fewer than two elements: skip the unrolled loop
2443     __ mtctr(R0);        // CTR = number of two-element iterations
2444 
2445     __ align(32); // loop alignment
2446     __ bind(L_Loop);
2447     __ store_sized_value(byteVal, 0, dest, elem_size);
2448     __ store_sized_value(byteVal, elem_size, dest, elem_size);
2449     __ addi(dest, dest, 2 * elem_size);
2450     __ bdnz(L_Loop);
2451 
2452     __ bind(L_Tail);
2453     __ andi_(R0, size, elem_size); // is there one odd element left?
2454     __ bclr(Assembler::bcondCRbiIs1, Assembler::bi0(CR0, Assembler::equal), Assembler::bhintbhBCLRisReturn); // return if not
2455     __ store_sized_value(byteVal, 0, dest, elem_size); // store the last element
2456     __ blr();
2457   }
2458 
2459   //
2460   //  Generate 'unsafe' set memory stub
2461   //  Though just as safe as the other stubs, it takes an unscaled
2462   //  size_t (# bytes) argument instead of an element count.
2463   //
2464   //  Input:
2465   //    R3_ARG1   - destination array address
2466   //    R4_ARG2   - byte count (size_t)
2467   //    R5_ARG3   - byte value
2468   //  Returns the stub's entry address. The unsafe_byte_fill parameter is not referenced in this function body.
2469   address generate_unsafe_setmemory(address unsafe_byte_fill) {
2470     __ align(CodeEntryAlignment);
2471     StubCodeMark mark(this, StubId::stubgen_unsafe_setmemory_id);
2472     address start = __ function_entry();
2473 
2474     // bump this on entry, not on exit:
2475     // inc_counter_np(SharedRuntime::_unsafe_set_memory_ctr);
2476 
2477     {
2478       Label L_fill8Bytes, L_fill4Bytes, L_fillBytes;
2479 
2480       const Register dest = R3_ARG1;
2481       const Register size = R4_ARG2;
2482       const Register byteVal = R5_ARG3;
2483       const Register rScratch1 = R6;
2484 
2485       // fill_to_memory_atomic(unsigned char*, unsigned long, unsigned char)
2486 
2487       // Check for pointer & size alignment
2488       __ orr(rScratch1, dest, size); // a low bit is clear only if it is clear in both dest and size
2489 
2490       __ andi_(R0, rScratch1, 7);
2491       __ beq(CR0, L_fill8Bytes);
2492 
2493       __ andi_(R0, rScratch1, 3);
2494       __ beq(CR0, L_fill4Bytes);
2495 
2496       __ andi_(R0, rScratch1, 1);
2497       __ bne(CR0, L_fillBytes); // odd address or size: byte-wise fill; otherwise fall through to the 2-byte fill
2498 
2499       // Mark remaining code as such which performs Unsafe accesses.
2500       UnsafeMemoryAccessMark umam(this, true, false);
2501 
2502       // At this point, we know the lowest bit of dest|size is zero, i.e. both are
2503       // multiples of 2
2504       do_setmemory_atomic_loop(2, dest, size, byteVal, _masm); // each emitted loop ends with a return, so the loops do not run into each other
2505 
2506       __ align(32);
2507       __ bind(L_fill8Bytes);
2508       // At this point, we know the lower 3 bits of dest|size are zero, i.e. both are
2509       // multiples of 8
2510       do_setmemory_atomic_loop(8, dest, size, byteVal, _masm);
2511 
2512       __ align(32);
2513       __ bind(L_fill4Bytes);
2514       // At this point, we know the lower 2 bits of dest|size are zero, i.e. both are
2515       // multiples of 4
2516       do_setmemory_atomic_loop(4, dest, size, byteVal, _masm);
2517 
2518       __ align(32);
2519       __ bind(L_fillBytes);
2520       do_setmemory_atomic_loop(1, dest, size, byteVal, _masm);
2521     }
2522 
2523     return start;
2524   }
2525 
2526 
2527   //
2528   //  Generate the generic array copy stub. It validates its arguments and
2529   //  then branches to the typed copy stub (entry_* address) that applies.
2530   //  Input:
2531   //    R3    -  src oop
2532   //    R4    -  src_pos
2533   //    R5    -  dst oop
2534   //    R6    -  dst_pos
2535   //    R7    -  element count
2536   //
2537   //  Output:
2538   //    R3 ==  0  -  success (not set here; presumably by the stub branched to)
2539   //    R3 == -1  -  need to call System.arraycopy
2540   //
2541   address generate_generic_copy(address entry_jbyte_arraycopy,
2542                                 address entry_jshort_arraycopy,
2543                                 address entry_jint_arraycopy,
2544                                 address entry_oop_arraycopy,
2545                                 address entry_disjoint_oop_arraycopy,
2546                                 address entry_jlong_arraycopy,
2547                                 address entry_checkcast_arraycopy) {
2548     Label L_failed, L_objArray;
2549
2550     // Input registers
2551     const Register src       = R3_ARG1;  // source array oop
2552     const Register src_pos   = R4_ARG2;  // source position
2553     const Register dst       = R5_ARG3;  // destination array oop
2554     const Register dst_pos   = R6_ARG4;  // destination position
2555     const Register length    = R7_ARG5;  // elements count
2556
2557     // registers used as temp
2558     const Register src_klass = R8_ARG6;  // source array klass
2559     const Register dst_klass = R9_ARG7;  // destination array klass
2560     const Register lh        = R10_ARG8; // layout handler
2561     const Register temp      = R2;
2562
2563     //__ align(CodeEntryAlignment);
2564     StubId stub_id = StubId::stubgen_generic_arraycopy_id;
2565     StubCodeMark mark(this, stub_id);
2566     address start = __ function_entry();
2567
2568     // Bump this on entry, not on exit:
2569     //inc_counter_np(SharedRuntime::_generic_array_copy_ctr, lh, temp);
2570
2571     // In principle, the int arguments could be dirty (see the extsw_ below).
2572
2573     //-----------------------------------------------------------------------
2574     // Assembler stubs will be used for this call to arraycopy
2575     // if the following conditions are met:
2576     //
2577     // (1) src and dst must not be null.
2578     // (2) src_pos must not be negative.
2579     // (3) dst_pos must not be negative.
2580     // (4) length  must not be negative.
2581     // (5) src klass and dst klass should be the same and not null.
2582     // (6) src and dst should be arrays.
2583     // (7) src_pos + length must not exceed length of src.
2584     // (8) dst_pos + length must not exceed length of dst.
2585     BLOCK_COMMENT("arraycopy initial argument checks");
2586     // All failure conditions are collected in CR1.eq (via cror), so one beq tests them all.
2587     __ cmpdi(CR1, src, 0);      // if (src == nullptr) return -1;
2588     __ extsw_(src_pos, src_pos); // if (src_pos < 0) return -1;
2589     __ cmpdi(CR5, dst, 0);      // if (dst == nullptr) return -1;
2590     __ cror(CR1, Assembler::equal, CR0, Assembler::less);
2591     __ extsw_(dst_pos, dst_pos); // if (dst_pos < 0) return -1;
2592     __ cror(CR5, Assembler::equal, CR0, Assembler::less);
2593     __ extsw_(length, length);   // if (length < 0) return -1;
2594     __ cror(CR1, Assembler::equal, CR5, Assembler::equal);
2595     __ cror(CR1, Assembler::equal, CR0, Assembler::less);
2596     __ beq(CR1, L_failed);
2597
2598     BLOCK_COMMENT("arraycopy argument klass checks");
2599     __ load_klass(src_klass, src);
2600     __ load_klass(dst_klass, dst);
2601
2602     // Load layout helper
2603     //
2604     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2605     // 32        30    24            16              8     2                 0
2606     //
2607     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2608     //
2609
2610     int lh_offset = in_bytes(Klass::layout_helper_offset());
2611
2612     // Load the 32-bit layout helper of the source klass.
2613     __ lwz(lh, lh_offset, src_klass);
2614
2615     // Handle objArrays completely differently...
2616     jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2617     __ load_const_optimized(temp, objArray_lh, R0);
2618     __ cmpw(CR0, lh, temp);
2619     __ beq(CR0, L_objArray);
2620
2621     __ cmpd(CR5, src_klass, dst_klass);          // if (src->klass() != dst->klass()) return -1;
2622     __ bne(CR5, L_failed);
2623
2624     // Check for flat inline type array -> return -1
2625     __ test_flat_array_oop(src, temp, L_failed);
2626
2627     // Check for null-free (non-flat) inline type array -> handle as object array
2628     __ test_null_free_array_oop(src, temp, L_objArray);
2629
2630     __ cmpwi(CR6, lh, Klass::_lh_neutral_value); // if (!src->is_Array()) return -1;
2631     __ bge(CR6, L_failed);
2632
2633     // At this point, it is known to be a typeArray (array_tag 0x3).
2634 #ifdef ASSERT
2635     { Label L;
2636       jint lh_prim_tag_in_place = (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2637       __ load_const_optimized(temp, lh_prim_tag_in_place, R0);
2638       __ cmpw(CR0, lh, temp);
2639       __ bge(CR0, L);
2640       __ stop("must be a primitive array");
2641       __ bind(L);
2642     }
2643 #endif
2644
2645     arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
2646                            temp, dst_klass, L_failed);
2647
2648     // TypeArrayKlass
2649     //
2650     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2651     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2652     //
2653     // Both klass registers are recycled: the klasses are not referenced again on this path.
2654     const Register offset = dst_klass;    // array offset
2655     const Register elsize = src_klass;    // log2 element size
2656
2657     __ rldicl(offset, lh, 64 - Klass::_lh_header_size_shift, 64 - exact_log2(Klass::_lh_header_size_mask + 1));
2658     __ andi(elsize, lh, Klass::_lh_log2_element_size_mask);
2659     __ add(src, offset, src);       // src array offset
2660     __ add(dst, offset, dst);       // dst array offset
2661
2662     // Next registers should be set before the jump to corresponding stub.
2663     const Register from     = R3_ARG1;  // source array address
2664     const Register to       = R4_ARG2;  // destination array address
2665     const Register count    = R5_ARG3;  // elements count
2666
2667     // 'from', 'to', 'count' registers should be set in this order
2668     // since they are the same as 'src', 'src_pos', 'dst'.
2669
2670     BLOCK_COMMENT("scale indexes to element size");
2671     __ sld(src_pos, src_pos, elsize);
2672     __ sld(dst_pos, dst_pos, elsize);
2673     __ add(from, src_pos, src);  // src_addr
2674     __ add(to, dst_pos, dst);    // dst_addr
2675     __ mr(count, length);        // length
2676
2677     BLOCK_COMMENT("choose copy loop based on element size");
2678     // Using conditional branches with range 32kB.
2679     const int bo = Assembler::bcondCRbiIs1, bi = Assembler::bi0(CR0, Assembler::equal);
2680     __ cmpwi(CR0, elsize, 0);
2681     __ bc(bo, bi, entry_jbyte_arraycopy);
2682     __ cmpwi(CR0, elsize, LogBytesPerShort);
2683     __ bc(bo, bi, entry_jshort_arraycopy);
2684     __ cmpwi(CR0, elsize, LogBytesPerInt);
2685     __ bc(bo, bi, entry_jint_arraycopy);
2686 #ifdef ASSERT
2687     { Label L;
2688       __ cmpwi(CR0, elsize, LogBytesPerLong);
2689       __ beq(CR0, L);
2690       __ stop("must be long copy, but elsize is wrong");
2691       __ bind(L);
2692     }
2693 #endif
2694     __ b(entry_jlong_arraycopy);
2695
2696     // ObjArrayKlass
2697   __ bind(L_objArray);
2698     // live at this point:  src_klass, dst_klass, src[_pos], dst[_pos], length
2699
2700     Label L_disjoint_plain_copy, L_checkcast_copy;
2701     //  test array classes for subtyping
2702     __ cmpd(CR0, src_klass, dst_klass);         // usual case is exact equality
2703     __ bne(CR0, L_checkcast_copy);
2704
2705     // Identically typed arrays can be copied without element-wise checks.
2706     arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
2707                            temp, lh, L_failed);
2708
2709     __ addi(src, src, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //src offset
2710     __ addi(dst, dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //dst offset
2711     __ sldi(src_pos, src_pos, LogBytesPerHeapOop);
2712     __ sldi(dst_pos, dst_pos, LogBytesPerHeapOop);
2713     __ add(from, src_pos, src);  // src_addr
2714     __ add(to, dst_pos, dst);    // dst_addr
2715     __ mr(count, length);        // length
2716     __ b(entry_oop_arraycopy);
2717
2718   __ bind(L_checkcast_copy);
2719     // live at this point:  src_klass, dst_klass
2720     {
2721       // Before looking at dst.length, make sure dst is also an objArray.
2722       __ lwz(temp, lh_offset, dst_klass);
2723       __ cmpw(CR0, lh, temp);
2724       __ bne(CR0, L_failed);
2725
2726       // It is safe to examine both src.length and dst.length.
2727       arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
2728                              temp, lh, L_failed);
2729
2730       // Marshal the base address arguments now, freeing registers.
2731       __ addi(src, src, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //src offset
2732       __ addi(dst, dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //dst offset
2733       __ sldi(src_pos, src_pos, LogBytesPerHeapOop);
2734       __ sldi(dst_pos, dst_pos, LogBytesPerHeapOop);
2735       __ add(from, src_pos, src);  // src_addr
2736       __ add(to, dst_pos, dst);    // dst_addr
2737       __ mr(count, length);        // length
2738
2739       Register sco_temp = R6_ARG4;             // This register is free now.
2740       assert_different_registers(from, to, count, sco_temp,
2741                                  dst_klass, src_klass);
2742
2743       // Generate the type check.
2744       int sco_offset = in_bytes(Klass::super_check_offset_offset());
2745       __ lwz(sco_temp, sco_offset, dst_klass);
2746       generate_type_check(src_klass, sco_temp, dst_klass,
2747                           temp, /* temp */ R10_ARG8, L_disjoint_plain_copy);
2748
2749       // Fetch destination element klass from the ObjArrayKlass header.
2750       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2751
2752       // The checkcast_copy loop needs two extra arguments:
2753       __ ld(R7_ARG5, ek_offset, dst_klass);   // dest elem klass
2754       __ lwz(R6_ARG4, sco_offset, R7_ARG5);   // sco of elem klass
2755       __ b(entry_checkcast_arraycopy);
2756     }
2757
2758     __ bind(L_disjoint_plain_copy); // label handed to generate_type_check() above; presumably its success exit
2759     __ b(entry_disjoint_oop_arraycopy);
2760
2761   __ bind(L_failed);
2762     __ li(R3_RET, -1); // return -1
2763     __ blr();
2764     return start;
2765   }
2766 
2767   // Encrypts one 16-byte AES block. Arguments for generated stub:
2768   //   R3_ARG1   - source byte array address
2769   //   R4_ARG2   - destination byte array address
2770   //   R5_ARG3   - round key array (ints; its length 44, 52 or 60 selects the number of rounds)
2771   address generate_aescrypt_encryptBlock() {
2772     assert(UseAES, "need AES instructions and misaligned SSE support");
2773     StubId stub_id = StubId::stubgen_aescrypt_encryptBlock_id;
2774     StubCodeMark mark(this, stub_id);
2775
2776     address start = __ function_entry();
2777
2778     Label L_doLast, L_error;
2779
2780     Register from           = R3_ARG1;  // source array address
2781     Register to             = R4_ARG2;  // destination array address
2782     Register key            = R5_ARG3;  // round key array
2783
2784     Register keylen         = R8;
2785     Register temp           = R9;
2786     Register keypos         = R10;
2787     Register fifteen        = R12;
2788
2789     VectorRegister vRet     = VR0;
2790
2791     VectorRegister vKey1    = VR1;
2792     VectorRegister vKey2    = VR2;
2793     VectorRegister vKey3    = VR3;
2794     VectorRegister vKey4    = VR4;
2795
2796     VectorRegister fromPerm = VR5;
2797     VectorRegister keyPerm  = VR6;
2798     VectorRegister toPerm   = VR7;
2799     VectorRegister fSplt    = VR8;
2800
2801     VectorRegister vTmp1    = VR9;
2802     VectorRegister vTmp2    = VR10;
2803     VectorRegister vTmp3    = VR11; // not referenced in this stub
2804     VectorRegister vTmp4    = VR12; // not referenced in this stub
2805
2806     __ li              (fifteen, 15);
2807
2808     // load unaligned from[0-15] to vRet
2809     __ lvx             (vRet, from);
2810     __ lvx             (vTmp1, fifteen, from);
2811     __ lvsl            (fromPerm, from);
2812 #ifdef VM_LITTLE_ENDIAN
2813     __ vspltisb        (fSplt, 0x0f);
2814     __ vxor            (fromPerm, fromPerm, fSplt);
2815 #endif
2816     __ vperm           (vRet, vRet, vTmp1, fromPerm);
2817
2818     // load keylen (44 or 52 or 60) from the array header; 'key' points at element 0 of the int array
2819     __ lwz             (keylen, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT), key);
2820
2821     // keyPerm: permute control for the vec_perm calls that assemble the round keys below
2822     __ load_perm       (keyPerm, key);
2823 #ifdef VM_LITTLE_ENDIAN
2824     __ vspltisb        (vTmp2, -16);
2825     __ vrld            (keyPerm, keyPerm, vTmp2);
2826     __ vrld            (keyPerm, keyPerm, vTmp2);
2827     __ vsldoi          (keyPerm, keyPerm, keyPerm, 8);
2828 #endif
2829
2830     // load the 1st round key to vTmp1
2831     __ lvx             (vTmp1, key);
2832     __ li              (keypos, 16);
2833     __ lvx             (vKey1, keypos, key);
2834     __ vec_perm        (vTmp1, vKey1, keyPerm);
2835
2836     // 1st round
2837     __ vxor            (vRet, vRet, vTmp1);
2838
2839     // load the 2nd round key to vKey1
2840     __ li              (keypos, 32);
2841     __ lvx             (vKey2, keypos, key);
2842     __ vec_perm        (vKey1, vKey2, keyPerm);
2843
2844     // load the 3rd round key to vKey2
2845     __ li              (keypos, 48);
2846     __ lvx             (vKey3, keypos, key);
2847     __ vec_perm        (vKey2, vKey3, keyPerm);
2848
2849     // load the 4th round key to vKey3
2850     __ li              (keypos, 64);
2851     __ lvx             (vKey4, keypos, key);
2852     __ vec_perm        (vKey3, vKey4, keyPerm);
2853
2854     // load the 5th round key to vKey4
2855     __ li              (keypos, 80);
2856     __ lvx             (vTmp1, keypos, key);
2857     __ vec_perm        (vKey4, vTmp1, keyPerm);
2858
2859     // 2nd - 5th rounds
2860     __ vcipher         (vRet, vRet, vKey1);
2861     __ vcipher         (vRet, vRet, vKey2);
2862     __ vcipher         (vRet, vRet, vKey3);
2863     __ vcipher         (vRet, vRet, vKey4);
2864
2865     // load the 6th round key to vKey1 (vTmp1 still holds the quadword loaded from key + 80)
2866     __ li              (keypos, 96);
2867     __ lvx             (vKey2, keypos, key);
2868     __ vec_perm        (vKey1, vTmp1, vKey2, keyPerm);
2869
2870     // load the 7th round key to vKey2
2871     __ li              (keypos, 112);
2872     __ lvx             (vKey3, keypos, key);
2873     __ vec_perm        (vKey2, vKey3, keyPerm);
2874
2875     // load the 8th round key to vKey3
2876     __ li              (keypos, 128);
2877     __ lvx             (vKey4, keypos, key);
2878     __ vec_perm        (vKey3, vKey4, keyPerm);
2879
2880     // load the 9th round key to vKey4
2881     __ li              (keypos, 144);
2882     __ lvx             (vTmp1, keypos, key);
2883     __ vec_perm        (vKey4, vTmp1, keyPerm);
2884
2885     // 6th - 9th rounds
2886     __ vcipher         (vRet, vRet, vKey1);
2887     __ vcipher         (vRet, vRet, vKey2);
2888     __ vcipher         (vRet, vRet, vKey3);
2889     __ vcipher         (vRet, vRet, vKey4);
2890
2891     // load the 10th round key to vKey1
2892     __ li              (keypos, 160);
2893     __ lvx             (vKey2, keypos, key);
2894     __ vec_perm        (vKey1, vTmp1, vKey2, keyPerm);
2895
2896     // load the 11th round key to vKey2
2897     __ li              (keypos, 176);
2898     __ lvx             (vTmp1, keypos, key);
2899     __ vec_perm        (vKey2, vTmp1, keyPerm);
2900
2901     // keylen == 44: vKey1/vKey2 already hold the final two round keys, finish at L_doLast
2902     __ cmpwi           (CR0, keylen, 44);
2903     __ beq             (CR0, L_doLast);
2904
2905     // 10th - 11th rounds
2906     __ vcipher         (vRet, vRet, vKey1);
2907     __ vcipher         (vRet, vRet, vKey2);
2908
2909     // load the 12th round key to vKey1
2910     __ li              (keypos, 192);
2911     __ lvx             (vKey2, keypos, key);
2912     __ vec_perm        (vKey1, vTmp1, vKey2, keyPerm);
2913
2914     // load the 13th round key to vKey2
2915     __ li              (keypos, 208);
2916     __ lvx             (vTmp1, keypos, key);
2917     __ vec_perm        (vKey2, vTmp1, keyPerm);
2918
2919     // if all round keys are loaded, skip next 2 rounds
2920     __ cmpwi           (CR0, keylen, 52);
2921     __ beq             (CR0, L_doLast);
2922
2923 #ifdef ASSERT
2924     __ cmpwi           (CR0, keylen, 60);
2925     __ bne             (CR0, L_error);
2926 #endif
2927
2928     // 12th - 13th rounds
2929     __ vcipher         (vRet, vRet, vKey1);
2930     __ vcipher         (vRet, vRet, vKey2);
2931
2932     // load the 14th round key to vKey1
2933     __ li              (keypos, 224);
2934     __ lvx             (vKey2, keypos, key);
2935     __ vec_perm        (vKey1, vTmp1, vKey2, keyPerm);
2936
2937     // load the 15th round key to vKey2
2938     __ li              (keypos, 240);
2939     __ lvx             (vTmp1, keypos, key);
2940     __ vec_perm        (vKey2, vTmp1, keyPerm);
2941
2942     __ bind(L_doLast);
2943
2944     // last two rounds
2945     __ vcipher         (vRet, vRet, vKey1);
2946     __ vcipherlast     (vRet, vRet, vKey2);
2947
2948 #ifdef VM_LITTLE_ENDIAN
2949     // toPerm = 0x0F0E0D0C0B0A09080706050403020100
2950     __ lvsl            (toPerm, keypos); // keypos is a multiple of 16
2951     __ vxor            (toPerm, toPerm, fSplt);
2952
2953     // Swap Bytes
2954     __ vperm           (vRet, vRet, vRet, toPerm);
2955 #endif
2956
2957     // store result (unaligned)
2958     // Note: We can't use a read-modify-write sequence which touches additional Bytes.
2959     Register lo = temp, hi = fifteen; // Reuse
2960     __ vsldoi          (vTmp1, vRet, vRet, 8);
2961     __ mfvrd           (hi, vRet);
2962     __ mfvrd           (lo, vTmp1);
2963     __ std             (hi, 0 LITTLE_ENDIAN_ONLY(+ 8), to);
2964     __ std             (lo, 0 BIG_ENDIAN_ONLY(+ 8), to);
2965
2966     __ blr();
2967
2968 #ifdef ASSERT
2969     __ bind(L_error);
2970     __ stop("aescrypt_encryptBlock: invalid key length");
2971 #endif
2972      return start;
2973   }
2974 
2975   // Decrypts one 16-byte AES block. Arguments for generated stub:
2976   //   R3_ARG1   - source byte array address
2977   //   R4_ARG2   - destination byte array address
2978   //   R5_ARG3   - sessionKe (key) in little endian int array
2979   address generate_aescrypt_decryptBlock() {
2980     assert(UseAES, "need AES instructions and misaligned SSE support");
2981     StubId stub_id = StubId::stubgen_aescrypt_decryptBlock_id;
2982     StubCodeMark mark(this, stub_id);
2983
2984     address start = __ function_entry();
2985
2986     Label L_doLast, L_do44, L_do52, L_error;
2987
2988     Register from           = R3_ARG1;  // source array address
2989     Register to             = R4_ARG2;  // destination array address
2990     Register key            = R5_ARG3;  // round key array
2991
2992     Register keylen         = R8;
2993     Register temp           = R9;
2994     Register keypos         = R10;
2995     Register fifteen        = R12;
2996
2997     VectorRegister vRet     = VR0;
2998
2999     VectorRegister vKey1    = VR1;
3000     VectorRegister vKey2    = VR2;
3001     VectorRegister vKey3    = VR3;
3002     VectorRegister vKey4    = VR4;
3003     VectorRegister vKey5    = VR5;
3004
3005     VectorRegister fromPerm = VR6;
3006     VectorRegister keyPerm  = VR7;
3007     VectorRegister toPerm   = VR8;
3008     VectorRegister fSplt    = VR9;
3009
3010     VectorRegister vTmp1    = VR10;
3011     VectorRegister vTmp2    = VR11;
3012     VectorRegister vTmp3    = VR12; // not referenced in this stub
3013     VectorRegister vTmp4    = VR13; // not referenced in this stub
3014
3015     __ li              (fifteen, 15);
3016
3017     // load unaligned from[0-15] to vRet
3018     __ lvx             (vRet, from);
3019     __ lvx             (vTmp1, fifteen, from);
3020     __ lvsl            (fromPerm, from);
3021 #ifdef VM_LITTLE_ENDIAN
3022     __ vspltisb        (fSplt, 0x0f);
3023     __ vxor            (fromPerm, fromPerm, fSplt);
3024 #endif
3025     __ vperm           (vRet, vRet, vTmp1, fromPerm); // align [and byte swap in LE]
3026
3027     // load keylen (44 or 52 or 60) from the array header; 'key' points at element 0 of the int array
3028     __ lwz             (keylen, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT), key);
3029
3030     // keyPerm: permute control for the vec_perm calls that assemble the round keys below
3031     __ load_perm       (keyPerm, key);
3032 #ifdef VM_LITTLE_ENDIAN
3033     __ vxor            (vTmp2, vTmp2, vTmp2); // NOTE(review): looks redundant, vTmp2 is overwritten next (encrypt stub omits this) - confirm
3034     __ vspltisb        (vTmp2, -16);
3035     __ vrld            (keyPerm, keyPerm, vTmp2);
3036     __ vrld            (keyPerm, keyPerm, vTmp2);
3037     __ vsldoi          (keyPerm, keyPerm, keyPerm, 8);
3038 #endif
3039     // Dispatch on the key length; the code falling through handles keylen == 60.
3040     __ cmpwi           (CR0, keylen, 44);
3041     __ beq             (CR0, L_do44);
3042
3043     __ cmpwi           (CR0, keylen, 52);
3044     __ beq             (CR0, L_do52);
3045
3046 #ifdef ASSERT
3047     __ cmpwi           (CR0, keylen, 60);
3048     __ bne             (CR0, L_error);
3049 #endif
3050
3051     // load the 15th round key to vKey1
3052     __ li              (keypos, 240);
3053     __ lvx             (vKey1, keypos, key);
3054     __ li              (keypos, 224);
3055     __ lvx             (vKey2, keypos, key);
3056     __ vec_perm        (vKey1, vKey2, vKey1, keyPerm);
3057
3058     // load the 14th round key to vKey2
3059     __ li              (keypos, 208);
3060     __ lvx             (vKey3, keypos, key);
3061     __ vec_perm        (vKey2, vKey3, vKey2, keyPerm);
3062
3063     // load the 13th round key to vKey3
3064     __ li              (keypos, 192);
3065     __ lvx             (vKey4, keypos, key);
3066     __ vec_perm        (vKey3, vKey4, vKey3, keyPerm);
3067
3068     // load the 12th round key to vKey4
3069     __ li              (keypos, 176);
3070     __ lvx             (vKey5, keypos, key);
3071     __ vec_perm        (vKey4, vKey5, vKey4, keyPerm);
3072
3073     // load the 11th round key to vKey5
3074     __ li              (keypos, 160);
3075     __ lvx             (vTmp1, keypos, key);
3076     __ vec_perm        (vKey5, vTmp1, vKey5, keyPerm);
3077
3078     // 1st - 5th rounds
3079     __ vxor            (vRet, vRet, vKey1);
3080     __ vncipher        (vRet, vRet, vKey2);
3081     __ vncipher        (vRet, vRet, vKey3);
3082     __ vncipher        (vRet, vRet, vKey4);
3083     __ vncipher        (vRet, vRet, vKey5);
3084
3085     __ b               (L_doLast);
3086
3087     __ align(32);
3088     __ bind            (L_do52);
3089
3090     // load the 13th round key to vKey1
3091     __ li              (keypos, 208);
3092     __ lvx             (vKey1, keypos, key);
3093     __ li              (keypos, 192);
3094     __ lvx             (vKey2, keypos, key);
3095     __ vec_perm        (vKey1, vKey2, vKey1, keyPerm);
3096
3097     // load the 12th round key to vKey2
3098     __ li              (keypos, 176);
3099     __ lvx             (vKey3, keypos, key);
3100     __ vec_perm        (vKey2, vKey3, vKey2, keyPerm);
3101
3102     // load the 11th round key to vKey3
3103     __ li              (keypos, 160);
3104     __ lvx             (vTmp1, keypos, key);
3105     __ vec_perm        (vKey3, vTmp1, vKey3, keyPerm);
3106
3107     // 1st - 3rd rounds
3108     __ vxor            (vRet, vRet, vKey1);
3109     __ vncipher        (vRet, vRet, vKey2);
3110     __ vncipher        (vRet, vRet, vKey3);
3111
3112     __ b               (L_doLast);
3113
3114     __ align(32);
3115     __ bind            (L_do44);
3116
3117     // load the 11th round key to vKey1
3118     __ li              (keypos, 176);
3119     __ lvx             (vKey1, keypos, key);
3120     __ li              (keypos, 160);
3121     __ lvx             (vTmp1, keypos, key);
3122     __ vec_perm        (vKey1, vTmp1, vKey1, keyPerm);
3123
3124     // 1st round
3125     __ vxor            (vRet, vRet, vKey1);
3126
3127     __ bind            (L_doLast);
3128     // On all three paths reaching here vTmp1 holds the quadword loaded from key + 160.
3129     // load the 10th round key to vKey1
3130     __ li              (keypos, 144);
3131     __ lvx             (vKey2, keypos, key);
3132     __ vec_perm        (vKey1, vKey2, vTmp1, keyPerm);
3133
3134     // load the 9th round key to vKey2
3135     __ li              (keypos, 128);
3136     __ lvx             (vKey3, keypos, key);
3137     __ vec_perm        (vKey2, vKey3, vKey2, keyPerm);
3138
3139     // load the 8th round key to vKey3
3140     __ li              (keypos, 112);
3141     __ lvx             (vKey4, keypos, key);
3142     __ vec_perm        (vKey3, vKey4, vKey3, keyPerm);
3143
3144     // load the 7th round key to vKey4
3145     __ li              (keypos, 96);
3146     __ lvx             (vKey5, keypos, key);
3147     __ vec_perm        (vKey4, vKey5, vKey4, keyPerm);
3148
3149     // load the 6th round key to vKey5
3150     __ li              (keypos, 80);
3151     __ lvx             (vTmp1, keypos, key);
3152     __ vec_perm        (vKey5, vTmp1, vKey5, keyPerm);
3153
3154     // last 10th - 6th rounds
3155     __ vncipher        (vRet, vRet, vKey1);
3156     __ vncipher        (vRet, vRet, vKey2);
3157     __ vncipher        (vRet, vRet, vKey3);
3158     __ vncipher        (vRet, vRet, vKey4);
3159     __ vncipher        (vRet, vRet, vKey5);
3160
3161     // load the 5th round key to vKey1 (vTmp1 holds the quadword loaded from key + 80)
3162     __ li              (keypos, 64);
3163     __ lvx             (vKey2, keypos, key);
3164     __ vec_perm        (vKey1, vKey2, vTmp1, keyPerm);
3165
3166     // load the 4th round key to vKey2
3167     __ li              (keypos, 48);
3168     __ lvx             (vKey3, keypos, key);
3169     __ vec_perm        (vKey2, vKey3, vKey2, keyPerm);
3170
3171     // load the 3rd round key to vKey3
3172     __ li              (keypos, 32);
3173     __ lvx             (vKey4, keypos, key);
3174     __ vec_perm        (vKey3, vKey4, vKey3, keyPerm);
3175
3176     // load the 2nd round key to vKey4
3177     __ li              (keypos, 16);
3178     __ lvx             (vKey5, keypos, key);
3179     __ vec_perm        (vKey4, vKey5, vKey4, keyPerm);
3180
3181     // load the 1st round key to vKey5
3182     __ lvx             (vTmp1, key);
3183     __ vec_perm        (vKey5, vTmp1, vKey5, keyPerm);
3184
3185     // last 5th - 1st rounds
3186     __ vncipher        (vRet, vRet, vKey1);
3187     __ vncipher        (vRet, vRet, vKey2);
3188     __ vncipher        (vRet, vRet, vKey3);
3189     __ vncipher        (vRet, vRet, vKey4);
3190     __ vncipherlast    (vRet, vRet, vKey5);
3191
3192 #ifdef VM_LITTLE_ENDIAN
3193     // toPerm = 0x0F0E0D0C0B0A09080706050403020100
3194     __ lvsl            (toPerm, keypos); // keypos is a multiple of 16
3195     __ vxor            (toPerm, toPerm, fSplt);
3196
3197     // Swap Bytes
3198     __ vperm           (vRet, vRet, vRet, toPerm);
3199 #endif
3200
3201     // store result (unaligned)
3202     // Note: We can't use a read-modify-write sequence which touches additional Bytes.
3203     Register lo = temp, hi = fifteen; // Reuse
3204     __ vsldoi          (vTmp1, vRet, vRet, 8);
3205     __ mfvrd           (hi, vRet);
3206     __ mfvrd           (lo, vTmp1);
3207     __ std             (hi, 0 LITTLE_ENDIAN_ONLY(+ 8), to);
3208     __ std             (lo, 0 BIG_ENDIAN_ONLY(+ 8), to);
3209
3210     __ blr();
3211
3212 #ifdef ASSERT
3213     __ bind(L_error);
3214     __ stop("aescrypt_decryptBlock: invalid key length");
3215 #endif
3216      return start;
3217   }
3218 
3219   address generate_sha256_implCompress(StubId stub_id) {
3220     // Generates the SHA-256 compress stub. stub_id selects the single-block
3221     // or the multi-block (MB) flavor; any other id hits ShouldNotReachHere().
3222     assert(UseSHA, "need SHA instructions");
3223
3224     const bool is_single_block = (stub_id == StubId::stubgen_sha256_implCompress_id);
3225     const bool is_multi_block  = (stub_id == StubId::stubgen_sha256_implCompressMB_id);
3226     if (!is_single_block && !is_multi_block) {
3227       ShouldNotReachHere();
3228     }
3229
3230     StubCodeMark mark(this, stub_id);
3231     address entry = __ function_entry();
3232
3233     // The hashing code itself comes from sha256(); this stub only appends
3234     // the return.
3235     __ sha256 (is_multi_block);
3236     __ blr();
3237
3238     return entry;
3239   }
3240 
3241   address generate_sha512_implCompress(StubId stub_id) {
3242     // Generates the SHA-512 compress stub. stub_id selects the single-block
3243     // or the multi-block (MB) flavor; any other id hits ShouldNotReachHere().
3244     assert(UseSHA, "need SHA instructions");
3245
3246     const bool is_single_block = (stub_id == StubId::stubgen_sha512_implCompress_id);
3247     const bool is_multi_block  = (stub_id == StubId::stubgen_sha512_implCompressMB_id);
3248     if (!is_single_block && !is_multi_block) {
3249       ShouldNotReachHere();
3250     }
3251
3252     StubCodeMark mark(this, stub_id);
3253     address entry = __ function_entry();
3254
3255     // The hashing code itself comes from sha512(); this stub only appends
3256     // the return.
3257     __ sha512 (is_multi_block);
3258     __ blr();
3259
3260     return entry;
3261   }
3262 
3263   address generate_data_cache_writeback() {
3264     // Stub that takes a cache line address in R3_ARG1, emits cache_wb()
3265     // for that address and returns.
3266     StubCodeMark mark(this, StubId::stubgen_data_cache_writeback_id);
3267     address entry = __ pc();
3268
3269     const Register line_addr = R3_ARG1;
3270     __ cache_wb(Address(line_addr));
3271     __ blr();
3272     return entry;
3273   }
3274 
3275   address generate_data_cache_writeback_sync() {
3276     // R3_ARG1 carries the pre/post flag (bit 0 set => pre sync). R4 merely
3277     // receives the masked flag; the branch below tests CR0.
3278     Label L_done;
3279     StubCodeMark mark(this, StubId::stubgen_data_cache_writeback_sync_id);
3280     address entry = __ pc();
3281
3282     const Register presync_flag = R3_ARG1;
3283     const Register masked_flag  = R4;
3284     __ andi_(masked_flag, presync_flag, 1);
3285     __ bne(CR0, L_done);      // pre sync => emit nothing
3286     __ cache_wbsync(false);   // post sync => emit 'sync'
3287     __ bind(L_done);
3288     __ blr();
3289     return entry;
3290   }
3291 
3292   void generate_arraycopy_stubs() {
         // Generates the arraycopy, fill and setmemory stubs and stores each entry
         // point in the StubRoutines field of the same name.  Every generator call
         // is given the StubId that corresponds to the field being assigned, so
         // no id is generated twice.

3293     // generate the common exit first so later stubs can rely on it if
3294     // they want an UnsafeMemoryAccess exit non-local to the stub
3295     StubRoutines::_unsafecopy_common_exit = generate_unsafecopy_common_error_exit();
3296     // register the stub as the default exit with class UnsafeMemoryAccess
3297     UnsafeMemoryAccess::set_common_exit_stub_pc(StubRoutines::_unsafecopy_common_exit);
3298
3299     // Note: the disjoint stubs must be generated first, some of the
3300     //       conjoint stubs use them.
3301
3302     // Note: chaining of stubs does not rely on branching to an
3303     //       auxiliary post-push entry because none of the stubs
3304     //       push/pop a frame.
3305
3306     // non-aligned disjoint versions
3307     StubRoutines::_jbyte_disjoint_arraycopy       = generate_disjoint_byte_copy(StubId::stubgen_jbyte_disjoint_arraycopy_id);
3308     StubRoutines::_jshort_disjoint_arraycopy      = generate_disjoint_short_copy(StubId::stubgen_jshort_disjoint_arraycopy_id);
3309     StubRoutines::_jint_disjoint_arraycopy        = generate_disjoint_int_copy(StubId::stubgen_jint_disjoint_arraycopy_id);
3310     StubRoutines::_jlong_disjoint_arraycopy       = generate_disjoint_long_copy(StubId::stubgen_jlong_disjoint_arraycopy_id);
3311     StubRoutines::_oop_disjoint_arraycopy         = generate_disjoint_oop_copy(StubId::stubgen_oop_disjoint_arraycopy_id);
3312     StubRoutines::_oop_disjoint_arraycopy_uninit  = generate_disjoint_oop_copy(StubId::stubgen_oop_disjoint_arraycopy_uninit_id);
3313
3314     // aligned disjoint versions
3315     StubRoutines::_arrayof_jbyte_disjoint_arraycopy      = generate_disjoint_byte_copy(StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id);
3316     StubRoutines::_arrayof_jshort_disjoint_arraycopy     = generate_disjoint_short_copy(StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id);
3317     StubRoutines::_arrayof_jint_disjoint_arraycopy       = generate_disjoint_int_copy(StubId::stubgen_arrayof_jint_disjoint_arraycopy_id);
3318     StubRoutines::_arrayof_jlong_disjoint_arraycopy      = generate_disjoint_long_copy(StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id);
3319     StubRoutines::_arrayof_oop_disjoint_arraycopy        = generate_disjoint_oop_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_id);
3320     StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id);
3321
3322     // non-aligned conjoint versions
3323     StubRoutines::_jbyte_arraycopy      = generate_conjoint_byte_copy(StubId::stubgen_jbyte_arraycopy_id);
3324     StubRoutines::_jshort_arraycopy     = generate_conjoint_short_copy(StubId::stubgen_jshort_arraycopy_id);
3325     StubRoutines::_jint_arraycopy       = generate_conjoint_int_copy(StubId::stubgen_jint_arraycopy_id);
3326     StubRoutines::_jlong_arraycopy      = generate_conjoint_long_copy(StubId::stubgen_jlong_arraycopy_id);
3327     StubRoutines::_oop_arraycopy        = generate_conjoint_oop_copy(StubId::stubgen_oop_arraycopy_id);
3328     StubRoutines::_oop_arraycopy_uninit = generate_conjoint_oop_copy(StubId::stubgen_oop_arraycopy_uninit_id);
3329
3330     // aligned conjoint versions
3331     StubRoutines::_arrayof_jbyte_arraycopy      = generate_conjoint_byte_copy(StubId::stubgen_arrayof_jbyte_arraycopy_id);
3332     StubRoutines::_arrayof_jshort_arraycopy     = generate_conjoint_short_copy(StubId::stubgen_arrayof_jshort_arraycopy_id);
3333     StubRoutines::_arrayof_jint_arraycopy       = generate_conjoint_int_copy(StubId::stubgen_arrayof_jint_arraycopy_id);
3334     StubRoutines::_arrayof_jlong_arraycopy      = generate_conjoint_long_copy(StubId::stubgen_arrayof_jlong_arraycopy_id);
3335     StubRoutines::_arrayof_oop_arraycopy        = generate_conjoint_oop_copy(StubId::stubgen_arrayof_oop_arraycopy_id);
3336     StubRoutines::_arrayof_oop_arraycopy_uninit = generate_conjoint_oop_copy(StubId::stubgen_arrayof_oop_arraycopy_uninit_id);
3337
3338     // special/generic versions
3339     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_id);
3340     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_uninit_id);
3341
         // Both dispatchers are handed the entry points of the non-aligned
         // conjoint stubs generated above (plus the disjoint oop and checkcast
         // stubs for the generic copy), so they must come after those.
3342     StubRoutines::_unsafe_arraycopy  = generate_unsafe_copy(STUB_ENTRY(jbyte_arraycopy()),
3343                                                             STUB_ENTRY(jshort_arraycopy()),
3344                                                             STUB_ENTRY(jint_arraycopy()),
3345                                                             STUB_ENTRY(jlong_arraycopy()));
3346     StubRoutines::_generic_arraycopy = generate_generic_copy(STUB_ENTRY(jbyte_arraycopy()),
3347                                                              STUB_ENTRY(jshort_arraycopy()),
3348                                                              STUB_ENTRY(jint_arraycopy()),
3349                                                              STUB_ENTRY(oop_arraycopy()),
3350                                                              STUB_ENTRY(oop_disjoint_arraycopy()),
3351                                                              STUB_ENTRY(jlong_arraycopy()),
3352                                                              STUB_ENTRY(checkcast_arraycopy()));
3353
3354     // fill routines
3355 #ifdef COMPILER2
3356     if (OptimizeFill) {
3357       StubRoutines::_jbyte_fill          = generate_fill(StubId::stubgen_jbyte_fill_id);
3358       StubRoutines::_jshort_fill         = generate_fill(StubId::stubgen_jshort_fill_id);
3359       StubRoutines::_jint_fill           = generate_fill(StubId::stubgen_jint_fill_id);
3360       StubRoutines::_arrayof_jbyte_fill  = generate_fill(StubId::stubgen_arrayof_jbyte_fill_id);
3361       StubRoutines::_arrayof_jshort_fill = generate_fill(StubId::stubgen_arrayof_jshort_fill_id);
3362       StubRoutines::_arrayof_jint_fill   = generate_fill(StubId::stubgen_arrayof_jint_fill_id);
3363     }
         // Generated whether or not OptimizeFill is set; when it is off, the
         // _jbyte_fill value passed here has not been assigned by this function.
3364     StubRoutines::_unsafe_setmemory = generate_unsafe_setmemory(StubRoutines::_jbyte_fill);
3365 #endif
3366   }
3367 
3368   // Code stub behind the BigInteger::multiplyToLen() intrinsic.
3369   //
3370   //  Registers on entry:
3371   //
3372   //    R3 - x address
3373   //    R4 - x length
3374   //    R5 - y address
3375   //    R6 - y length
3376   //    R7 - z address
3377   //
3378   //  R24..R31 are preserved across the stub; see the save/restore loops.
3379   address generate_multiplyToLen() {
3380
3381     StubCodeMark mark(this, StubId::stubgen_multiplyToLen_id);
3382
3383     address entry = __ function_entry();
3384
3385     // Incoming arguments.
3386     const Register x    = R3;
3387     const Register xlen = R4;
3388     const Register y    = R5;
3389     const Register ylen = R6;
3390     const Register z    = R7;
3391
3392     // Non-volatile registers lent to multiply_to_len(), in the order they
3393     // are spilled: entry k (0-based) lives at -(k + 1) * 8 from R1_SP.
3394     const Register spilled[] = { R24, R25, R26, R27, R28, R29, R30, R31 };
3395     const int num_spilled = 8;
3396
3397     BLOCK_COMMENT("Entry:");
3398
3399     // C2 does not respect int to long conversion for stub calls, so the
3400     // upper words of both lengths are cleared here.
3401     __ clrldi(xlen, xlen, 32);
3402     __ clrldi(ylen, ylen, 32);
3403
3404     // Save non-volatile regs (frameless).
3405     for (int k = 0; k < num_spilled; k++) {
3406       __ std(spilled[k], -(k + 1) * 8, R1_SP);
3407     }
3408
3409     // Volatile temporaries first (R2 is available: TOC not used), then the
3410     // eight spilled registers from R31 down to R24.
3411     __ multiply_to_len(x, xlen, y, ylen, z,
3412                        R2, R9, R10, R11, R12,
3413                        R31, R30, R29, R28, R27, R26, R25, R24);
3414
3415     // Restore non-volatile regs.
3416     for (int k = 0; k < num_spilled; k++) {
3417       __ ld(spilled[k], -(k + 1) * 8, R1_SP);
3418     }
3419
3420     __ blr();  // Return to caller.
3421
3422     return entry;
3423   }
3443 
3444   /**
3445   *  mulAdd stub.
3446   *
3447   *  Incoming registers:
3448   *   R3_ARG1    - out address
3449   *   R4_ARG2    - in address
3450   *   R5_ARG3    - offset
3451   *   R6_ARG4    - len
3452   *   R7_ARG5    - k
3453   *  Result:
3454   *   R3_RET     - carry
3455   */
3456   address generate_mulAdd() {
3457     __ align(CodeEntryAlignment);
3458     StubCodeMark mark(this, StubId::stubgen_mulAdd_id);
3459     address entry = __ function_entry();
3460
3461     const Register out = R3_ARG1, in = R4_ARG2, offset = R5_ARG3;
3462     const Register len = R6_ARG4, k = R7_ARG5, carry = R10;
3463
3464     // C2 does not sign extend signed parameters to full 64 bits registers,
3465     // so each int argument is normalized before it is used.
3466     __ rldic (offset, offset, 2, 32);  // always positive
3467     __ clrldi(len,    len,    32);     // force zero bits on higher word
3468     __ clrldi(k,      k,      32);     // force zero bits on higher word
3469
3470     __ muladd(out, in, offset, len, k, R8, R9, carry);
3471
3472     __ mr    (R3_RET, carry);  // hand the carry back in the return register
3473     __ blr();
3474
3475     return entry;
3476   }
3477 
3478   /**
       *  Stub generator for squareToLen.  The generated code is frameless: it
       *  stores R14..R28 below R1_SP on entry and reloads them before returning.
       *
3479   *  Arguments:
3480   *
3481   *  Input:
3482   *   R3_ARG1    - in address
3483   *   R4_ARG2    - in length
3484   *   R5_ARG3    - out address
3485   *   R6_ARG4    - out length
       *  Output:
       *   R3_RET     - out address (copied from R5_ARG3 just before the return)
3486   */
3487   address generate_squareToLen() {
3488     __ align(CodeEntryAlignment);
3489     StubId stub_id = StubId::stubgen_squareToLen_id;
3490     StubCodeMark mark(this, stub_id);
3491
3492     address start = __ function_entry();
3493
3494     // args - higher word is cleaned (unsignedly) due to int to long casting
3495     const Register in        = R3_ARG1;
3496     const Register in_len    = R4_ARG2;
3497     __ clrldi(in_len, in_len, 32);
3498     const Register out       = R5_ARG3;
3499     const Register out_len   = R6_ARG4;
3500     __ clrldi(out_len, out_len, 32);
3501
3502     // output
3503     const Register ret       = R3_RET;
3504
3505     // temporaries
         // R7..R10 are used without being saved; R14..R27 are saved below
         // before their first use.
3506     const Register lplw_s    = R7;
3507     const Register in_aux    = R8;
3508     const Register out_aux   = R9;
3509     const Register piece     = R10;
3510     const Register product   = R14;
3511     const Register lplw      = R15;
3512     const Register i_minus1  = R16;
3513     const Register carry     = R17;
3514     const Register offset    = R18;
3515     const Register off_aux   = R19;
3516     const Register t         = R20;
3517     const Register mlen      = R21;
3518     const Register len       = R22;
3519     const Register a         = R23;
3520     const Register b         = R24;
3521     const Register i         = R25;
3522     const Register c         = R26;
3523     const Register cs        = R27;
3524
3525     // Labels
3526     Label SKIP_LSHIFT, SKIP_DIAGONAL_SUM, SKIP_ADDONE, SKIP_LOOP_SQUARE;
3527     Label LOOP_LSHIFT, LOOP_DIAGONAL_SUM, LOOP_ADDONE, LOOP_SQUARE;
3528
3529     // Save non-volatile regs (frameless).
         // Fifteen registers (R28 down to R14) go to offsets -8 .. -120 from R1_SP.
3530     int current_offs = -8;
3531     __ std(R28, current_offs, R1_SP); current_offs -= 8;
3532     __ std(R27, current_offs, R1_SP); current_offs -= 8;
3533     __ std(R26, current_offs, R1_SP); current_offs -= 8;
3534     __ std(R25, current_offs, R1_SP); current_offs -= 8;
3535     __ std(R24, current_offs, R1_SP); current_offs -= 8;
3536     __ std(R23, current_offs, R1_SP); current_offs -= 8;
3537     __ std(R22, current_offs, R1_SP); current_offs -= 8;
3538     __ std(R21, current_offs, R1_SP); current_offs -= 8;
3539     __ std(R20, current_offs, R1_SP); current_offs -= 8;
3540     __ std(R19, current_offs, R1_SP); current_offs -= 8;
3541     __ std(R18, current_offs, R1_SP); current_offs -= 8;
3542     __ std(R17, current_offs, R1_SP); current_offs -= 8;
3543     __ std(R16, current_offs, R1_SP); current_offs -= 8;
3544     __ std(R15, current_offs, R1_SP); current_offs -= 8;
3545     __ std(R14, current_offs, R1_SP);
3546
3547     // Store the squares, right shifted one bit (i.e., divided by 2)
         // out_aux/in_aux start one element before their arrays because the
         // loop uses the update forms (lwzu/stdu), which advance before access.
3548     __ subi   (out_aux,   out,       8);
3549     __ subi   (in_aux,    in,        4);
3550     __ cmpwi  (CR0,      in_len,    0);
3551     // Initialize lplw outside of the loop
3552     __ xorr   (lplw,      lplw,      lplw);
3553     __ ble    (CR0,      SKIP_LOOP_SQUARE);    // in_len <= 0
3554     __ mtctr  (in_len);
3555
3556     __ bind(LOOP_SQUARE);
3557     __ lwzu   (piece,     4,         in_aux);
3558     __ mulld  (product,   piece,     piece);
3559     // shift left 63 bits and only keep the MSB
         // (lplw still holds the previous iteration's unshifted product here)
3560     __ rldic  (lplw_s,    lplw,      63, 0);
3561     __ mr     (lplw,      product);
3562     // shift right 1 bit without sign extension
3563     __ srdi   (product,   product,   1);
3564     // join them to the same register and store it
3565     __ orr    (product,   lplw_s,    product);
3566 #ifdef VM_LITTLE_ENDIAN
3567     // Swap low and high words for little endian
3568     __ rldicl (product,   product,   32, 0);
3569 #endif
3570     __ stdu   (product,   8,         out_aux);
3571     __ bdnz   (LOOP_SQUARE);
3572
3573     __ bind(SKIP_LOOP_SQUARE);
3574
3575     // Add in off-diagonal sums
3576     __ cmpwi  (CR0,      in_len,    0);
3577     __ ble    (CR0,      SKIP_DIAGONAL_SUM);
3578     // Avoid CTR usage here in order to use it at mulAdd
3579     __ subi   (i_minus1,  in_len,    1);
3580     __ li     (offset,    4);
3581
3582     __ bind(LOOP_DIAGONAL_SUM);
3583
3584     __ sldi   (off_aux,   out_len,   2);
3585     __ sub    (off_aux,   off_aux,   offset);
3586
3587     __ mr     (len,       i_minus1);
3588     __ sldi   (mlen,      i_minus1,  2);
3589     __ lwzx   (t,         in,        mlen);
3590
3591     __ muladd (out, in, off_aux, len, t, a, b, carry);
3592
3593     // begin<addOne>
3594     // off_aux = out_len*4 - 4 - mlen - offset*4 - 4;
3595     __ addi   (mlen,      mlen,      4);
3596     __ sldi   (a,         out_len,   2);
3597     __ subi   (a,         a,         4);
3598     __ sub    (a,         a,         mlen);
3599     __ subi   (off_aux,   offset,    4);
3600     __ sub    (off_aux,   a,         off_aux);
3601
3602     __ lwzx   (b,         off_aux,   out);
3603     __ add    (b,         b,         carry);
3604     __ stwx   (b,         off_aux,   out);
3605
3606     // if (((uint64_t)s >> 32) != 0) {
3607     __ srdi_  (a,         b,         32);
3608     __ beq    (CR0,      SKIP_ADDONE);
3609
3610     // while (--mlen >= 0) {
3611     __ bind(LOOP_ADDONE);
3612     __ subi   (mlen,      mlen,      4);
3613     __ cmpwi  (CR0,      mlen,      0);
3614     __ beq    (CR0,      SKIP_ADDONE);
3615
3616     // if (--offset_aux < 0) { // Carry out of number
3617     __ subi   (off_aux,   off_aux,   4);
3618     __ cmpwi  (CR0,      off_aux,   0);
3619     __ blt    (CR0,      SKIP_ADDONE);
3620
3621     // } else {
3622     __ lwzx   (b,         off_aux,   out);
3623     __ addi   (b,         b,         1);
3624     __ stwx   (b,         off_aux,   out);
3625     __ cmpwi  (CR0,      b,         0);
3626     __ bne    (CR0,      SKIP_ADDONE);
3627     __ b      (LOOP_ADDONE);
3628
3629     __ bind(SKIP_ADDONE);
3630     // } } } end<addOne>
3631
         // Next outer iteration: offset grows by 8 bytes, i_minus1 drops by one;
         // the loop runs while i_minus1 >= 0.
3632     __ addi   (offset,    offset,    8);
3633     __ subi   (i_minus1,  i_minus1,  1);
3634     __ cmpwi  (CR0,      i_minus1,  0);
3635     __ bge    (CR0,      LOOP_DIAGONAL_SUM);
3636
3637     __ bind(SKIP_DIAGONAL_SUM);
3638
3639     // Shift back up and set low bit
3640     // Shifts 1 bit left up to len positions. Assumes no leading zeros
3641     // begin<primitiveLeftShift>
3642     __ cmpwi  (CR0,      out_len,   0);
3643     __ ble    (CR0,      SKIP_LSHIFT);
3644     __ li     (i,         0);
3645     __ lwz    (c,         0,         out);
3646     __ subi   (b,         out_len,   1);
3647     __ mtctr  (b);
3648
         // Each pass shifts word i left by one and ORs in bit 31 of word i + 1.
3649     __ bind(LOOP_LSHIFT);
3650     __ mr     (b,         c);
3651     __ addi   (cs,        i,         4);
3652     __ lwzx   (c,         out,       cs);
3653
3654     __ sldi   (b,         b,         1);
3655     __ srwi   (cs,        c,         31);
3656     __ orr    (b,         b,         cs);
3657     __ stwx   (b,         i,         out);
3658
3659     __ addi   (i,         i,         4);
3660     __ bdnz   (LOOP_LSHIFT);
3661
         // The last word (byte offset out_len*4 - 4) has no successor: shift only.
3662     __ sldi   (c,         out_len,   2);
3663     __ subi   (c,         c,         4);
3664     __ lwzx   (b,         out,       c);
3665     __ sldi   (b,         b,         1);
3666     __ stwx   (b,         out,       c);
3667
3668     __ bind(SKIP_LSHIFT);
3669     // end<primitiveLeftShift>
3670
3671     // Set low bit
         // OR bit 0 of the last input word into the last output word.
3672     __ sldi   (i,         in_len,    2);
3673     __ subi   (i,         i,         4);
3674     __ lwzx   (i,         in,        i);
3675     __ sldi   (c,         out_len,   2);
3676     __ subi   (c,         c,         4);
3677     __ lwzx   (b,         out,       c);
3678
3679     __ andi   (i,         i,         1);
3680     __ orr    (i,         b,         i);
3681
3682     __ stwx   (i,         out,       c);
3683
3684     // Restore non-volatile regs.
3685     current_offs = -8;
3686     __ ld(R28, current_offs, R1_SP); current_offs -= 8;
3687     __ ld(R27, current_offs, R1_SP); current_offs -= 8;
3688     __ ld(R26, current_offs, R1_SP); current_offs -= 8;
3689     __ ld(R25, current_offs, R1_SP); current_offs -= 8;
3690     __ ld(R24, current_offs, R1_SP); current_offs -= 8;
3691     __ ld(R23, current_offs, R1_SP); current_offs -= 8;
3692     __ ld(R22, current_offs, R1_SP); current_offs -= 8;
3693     __ ld(R21, current_offs, R1_SP); current_offs -= 8;
3694     __ ld(R20, current_offs, R1_SP); current_offs -= 8;
3695     __ ld(R19, current_offs, R1_SP); current_offs -= 8;
3696     __ ld(R18, current_offs, R1_SP); current_offs -= 8;
3697     __ ld(R17, current_offs, R1_SP); current_offs -= 8;
3698     __ ld(R16, current_offs, R1_SP); current_offs -= 8;
3699     __ ld(R15, current_offs, R1_SP); current_offs -= 8;
3700     __ ld(R14, current_offs, R1_SP);
3701
3702     __ mr(ret, out);  // the stub returns the out address
3703     __ blr();
3704
3705     return start;
3706   }
3707 
3708   /**
3709    * Generates the CRC32 / CRC32C updateBytes stub selected by stub_id.
3710    *
3711    * Inputs:
3712    *   R3_ARG1    - int   crc
3713    *   R4_ARG2    - byte* buf
3714    *   R5_ARG3    - int   length (of buffer)
3715    *
3716    * Scratch registers handed to crc32():
3717    *   R2, R6-R12
3718    *
3719    * Output:
3720    *   R3_RET     - int   crc result
3721    *
3722    * Any stub_id other than the two updateBytes ids ends in ShouldNotReachHere().
3723    */
3724   address generate_CRC32_updateBytes(StubId stub_id) {
3725     const bool want_crc32  = (stub_id == StubId::stubgen_updateBytesCRC32_id);
3726     const bool want_crc32c = (stub_id == StubId::stubgen_updateBytesCRC32C_id);
3727     if (!want_crc32 && !want_crc32c) {
3728       ShouldNotReachHere();
3729     }
3730
3731     __ align(CodeEntryAlignment);
3732     StubCodeMark mark(this, stub_id);
3733     address entry = __ function_entry();  // handed back to the caller as the stub address
3734
3735     const Register crc = R3_ARG1;
3736     const Register buf = R4_ARG2;
3737     const Register len = R5_ARG3;
3738     __ crc32(crc, buf, len, R2, R6, R7, R8, R9, R10, R11, R12, want_crc32c);
3739     __ blr();
3740     return entry;
3741   }
3742 
3743   address generate_floatToFloat16() {
3744     __ align(CodeEntryAlignment);                               // entry sits on a CodeEntryAlignment boundary
3745     StubCodeMark mark(this, "StubRoutines", "floatToFloat16");
3746     address entry = __ function_entry();
3747     __ f2hf(R3_RET, F1_ARG1, F0);                               // operands: R3_RET, F1_ARG1, plus F0
3748     __ blr();                                                   // back to the caller
3749     return entry;
3750   }
3751 
3752   address generate_float16ToFloat() {
3753     __ align(CodeEntryAlignment);                               // entry sits on a CodeEntryAlignment boundary
3754     StubCodeMark mark(this, "StubRoutines", "float16ToFloat");
3755     address entry = __ function_entry();
3756     __ hf2f(F1_RET, R3_ARG1);                                   // operands: F1_RET and R3_ARG1
3757     __ blr();                                                   // back to the caller
3758     return entry;
3759   }
3760 
3761   address generate_method_entry_barrier() {
         // Emits the stub that calls BarrierSetNMethod::nmethod_stub_entry_barrier
         // and then either returns through LR (call result == 0) or jumps to the
         // 'wrong method stub' (call result != 0).  Returns the stub's first pc.
3762     __ align(CodeEntryAlignment);
3763     StubId stub_id = StubId::stubgen_method_entry_barrier_id;
3764     StubCodeMark mark(this, stub_id);
3765
3766     address stub_address = __ pc();
3767
         // The volatile GPRs are stored below R1_SP; the frame pushed further
         // down is sized with the same nbytes_save.
3768     int nbytes_save = MacroAssembler::num_volatile_regs * BytesPerWord;
3769     __ save_volatile_gprs(R1_SP, -nbytes_save, true);
3770
3771     // Link register points to instruction in prologue of the guarded nmethod.
3772     // As the stub requires one layer of indirection (argument is of type address* and not address),
3773     // passing the link register's value directly doesn't work.
3774     // Since we have to save the link register on the stack anyway, we calculate the corresponding stack address
3775     // and pass that one instead.
3776     __ addi(R3_ARG1, R1_SP, _abi0(lr));
3777
3778     __ save_LR(R0);
3779     __ push_frame_reg_args(nbytes_save, R0);
3780
3781     __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSetNMethod::nmethod_stub_entry_barrier));
3782     __ mr(R0, R3_RET);  // park the call result in R0; R3_RET is reused as a temp below
3783
3784     __ pop_frame();
3785     __ restore_LR(R3_RET /* used as tmp register */);
3786     __ restore_volatile_gprs(R1_SP, -nbytes_save, true);
3787
3788     __ cmpdi(CR0, R0, 0);
3789
3790     // Return to prologue if no deoptimization is required (beqlr):
         // the branch to LR is taken when CR0's 'equal' bit is set, i.e. R0 == 0.
3791     __ bclr(Assembler::bcondCRbiIs1, Assembler::bi0(CR0, Assembler::equal), Assembler::bhintIsTaken);
3792
3793     // Deoptimization required.
3794     // For actually handling the deoptimization, the 'wrong method stub' is invoked.
3795     __ load_const_optimized(R0, SharedRuntime::get_handle_wrong_method_stub());
3796     __ mtctr(R0);
3797
3798     // Pop the frame built in the prologue.
3799     __ pop_frame();
3800
3801     // Restore link register.  Required as the 'wrong method stub' needs the caller's frame
3802     // to properly deoptimize this method (e.g. by re-resolving the call site for compiled methods).
3803     // This method's prologue is aborted.
3804     __ restore_LR(R0);
3805
3806     __ bctr();
3807     return stub_address;
3808   }
3809 
3810 #ifdef VM_LITTLE_ENDIAN
3811 // The following Base64 decode intrinsic is based on an algorithm outlined
3812 // in here:
3813 // http://0x80.pl/notesen/2016-01-17-sse-base64-decoding.html
3814 // in the section titled "Vector lookup (pshufb with bitmask)"
3815 //
3816 // This implementation differs in the following ways:
3817 //  * Power AltiVec VMX and VSX instructions are used instead of Intel SSE
3818 //    instructions.  It turns out that some of the vector operations
3819 //    needed in the algorithm require fewer AltiVec instructions.
3820 //  * The algorithm in the above mentioned paper doesn't handle the
3821 //    Base64-URL variant in RFC 4648.  Adjustments to both the code and to two
3822 //    lookup tables are needed for this.
3823 //  * The "Pack" section of the code is a complete rewrite for Power because we
3824 //    can utilize better instructions for this step.
3825 //
3826 
3827 // Offsets per group of Base64 characters.  Each offset is the group's first
     // 6-bit value minus the ASCII code of the group's first character, reduced
     // to one byte (for example 'A' + UC == 0 and 'a' + LC == 26 modulo 256).
3828 // Uppercase
3829 #define UC  (signed char)((-'A' + 0) & 0xff)
3830 // Lowercase
3831 #define LC  (signed char)((-'a' + 26) & 0xff)
3832 // Digits
3833 #define DIG (signed char)((-'0' + 52) & 0xff)
3834 // Plus sign (URL = 0)
3835 #define PLS (signed char)((-'+' + 62) & 0xff)
3836 // Hyphen (URL = 1)
3837 #define HYP (signed char)((-'-' + 62) & 0xff)
3838 // Slash (URL = 0)
3839 #define SLS (signed char)((-'/' + 63) & 0xff)
3840 // Underscore (URL = 1)
3841 #define US  (signed char)((-'_' + 63) & 0xff)
3842
3843 // For P10 (or later) only
     // VB64(x) ORs the VALID_B64 bit (0x80) into x.  The argument is
     // parenthesized so that an expression argument is combined as a whole.
3844 #define VALID_B64 0x80
3845 #define VB64(x) (VALID_B64 | (x))
3846
     // Byte offset of member x inside the constant_block struct.
3847 #define BLK_OFFSETOF(x) (offsetof(constant_block, x))
3848
3849 // In little-endian mode, the lxv instruction loads the element at EA into
3850 // element 15 of the vector register, EA+1 goes into element 14, and so
3851 // on.
3852 //
3853 // To make a look-up table easier to read, ARRAY_TO_LXV_ORDER reverses the
3854 // order of the elements in a vector initialization.
3855 #define ARRAY_TO_LXV_ORDER(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15) e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0
3856 
3857   //
3858   // Base64 decodeBlock intrinsic
3859   address generate_base64_decodeBlock() {
3860     __ align(CodeEntryAlignment);
3861     StubId stub_id = StubId::stubgen_base64_decodeBlock_id;
3862     StubCodeMark mark(this, stub_id);
3863     address start   = __ function_entry();
3864 
3865     typedef struct {
3866       signed char offsetLUT_val[16];
3867       signed char offsetLUT_URL_val[16];
3868       unsigned char maskLUT_val[16];
3869       unsigned char maskLUT_URL_val[16];
3870       unsigned char bitposLUT_val[16];
3871       unsigned char table_32_47_val[16];
3872       unsigned char table_32_47_URL_val[16];
3873       unsigned char table_48_63_val[16];
3874       unsigned char table_64_79_val[16];
3875       unsigned char table_80_95_val[16];
3876       unsigned char table_80_95_URL_val[16];
3877       unsigned char table_96_111_val[16];
3878       unsigned char table_112_127_val[16];
3879       unsigned char pack_lshift_val[16];
3880       unsigned char pack_rshift_val[16];
3881       unsigned char pack_permute_val[16];
3882     } constant_block;
3883 
3884     alignas(16) static const constant_block const_block = {
3885 
3886       .offsetLUT_val = {
3887         ARRAY_TO_LXV_ORDER(
3888         0,   0, PLS, DIG,  UC,  UC,  LC,  LC,
3889         0,   0,   0,   0,   0,   0,   0,   0 ) },
3890 
3891       .offsetLUT_URL_val = {
3892         ARRAY_TO_LXV_ORDER(
3893         0,   0, HYP, DIG,  UC,  UC,  LC,  LC,
3894         0,   0,   0,   0,   0,   0,   0,   0 ) },
3895 
3896       .maskLUT_val = {
3897         ARRAY_TO_LXV_ORDER(
3898         /* 0        */ (unsigned char)0b10101000,
3899         /* 1 .. 9   */ (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000,
3900                        (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000,
3901                        (unsigned char)0b11111000,
3902         /* 10       */ (unsigned char)0b11110000,
3903         /* 11       */ (unsigned char)0b01010100,
3904         /* 12 .. 14 */ (unsigned char)0b01010000, (unsigned char)0b01010000, (unsigned char)0b01010000,
3905         /* 15       */ (unsigned char)0b01010100 ) },
3906 
3907       .maskLUT_URL_val = {
3908         ARRAY_TO_LXV_ORDER(
3909         /* 0        */ (unsigned char)0b10101000,
3910         /* 1 .. 9   */ (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000,
3911                        (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000,
3912                        (unsigned char)0b11111000,
3913         /* 10       */ (unsigned char)0b11110000,
3914         /* 11 .. 12 */ (unsigned char)0b01010000, (unsigned char)0b01010000,
3915         /* 13       */ (unsigned char)0b01010100,
3916         /* 14       */ (unsigned char)0b01010000,
3917         /* 15       */ (unsigned char)0b01110000 ) },
3918 
3919       .bitposLUT_val = {
3920         ARRAY_TO_LXV_ORDER(
3921         0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, (unsigned char)0x80,
3922         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 ) },
3923 
3924       // In the following table_*_val constants, a 0 value means the
3925       // character is not in the Base64 character set
3926       .table_32_47_val = {
3927         ARRAY_TO_LXV_ORDER (
3928          /* space .. '*' = 0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* '+' = 62 */ VB64(62), /* ',' .. '.' = 0 */ 0, 0, 0, /* '/' = 63 */ VB64(63) ) },
3929 
3930       .table_32_47_URL_val = {
3931         ARRAY_TO_LXV_ORDER(
3932          /* space .. ',' = 0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* '-' = 62 */ VB64(62), /* '.' .. '/' */ 0, 0 ) },
3933 
3934       .table_48_63_val = {
3935         ARRAY_TO_LXV_ORDER(
3936          /* '0' .. '9' = 52 .. 61 */ VB64(52), VB64(53), VB64(54), VB64(55), VB64(56), VB64(57), VB64(58), VB64(59), VB64(60), VB64(61),
3937          /* ':' .. '?' = 0 */ 0, 0, 0, 0, 0, 0 ) },
3938 
3939       .table_64_79_val = {
3940         ARRAY_TO_LXV_ORDER(
3941          /* '@' = 0 */ 0, /* 'A' .. 'O' = 0 .. 14 */ VB64(0), VB64(1), VB64(2), VB64(3), VB64(4), VB64(5), VB64(6), VB64(7), VB64(8),
3942          VB64(9), VB64(10), VB64(11), VB64(12), VB64(13), VB64(14) ) },
3943 
3944       .table_80_95_val = {
3945         ARRAY_TO_LXV_ORDER(/* 'P' .. 'Z' = 15 .. 25 */ VB64(15), VB64(16), VB64(17), VB64(18), VB64(19), VB64(20), VB64(21), VB64(22),
3946         VB64(23), VB64(24), VB64(25), /* '[' .. '_' = 0 */ 0, 0, 0, 0, 0 ) },
3947 
3948       .table_80_95_URL_val = {
3949         ARRAY_TO_LXV_ORDER(/* 'P' .. 'Z' = 15 .. 25 */ VB64(15), VB64(16), VB64(17), VB64(18), VB64(19), VB64(20), VB64(21), VB64(22),
3950         VB64(23), VB64(24), VB64(25), /* '[' .. '^' = 0 */ 0, 0, 0, 0, /* '_' = 63 */ VB64(63) ) },
3951 
3952       .table_96_111_val = {
3953         ARRAY_TO_LXV_ORDER(/* '`' = 0 */ 0, /* 'a' .. 'o' = 26 .. 40 */ VB64(26), VB64(27), VB64(28), VB64(29), VB64(30), VB64(31),
3954         VB64(32), VB64(33), VB64(34), VB64(35), VB64(36), VB64(37), VB64(38), VB64(39), VB64(40) ) },
3955 
3956       .table_112_127_val = {
3957         ARRAY_TO_LXV_ORDER(/* 'p' .. 'z' = 41 .. 51 */ VB64(41), VB64(42), VB64(43), VB64(44), VB64(45), VB64(46), VB64(47), VB64(48),
3958         VB64(49), VB64(50), VB64(51), /* '{' .. DEL = 0 */ 0, 0, 0, 0, 0 ) },
3959 
3960       .pack_lshift_val = {
3961         ARRAY_TO_LXV_ORDER(
3962         0, 6, 4, 2, 0, 6, 4, 2, 0, 6, 4, 2, 0, 6, 4, 2 ) },
3963 
3964       .pack_rshift_val = {
3965         ARRAY_TO_LXV_ORDER(
3966         0, 2, 4, 0, 0, 2, 4, 0, 0, 2, 4, 0, 0, 2, 4, 0 ) },
3967 
3968       // The first 4 index values are "don't care" because
3969       // we only use the first 12 bytes of the vector,
3970       // which are decoded from 16 bytes of Base64 characters.
3971       .pack_permute_val = {
3972         ARRAY_TO_LXV_ORDER(
3973          0, 0, 0, 0,
3974          0,  1,  2,
3975          4,  5,  6,
3976          8,  9, 10,
3977         12, 13, 14 ) }
3978     };
3979 
3980     const unsigned block_size = 16;  // number of bytes to process in each pass through the loop
3981     const unsigned block_size_shift = 4;
3982 
3983     // According to the ELF V2 ABI, registers r3-r12 are volatile and available for use without save/restore
3984     Register s      = R3_ARG1; // source starting address of Base64 characters
3985     Register sp     = R4_ARG2; // source offset
3986     Register sl     = R5_ARG3; // source length = # of Base64 characters to be processed
3987     Register d      = R6_ARG4; // destination address
3988     Register dp     = R7_ARG5; // destination offset
3989     Register isURL  = R8_ARG6; // boolean, if non-zero indicates use of RFC 4648 base64url encoding
3990     Register isMIME = R9_ARG7; // boolean, if non-zero indicates use of RFC 2045 MIME encoding - not used
3991 
3992     // Local variables
3993     Register const_ptr     = R9;  // used for loading constants
3994     Register tmp_reg       = R10; // used for speeding up load_constant_optimized()
3995 
3996     // Re-use R9 and R10 to avoid using non-volatile registers (requires save/restore)
3997     Register out           = R9;  // moving out (destination) pointer
3998     Register in            = R10; // moving in (source) pointer
3999 
4000     // Volatile VSRS are 0..13, 32..51 (VR0..VR13)
4001     // VR Constants
4002     VectorRegister  vec_0s                  = VR0;
4003     VectorRegister  vec_4s                  = VR1;
4004     VectorRegister  vec_8s                  = VR2;
4005     VectorRegister  vec_special_case_char   = VR3;
4006     VectorRegister  pack_rshift             = VR4;
4007     VectorRegister  pack_lshift             = VR5;
4008 
4009     // VSR Constants
4010     VectorSRegister offsetLUT               = VSR0;
4011     VectorSRegister maskLUT                 = VSR1;
4012     VectorSRegister bitposLUT               = VSR2;
4013     VectorSRegister vec_0xfs                = VSR3;
4014     VectorSRegister vec_special_case_offset = VSR4;
4015     VectorSRegister pack_permute            = VSR5;
4016 
4017     // P10 (or later) VSR lookup constants
4018     VectorSRegister table_32_47             = VSR0;
4019     VectorSRegister table_48_63             = VSR1;
4020     VectorSRegister table_64_79             = VSR2;
4021     VectorSRegister table_80_95             = VSR3;
4022     VectorSRegister table_96_111            = VSR4;
4023     VectorSRegister table_112_127           = VSR6;
4024 
4025     // Data read in and later converted
4026     VectorRegister  input                   = VR6;
4027     // Variable for testing Base64 validity
4028     VectorRegister  non_match               = VR10;
4029 
4030     // P9 VR Variables for lookup
4031     VectorRegister  higher_nibble           = VR7;
4032     VectorRegister  eq_special_case_char    = VR8;
4033     VectorRegister  offsets                 = VR9;
4034 
4035     // P9 VSR lookup variables
4036     VectorSRegister bit                     = VSR6;
4037     VectorSRegister lower_nibble            = VSR7;
4038     VectorSRegister M                       = VSR8;
4039 
4040     // P10 (or later) VSR lookup variables
4041     VectorSRegister  xlate_a                = VSR7;
4042     VectorSRegister  xlate_b                = VSR8;
4043 
4044     // Variables for pack
4045     // VR
4046     VectorRegister  l                       = VR7;  // reuse higher_nibble's register
4047     VectorRegister  r                       = VR8;  // reuse eq_special_case_char's register
4048     VectorRegister  gathered                = VR10; // reuse non_match's register
4049 
4050     Label not_URL, calculate_size, loop_start, loop_exit, return_zero;
4051 
4052     // The upper 32 bits of the non-pointer parameter registers are not
4053     // guaranteed to be zero, so mask off those upper bits.
4054     __ clrldi(sp, sp, 32);
4055     __ clrldi(sl, sl, 32);
4056 
4057     // Don't handle the last 4 characters of the source, because this
4058     // VSX-based algorithm doesn't handle padding characters.  Also the
4059     // vector code will always write 16 bytes of decoded data on each pass,
4060     // but only the first 12 of those 16 bytes are valid data (16 base64
4061     // characters become 12 bytes of binary data), so for this reason we
4062     // need to subtract an additional 8 bytes from the source length, in
4063     // order not to write past the end of the destination buffer.  The
4064     // result of this subtraction implies that a Java function in the
4065     // Base64 class will be used to process the last 12 characters.
4066     __ sub(sl, sl, sp);
4067     __ subi(sl, sl, 12);
4068 
4069     // Load CTR with the number of passes through the loop
4070     // = sl >> block_size_shift.  After the shift, if sl <= 0, there's too
4071     // little data to be processed by this intrinsic.
4072     __ srawi_(sl, sl, block_size_shift);
4073     __ ble(CR0, return_zero);
4074     __ mtctr(sl);
4075 
4076     // Clear the other two parameter registers upper 32 bits.
4077     __ clrldi(isURL, isURL, 32);
4078     __ clrldi(dp, dp, 32);
4079 
4080     // Load constant vec registers that need to be loaded from memory
4081     __ load_const_optimized(const_ptr, (address)&const_block, tmp_reg);
4082     __ lxv(bitposLUT, BLK_OFFSETOF(bitposLUT_val), const_ptr);
4083     __ lxv(pack_rshift->to_vsr(), BLK_OFFSETOF(pack_rshift_val), const_ptr);
4084     __ lxv(pack_lshift->to_vsr(), BLK_OFFSETOF(pack_lshift_val), const_ptr);
4085     __ lxv(pack_permute, BLK_OFFSETOF(pack_permute_val), const_ptr);
4086 
4087     // Splat the constants that can use xxspltib
4088     __ xxspltib(vec_0s->to_vsr(), 0);
4089     __ xxspltib(vec_8s->to_vsr(), 8);
4090     if (PowerArchitecturePPC64 >= 10) {
4091       // Using VALID_B64 for the offsets effectively strips the upper bit
4092       // of each byte that was selected from the table.  Setting the upper
4093       // bit gives us a way to distinguish between the 6-bit value of 0
4094       // from an error code of 0, which will happen if the character is
4095       // outside the range of the lookup, or is an illegal Base64
4096       // character, such as %.
4097       __ xxspltib(offsets->to_vsr(), VALID_B64);
4098 
4099       __ lxv(table_48_63, BLK_OFFSETOF(table_48_63_val), const_ptr);
4100       __ lxv(table_64_79, BLK_OFFSETOF(table_64_79_val), const_ptr);
4101       __ lxv(table_80_95, BLK_OFFSETOF(table_80_95_val), const_ptr);
4102       __ lxv(table_96_111, BLK_OFFSETOF(table_96_111_val), const_ptr);
4103       __ lxv(table_112_127, BLK_OFFSETOF(table_112_127_val), const_ptr);
4104     } else {
4105       __ xxspltib(vec_4s->to_vsr(), 4);
4106       __ xxspltib(vec_0xfs, 0xf);
4107       __ lxv(bitposLUT, BLK_OFFSETOF(bitposLUT_val), const_ptr);
4108     }
4109 
4110     // The rest of the constants use different values depending on the
4111     // setting of isURL
4112     __ cmpwi(CR0, isURL, 0);
4113     __ beq(CR0, not_URL);
4114 
4115     // isURL != 0 (true)
4116     if (PowerArchitecturePPC64 >= 10) {
4117       __ lxv(table_32_47, BLK_OFFSETOF(table_32_47_URL_val), const_ptr);
4118       __ lxv(table_80_95, BLK_OFFSETOF(table_80_95_URL_val), const_ptr);
4119     } else {
4120       __ lxv(offsetLUT, BLK_OFFSETOF(offsetLUT_URL_val), const_ptr);
4121       __ lxv(maskLUT, BLK_OFFSETOF(maskLUT_URL_val), const_ptr);
4122       __ xxspltib(vec_special_case_char->to_vsr(), '_');
4123       __ xxspltib(vec_special_case_offset, (unsigned char)US);
4124     }
4125     __ b(calculate_size);
4126 
4127     // isURL = 0 (false)
4128     __ bind(not_URL);
4129     if (PowerArchitecturePPC64 >= 10) {
4130       __ lxv(table_32_47, BLK_OFFSETOF(table_32_47_val), const_ptr);
4131       __ lxv(table_80_95, BLK_OFFSETOF(table_80_95_val), const_ptr);
4132     } else {
4133       __ lxv(offsetLUT, BLK_OFFSETOF(offsetLUT_val), const_ptr);
4134       __ lxv(maskLUT, BLK_OFFSETOF(maskLUT_val), const_ptr);
4135       __ xxspltib(vec_special_case_char->to_vsr(), '/');
4136       __ xxspltib(vec_special_case_offset, (unsigned char)SLS);
4137     }
4138 
4139     __ bind(calculate_size);
4140 
4141     // out starts at d + dp
4142     __ add(out, d, dp);
4143 
4144     // in starts at s + sp
4145     __ add(in, s, sp);
4146 
4147     __ align(32);
4148     __ bind(loop_start);
4149     __ lxv(input->to_vsr(), 0, in); // offset=0
4150 
4151     //
4152     // Lookup
4153     //
4154     if (PowerArchitecturePPC64 >= 10) {
4155       // Use xxpermx to do a lookup of each Base64 character in the
4156       // input vector and translate it to a 6-bit value + 0x80.
4157       // Characters which are not valid Base64 characters will result
4158       // in a zero in the corresponding byte.
4159       //
4160       // Note that due to align(32) call above, the xxpermx instructions do
4161       // not require align_prefix() calls, since the final xxpermx
4162       // prefix+opcode is at byte 24.
4163       __ xxpermx(xlate_a, table_32_47, table_48_63, input->to_vsr(), 1);    // offset=4
4164       __ xxpermx(xlate_b, table_64_79, table_80_95, input->to_vsr(), 2);    // offset=12
4165       __ xxlor(xlate_b, xlate_a, xlate_b);                                  // offset=20
4166       __ xxpermx(xlate_a, table_96_111, table_112_127, input->to_vsr(), 3); // offset=24
4167       __ xxlor(input->to_vsr(), xlate_a, xlate_b);
4168       // Check for non-Base64 characters by comparing each byte to zero.
4169       __ vcmpequb_(non_match, input, vec_0s);
4170     } else {
4171       // Isolate the upper 4 bits of each character by shifting it right 4 bits
4172       __ vsrb(higher_nibble, input, vec_4s);
4173       // Isolate the lower 4 bits by masking
4174       __ xxland(lower_nibble, input->to_vsr(), vec_0xfs);
4175 
4176       // Get the offset (the value to subtract from the byte) by using
4177       // a lookup table indexed by the upper 4 bits of the character
4178       __ xxperm(offsets->to_vsr(), offsetLUT, higher_nibble->to_vsr());
4179 
4180       // Find out which elements are the special case character (isURL ? '_' : '/')
4181       __ vcmpequb(eq_special_case_char, input, vec_special_case_char);
4182 
4183       // For each character in the input which is a special case
4184       // character, replace its offset with one that is special for that
4185       // character.
4186       __ xxsel(offsets->to_vsr(), offsets->to_vsr(), vec_special_case_offset, eq_special_case_char->to_vsr());
4187 
4188       // Use the lower_nibble to select a mask "M" from the lookup table.
4189       __ xxperm(M, maskLUT, lower_nibble);
4190 
4191       // "bit" is used to isolate which of the bits in M is relevant.
4192       __ xxperm(bit, bitposLUT, higher_nibble->to_vsr());
4193 
4194       // Each element of non_match corresponds to one of the 16 input
4195       // characters.  Those elements that become 0x00 after the xxland
4196       // instruction are invalid Base64 characters.
4197       __ xxland(non_match->to_vsr(), M, bit);
4198 
4199       // Compare each element to zero
4200       //
4201       __ vcmpequb_(non_match, non_match, vec_0s);
4202     }
4203     // vcmpequb_ sets the EQ bit of CR6 if no elements compare equal.
4204     // Any element comparing equal to zero means there is an error in
4205     // that element.  Note that the comparison result register
4206     // non_match is not referenced again.  Only CR6-EQ matters.
4207     __ bne_predict_not_taken(CR6, loop_exit);
4208 
4209     // The Base64 characters had no errors, so add the offsets, which in
4210     // the case of Power10 is a constant vector of all 0x80's (see earlier
4211     // comment where the offsets register is loaded).
4212     __ vaddubm(input, input, offsets);
4213 
4214     // Pack
4215     //
4216     // In the tables below, b0, b1, .. b15 are the bytes of decoded
4217     // binary data, the first line of each of the cells (except for
4218     // the constants) uses the bit-field nomenclature from the
4219     // above-linked paper, whereas the second line is more specific
4220     // about which exact bits are present, and is constructed using the
4221     // Power ISA 3.x document style, where:
4222     //
4223     // * The specifier after the colon depicts which bits are there.
4224     // * The bit numbering is big endian style (bit 0 is the most
4225     //   significant).
4226     // * || is a concatenate operator.
4227     // * Strings of 0's are a field of zeros with the shown length, and
4228     //   likewise for strings of 1's.
4229 
4230     // Note that only e12..e15 are shown here because the shifting
4231     // and OR'ing pattern replicates for e8..e11, e4..e7, and
4232     // e0..e3.
4233     //
4234     // +======================+=================+======================+======================+=============+
4235     // |        Vector        |       e12       |         e13          |         e14          |     e15     |
4236     // |       Element        |                 |                      |                      |             |
4237     // +======================+=================+======================+======================+=============+
4238     // |    after vaddubm     |    00dddddd     |       00cccccc       |       00bbbbbb       |  00aaaaaa   |
4239     // |                      |   00||b2:2..7   | 00||b1:4..7||b2:0..1 | 00||b0:6..7||b1:0..3 | 00||b0:0..5 |
4240     // +----------------------+-----------------+----------------------+----------------------+-------------+
4241     // |     pack_lshift      |                 |         << 6         |         << 4         |    << 2     |
4242     // +----------------------+-----------------+----------------------+----------------------+-------------+
4243     // |     l after vslb     |    00dddddd     |       cc000000       |       bbbb0000       |  aaaaaa00   |
4244     // |                      |   00||b2:2..7   |   b2:0..1||000000    |    b1:0..3||0000     | b0:0..5||00 |
4245     // +----------------------+-----------------+----------------------+----------------------+-------------+
4246     // |     l after vslo     |    cc000000     |       bbbb0000       |       aaaaaa00       |  00000000   |
4247     // |                      | b2:0..1||000000 |    b1:0..3||0000     |     b0:0..5||00      |  00000000   |
4248     // +----------------------+-----------------+----------------------+----------------------+-------------+
4249     // |     pack_rshift      |                 |         >> 2         |         >> 4         |             |
4250     // +----------------------+-----------------+----------------------+----------------------+-------------+
4251     // |     r after vsrb     |    00dddddd     |       0000cccc       |       000000bb       |  00aaaaaa   |
4252     // |                      |   00||b2:2..7   |    0000||b1:4..7     |   000000||b0:6..7    | 00||b0:0..5 |
4253     // +----------------------+-----------------+----------------------+----------------------+-------------+
4254     // | gathered after xxlor |    ccdddddd     |       bbbbcccc       |       aaaaaabb       |  00aaaaaa   |
4255     // |                      |     b2:0..7     |       b1:0..7        |       b0:0..7        | 00||b0:0..5 |
4256     // +======================+=================+======================+======================+=============+
4257     //
4258     // Note: there is a typo in the above-linked paper that shows the result of the gathering process is:
4259     // [ddddddcc|bbbbcccc|aaaaaabb]
4260     // but should be:
4261     // [ccdddddd|bbbbcccc|aaaaaabb]
4262     //
4263     __ vslb(l, input, pack_lshift);
4264     // vslo of vec_8s shifts the vector by one octet toward lower
4265     // element numbers, discarding element 0.  This means it actually
4266     // shifts to the right (not left) according to the order of the
4267     // table above.
4268     __ vslo(l, l, vec_8s);
4269     __ vsrb(r, input, pack_rshift);
4270     __ xxlor(gathered->to_vsr(), l->to_vsr(), r->to_vsr());
4271 
4272     // Final rearrangement of bytes into their correct positions.
4273     // +==============+======+======+======+======+=====+=====+====+====+====+====+=====+=====+=====+=====+=====+=====+
4274     // |    Vector    |  e0  |  e1  |  e2  |  e3  | e4  | e5  | e6 | e7 | e8 | e9 | e10 | e11 | e12 | e13 | e14 | e15 |
4275     // |   Elements   |      |      |      |      |     |     |    |    |    |    |     |     |     |     |     |     |
4276     // +==============+======+======+======+======+=====+=====+====+====+====+====+=====+=====+=====+=====+=====+=====+
4277     // | after xxlor  | b11  | b10  |  b9  |  xx  | b8  | b7  | b6 | xx | b5 | b4 | b3  | xx  | b2  | b1  | b0  | xx  |
4278     // +--------------+------+------+------+------+-----+-----+----+----+----+----+-----+-----+-----+-----+-----+-----+
4279     // | pack_permute |  0   |  0   |  0   |  0   |  0  |  1  | 2  | 4  | 5  | 6  |  8  |  9  | 10  | 12  | 13  | 14  |
4280     // +--------------+------+------+------+------+-----+-----+----+----+----+----+-----+-----+-----+-----+-----+-----+
4281     // | after xxperm | b11* | b11* | b11* | b11* | b11 | b10 | b9 | b8 | b7 | b6 | b5  | b4  | b3  | b2  | b1  | b0  |
4282     // +==============+======+======+======+======+=====+=====+====+====+====+====+=====+=====+=====+=====+=====+=====+
4283     // xx bytes are not used to form the final data
4284     // b0..b15 are the decoded and reassembled 8-bit bytes of data
4285     // b11 with asterisk is a "don't care", because these bytes will be
4286     // overwritten on the next iteration.
4287     __ xxperm(gathered->to_vsr(), gathered->to_vsr(), pack_permute);
4288 
4289     // We cannot use a static displacement on the store, since it's a
4290     // multiple of 12, not 16.  Note that this stxv instruction actually
4291     // writes 16 bytes, even though only the first 12 are valid data.
4292     __ stxv(gathered->to_vsr(), 0, out);
4293     __ addi(out, out, 12);
4294     __ addi(in, in, 16);
4295     __ bdnz(loop_start);
4296 
4297     __ bind(loop_exit);
4298 
4299     // Return the number of out bytes produced, which is (out - (d + dp)) == out - d - dp;
4300     __ sub(R3_RET, out, d);
4301     __ sub(R3_RET, R3_RET, dp);
4302 
4303     __ blr();
4304 
4305     __ bind(return_zero);
4306     __ li(R3_RET, 0);
4307     __ blr();
4308 
4309     return start;
4310   }
4311 
4312 #undef UC   // UC..US are short macro names (US and SLS are used above as special-case offsets); undefine them so they do not leak into the code below
4313 #undef LC
4314 #undef DIG
4315 #undef PLS
4316 #undef HYP
4317 #undef SLS
4318 #undef US
4319 
4320 // This algorithm is based on the methods described in this paper:
4321 // http://0x80.pl/notesen/2016-01-12-sse-base64-encoding.html
4322 //
4323 // The details of this implementation vary from the paper due to the
4324 // difference in the ISA between SSE and AltiVec, especially in the
4325 // splitting bytes section where there is no need on Power to mask after
4326 // the shift because the shift is byte-wise rather than across an entire
4327 // 128-bit word.
4328 //
4329 // For the lookup part of the algorithm, different logic is used than
4330 // described in the paper because of the availability of vperm, which can
4331 // do a 64-byte table lookup in four instructions, while preserving the
4332 // branchless nature.
4333 //
4334 // Description of the ENCODE_CORE macro
4335 //
4336 // Expand first 12 x 8-bit data bytes into 16 x 6-bit bytes (upper 2
4337 // bits of each byte are zeros)
4338 //
4339 // (Note: e7..e0 are not shown because they follow the same pattern as
4340 // e8..e15)
4341 //
4342 // In the table below, b0, b1, .. b15 are the bytes of unencoded
4343 // binary data, the first line of each of the cells (except for
4344 // the constants) uses the bit-field nomenclature from the
4345 // above-linked paper, whereas the second line is more specific
4346 // about which exact bits are present, and is constructed using the
4347 // Power ISA 3.x document style, where:
4348 //
4349 // * The specifier after the colon depicts which bits are there.
4350 // * The bit numbering is big endian style (bit 0 is the most
4351 //   significant).
4352 // * || is a concatenate operator.
4353 // * Strings of 0's are a field of zeros with the shown length, and
4354 //   likewise for strings of 1's.
4355 //
4356 // +==========================+=============+======================+======================+=============+=============+======================+======================+=============+
4357 // |          Vector          |     e8      |          e9          |         e10          |     e11     |     e12     |         e13          |         e14          |     e15     |
4358 // |         Element          |             |                      |                      |             |             |                      |                      |             |
4359 // +==========================+=============+======================+======================+=============+=============+======================+======================+=============+
4360 // |        after lxv         |  jjjjkkkk   |       iiiiiijj       |       gghhhhhh       |  ffffgggg   |  eeeeeeff   |       ccdddddd       |       bbbbcccc       |  aaaaaabb   |
4361 // |                          |     b7      |          b6          |          b5          |     b4      |     b3      |          b2          |          b1          |     b0      |
4362 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
4363 // |      xxperm indexes      |      0      |          10          |          11          |     12      |      0      |          13          |          14          |     15      |
4364 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
4365 // |     (1) after xxperm     |             |       gghhhhhh       |       ffffgggg       |  eeeeeeff   |             |       ccdddddd       |       bbbbcccc       |  aaaaaabb   |
4366 // |                          |    (b15)    |          b5          |          b4          |     b3      |    (b15)    |          b2          |          b1          |     b0      |
4367 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
4368 // |      rshift_amount       |      0      |          6           |          4           |      2      |      0      |          6           |          4           |      2      |
4369 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
4370 // |        after vsrb        |             |       000000gg       |       0000ffff       |  00eeeeee   |             |       000000cc       |       0000bbbb       |  00aaaaaa   |
4371 // |                          |    (b15)    |   000000||b5:0..1    |    0000||b4:0..3     | 00||b3:0..5 |    (b15)    |   000000||b2:0..1    |    0000||b1:0..3     | 00||b0:0..5 |
4372 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
4373 // |       rshift_mask        |  00000000   |      000000||11      |      0000||1111      | 00||111111  |  00000000   |      000000||11      |      0000||1111      | 00||111111  |
4374 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
4375 // |    rshift after vand     |  00000000   |       000000gg       |       0000ffff       |  00eeeeee   |  00000000   |       000000cc       |       0000bbbb       |  00aaaaaa   |
4376 // |                          |  00000000   |   000000||b5:0..1    |    0000||b4:0..3     | 00||b3:0..5 |  00000000   |   000000||b2:0..1    |    0000||b1:0..3     | 00||b0:0..5 |
4377 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
4378 // |    1 octet lshift (1)    |  gghhhhhh   |       ffffgggg       |       eeeeeeff       |             |  ccdddddd   |       bbbbcccc       |       aaaaaabb       |  00000000   |
4379 // |                          |     b5      |          b4          |          b3          |    (b15)    |     b2      |          b1          |          b0          |  00000000   |
4380 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
4381 // |      lshift_amount       |      0      |          2           |          4           |      0      |      0      |          2           |          4           |      0      |
4382 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
4383 // |        after vslb        |  gghhhhhh   |       ffgggg00       |       eeff0000       |             |  ccdddddd   |       bbcccc00       |       aabb0000       |  00000000   |
4384 // |                          |     b5      |     b4:2..7||00      |    b3:4..7||0000     |    (b15)    |   b2:0..7   |     b1:2..7||00      |    b0:4..7||0000     |  00000000   |
4385 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
4386 // |       lshift_mask        | 00||111111  |     00||1111||00     |     00||11||0000     |  00000000   | 00||111111  |     00||1111||00     |     00||11||0000     |  00000000   |
4387 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
4388 // |    lshift after vand     |  00hhhhhh   |       00gggg00       |       00ff0000       |  00000000   |  00dddddd   |       00cccc00       |       00bb0000       |  00000000   |
4389 // |                          | 00||b5:2..7 |   00||b4:4..7||00    |  00||b3:6..7||0000   |  00000000   | 00||b2:2..7 |   00||b1:4..7||00    |  00||b0:6..7||0000   |  00000000   |
4390 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
4391 // | after vor lshift, rshift |  00hhhhhh   |       00gggggg       |       00ffffff       |  00eeeeee   |  00dddddd   |       00cccccc       |       00bbbbbb       |  00aaaaaa   |
4392 // |                          | 00||b5:2..7 | 00||b4:4..7||b5:0..1 | 00||b3:6..7||b4:0..3 | 00||b3:0..5 | 00||b2:2..7 | 00||b1:4..7||b2:0..1 | 00||b0:6..7||b1:0..3 | 00||b0:0..5 |
4393 // +==========================+=============+======================+======================+=============+=============+======================+======================+=============+
4394 //
4395 // Expand the first 12 bytes into 16 bytes, leaving every 4th byte
4396 // blank for now.
4397 // __ xxperm(input->to_vsr(), input->to_vsr(), expand_permute);
4398 //
4399 // Generate two bit-shifted pieces - rshift and lshift - that will
4400 // later be OR'd together.
4401 //
4402 // First the right-shifted piece
4403 // __ vsrb(rshift, input, expand_rshift);
4404 // __ vand(rshift, rshift, expand_rshift_mask);
4405 //
4406 // Now the left-shifted piece, which is done by octet shifting
4407 // the input one byte to the left, then doing a variable shift,
4408 // followed by a mask operation.
4409 //
4410 // __ vslo(lshift, input, vec_8s);
4411 // __ vslb(lshift, lshift, expand_lshift);
4412 // __ vand(lshift, lshift, expand_lshift_mask);
4413 //
4414 // Combine the two pieces by OR'ing
4415 // __ vor(expanded, rshift, lshift);
4416 //
4417 // At this point, expanded is a vector containing a 6-bit value in each
4418 // byte.  These values are used as indexes into a 64-byte lookup table that
4419 // is contained in four vector registers.  The lookup operation is done
4420 // using vperm instructions with the same indexes for the lower 32 and
4421 // upper 32 bytes.  To figure out which of the two looked-up bytes to use
4422 // at each location, all values in expanded are compared to 31.  Using
4423 // vsel, values higher than 31 use the results from the upper 32 bytes of
4424 // the lookup operation, while values less than or equal to 31 use the
4425 // lower 32 bytes of the lookup operation.
4426 //
4427 // Note: it's tempting to use a xxpermx,xxpermx,vor sequence here on
4428 // Power10 (or later), but experiments doing so on Power10 yielded a slight
4429 // performance drop, perhaps due to the need for xxpermx instruction
4430 // prefixes.
4431 
4432 #define ENCODE_CORE /* Emits the vector encode sequence; the operand names are not macro parameters and are resolved at the expansion site. */ \
4433     __ xxperm(input->to_vsr(), input->to_vsr(), expand_permute);           /* expand the first 12 bytes into 16, leaving every 4th byte as a placeholder */ \
4434     __ vsrb(rshift, input, expand_rshift);                                 /* right-shifted piece: per-byte variable shift right */ \
4435     __ vand(rshift, rshift, expand_rshift_mask);                           /* ...then mask it with expand_rshift_mask */ \
4436     __ vslo(lshift, input, vec_8s);                                        /* left-shifted piece: octet-shift the input by one byte */ \
4437     __ vslb(lshift, lshift, expand_lshift);                                /* ...then a per-byte variable shift left */ \
4438     __ vand(lshift, lshift, expand_lshift_mask);                           /* ...then mask it with expand_lshift_mask */ \
4439     __ vor(expanded, rshift, lshift);                                      /* OR the two pieces: one 6-bit value per byte */ \
4440     __ vperm(encoded_00_31, vec_base64_00_15, vec_base64_16_31, expanded); /* look up each value in the lower 32 table bytes */ \
4441     __ vperm(encoded_32_63, vec_base64_32_47, vec_base64_48_63, expanded); /* same indexes, upper 32 table bytes */ \
4442     __ vcmpgtub(gt_31, expanded, vec_31s);                                 /* flag the values that are greater than 31 */ \
4443     __ vsel(expanded, encoded_00_31, encoded_32_63, gt_31);                /* flagged bytes take the upper-table result, others the lower; result is left in expanded */
4444 
4445 // Intrinsic function prototype in Base64.java:
4446 // private void encodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL) {
4447 
4448   address generate_base64_encodeBlock() {
4449     __ align(CodeEntryAlignment);
4450     StubId stub_id = StubId::stubgen_base64_encodeBlock_id;
4451     StubCodeMark mark(this, stub_id);
4452     address start   = __ function_entry();
4453 
4454     typedef struct {
4455       unsigned char expand_permute_val[16];
4456       unsigned char expand_rshift_val[16];
4457       unsigned char expand_rshift_mask_val[16];
4458       unsigned char expand_lshift_val[16];
4459       unsigned char expand_lshift_mask_val[16];
4460       unsigned char base64_00_15_val[16];
4461       unsigned char base64_16_31_val[16];
4462       unsigned char base64_32_47_val[16];
4463       unsigned char base64_48_63_val[16];
4464       unsigned char base64_48_63_URL_val[16];
4465     } constant_block;
4466 
4467     alignas(16) static const constant_block const_block = {
4468       .expand_permute_val = {
4469         ARRAY_TO_LXV_ORDER(
4470         0,  4,  5,  6,
4471         0,  7,  8,  9,
4472         0, 10, 11, 12,
4473         0, 13, 14, 15 ) },
4474 
4475       .expand_rshift_val = {
4476         ARRAY_TO_LXV_ORDER(
4477         0, 6, 4, 2,
4478         0, 6, 4, 2,
4479         0, 6, 4, 2,
4480         0, 6, 4, 2 ) },
4481 
4482       .expand_rshift_mask_val = {
4483         ARRAY_TO_LXV_ORDER(
4484         0b00000000, 0b00000011, 0b00001111, 0b00111111,
4485         0b00000000, 0b00000011, 0b00001111, 0b00111111,
4486         0b00000000, 0b00000011, 0b00001111, 0b00111111,
4487         0b00000000, 0b00000011, 0b00001111, 0b00111111 ) },
4488 
4489       .expand_lshift_val = {
4490         ARRAY_TO_LXV_ORDER(
4491         0, 2, 4, 0,
4492         0, 2, 4, 0,
4493         0, 2, 4, 0,
4494         0, 2, 4, 0 ) },
4495 
4496       .expand_lshift_mask_val = {
4497         ARRAY_TO_LXV_ORDER(
4498         0b00111111, 0b00111100, 0b00110000, 0b00000000,
4499         0b00111111, 0b00111100, 0b00110000, 0b00000000,
4500         0b00111111, 0b00111100, 0b00110000, 0b00000000,
4501         0b00111111, 0b00111100, 0b00110000, 0b00000000 ) },
4502 
4503       .base64_00_15_val = {
4504         ARRAY_TO_LXV_ORDER(
4505         'A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P' ) },
4506 
4507       .base64_16_31_val = {
4508         ARRAY_TO_LXV_ORDER(
4509         'Q','R','S','T','U','V','W','X','Y','Z','a','b','c','d','e','f' ) },
4510 
4511       .base64_32_47_val = {
4512         ARRAY_TO_LXV_ORDER(
4513         'g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v' ) },
4514 
4515       .base64_48_63_val = {
4516         ARRAY_TO_LXV_ORDER(
4517         'w','x','y','z','0','1','2','3','4','5','6','7','8','9','+','/' ) },
4518 
4519       .base64_48_63_URL_val = {
4520         ARRAY_TO_LXV_ORDER(
4521         'w','x','y','z','0','1','2','3','4','5','6','7','8','9','-','_' ) }
4522     };
4523 
4524     // Number of bytes to process in each pass through the main loop.
4525     // 12 of the 16 bytes from each lxv are encoded to 16 Base64 bytes.
4526     const unsigned block_size = 12;
4527 
4528     // According to the ELF V2 ABI, registers r3-r12 are volatile and available for use without save/restore
4529     Register src       = R3_ARG1; // source starting address of Base64 characters
4530     Register sp        = R4_ARG2; // source starting position
4531     Register sl        = R5_ARG3; // total source length of the Base64 characters to be processed
4532     Register dst       = R6_ARG4; // destination address
4533     Register dp        = R7_ARG5; // destination starting position
4534     Register isURL     = R8_ARG6; // boolean, if non-zero indicates use of RFC 4648 base64url encoding
4535 
4536     // Local variables
4537     Register const_ptr     = R12; // used for loading constants (reuses isURL's register)
4538     Register tmp_reg       = R9;  // used for speeding up load_constant()
4539 
4540     Register size           = R9;  // number of bytes to process (reuses tmp_reg's register)
4541     Register blocked_size   = R10; // number of bytes to process a block at a time
4542     Register block_modulo   = R12; // == block_size (reuse const_ptr)
4543     Register remaining      = R12; // bytes remaining to process after the blocks are completed (reuse block_modulo's reg)
4544     Register in             = R4;  // current input (source) pointer (reuse sp's register)
4545     Register num_blocks     = R11; // number of blocks to be processed by the loop
4546     Register out            = R8;  // current output (destination) pointer (reuse const_ptr's register)
4547     Register three          = R9;  // constant divisor (reuse size's register)
4548     Register bytes_to_write = R10; // number of bytes to write with the stxvl instr (reused blocked_size's register)
4549     Register tmp1           = R7;  // temp register for lxvl length (reuse dp's register)
4550     Register modulo_chars   = R7;  // number of bytes written during the final write % 4 (reuse tmp1's register)
4551     Register pad_char       = R6;  // literal '=' (reuse dst's register)
4552 
4553     // Volatile VSRS are 0..13, 32..51 (VR0..VR13)
4554     // VR Constants
4555     VectorRegister  vec_8s             = VR0;
4556     VectorRegister  vec_31s            = VR1;
4557     VectorRegister  vec_base64_00_15   = VR2;
4558     VectorRegister  vec_base64_16_31   = VR3;
4559     VectorRegister  vec_base64_32_47   = VR4;
4560     VectorRegister  vec_base64_48_63   = VR5;
4561     VectorRegister  expand_rshift      = VR6;
4562     VectorRegister  expand_rshift_mask = VR7;
4563     VectorRegister  expand_lshift      = VR8;
4564     VectorRegister  expand_lshift_mask = VR9;
4565 
4566     // VR variables for expand
4567     VectorRegister  input              = VR10;
4568     VectorRegister  rshift             = VR11;
4569     VectorRegister  lshift             = VR12;
4570     VectorRegister  expanded           = VR13;
4571 
4572     // VR variables for lookup
4573     VectorRegister  encoded_00_31      = VR10; // (reuse input)
4574     VectorRegister  encoded_32_63      = VR11; // (reuse rshift)
4575     VectorRegister  gt_31              = VR12; // (reuse lshift)
4576 
4577     // VSR Constants
4578     VectorSRegister expand_permute     = VSR0;
4579 
4580     Label not_URL, calculate_size, calculate_blocked_size, skip_loop;
4581     Label loop_start, le_16_to_write, no_pad, one_pad_char;
4582 
4583     // The upper 32 bits of the non-pointer parameter registers are not
4584     // guaranteed to be zero, so mask off those upper bits.
4585     __ clrldi(sp, sp, 32);
4586     __ clrldi(sl, sl, 32);
4587     __ clrldi(dp, dp, 32);
4588     __ clrldi(isURL, isURL, 32);
4589 
4590     // load up the constants
4591     __ load_const_optimized(const_ptr, (address)&const_block, tmp_reg);
4592     __ lxv(expand_permute,               BLK_OFFSETOF(expand_permute_val),     const_ptr);
4593     __ lxv(expand_rshift->to_vsr(),      BLK_OFFSETOF(expand_rshift_val),      const_ptr);
4594     __ lxv(expand_rshift_mask->to_vsr(), BLK_OFFSETOF(expand_rshift_mask_val), const_ptr);
4595     __ lxv(expand_lshift->to_vsr(),      BLK_OFFSETOF(expand_lshift_val),      const_ptr);
4596     __ lxv(expand_lshift_mask->to_vsr(), BLK_OFFSETOF(expand_lshift_mask_val), const_ptr);
4597     __ lxv(vec_base64_00_15->to_vsr(),   BLK_OFFSETOF(base64_00_15_val),       const_ptr);
4598     __ lxv(vec_base64_16_31->to_vsr(),   BLK_OFFSETOF(base64_16_31_val),       const_ptr);
4599     __ lxv(vec_base64_32_47->to_vsr(),   BLK_OFFSETOF(base64_32_47_val),       const_ptr);
4600 
4601     // Splat the constants that can use xxspltib
4602     __ xxspltib(vec_8s->to_vsr(), 8);
4603     __ xxspltib(vec_31s->to_vsr(), 31);
4604 
4605 
4606     // Use a different translation lookup table depending on the
4607     // setting of isURL
4608     __ cmpdi(CR0, isURL, 0);
4609     __ beq(CR0, not_URL);
4610     __ lxv(vec_base64_48_63->to_vsr(), BLK_OFFSETOF(base64_48_63_URL_val), const_ptr);
4611     __ b(calculate_size);
4612 
4613     __ bind(not_URL);
4614     __ lxv(vec_base64_48_63->to_vsr(), BLK_OFFSETOF(base64_48_63_val), const_ptr);
4615 
4616     __ bind(calculate_size);
4617 
4618     // size = sl - sp - 4 (*)
4619     // (*) Don't process the last four bytes in the main loop because
4620     // we don't want the lxv instruction to read past the end of the src
4621     // data, in case those four bytes are on the start of an unmapped or
4622     // otherwise inaccessible page.
4623     //
4624     __ sub(size, sl, sp);
4625     __ subi(size, size, 4);
4626     __ cmpdi(CR7, size, block_size);
4627     __ bgt(CR7, calculate_blocked_size);
4628     __ mr(remaining, size);
4629     // Add the 4 back into remaining again
4630     __ addi(remaining, remaining, 4);
4631     // make "in" point to the beginning of the source data: in = src + sp
4632     __ add(in, src, sp);
4633     // out = dst + dp
4634     __ add(out, dst, dp);
4635     __ b(skip_loop);
4636 
4637     __ bind(calculate_blocked_size);
4638     __ li(block_modulo, block_size);
4639     // num_blocks = size / block_modulo
4640     __ divwu(num_blocks, size, block_modulo);
4641     // blocked_size = num_blocks * block_modulo
4642     __ mullw(blocked_size, num_blocks, block_modulo);
4643     // remaining = size - blocked_size
4644     __ sub(remaining, size, blocked_size);
4645     __ mtctr(num_blocks);
4646 
4647     // Add the 4 back in to remaining again
4648     __ addi(remaining, remaining, 4);
4649 
4650     // make "in" point to the beginning of the source data: in = src + sp
4651     __ add(in, src, sp);
4652 
4653     // out = dst + dp
4654     __ add(out, dst, dp);
4655 
4656     __ align(32);
4657     __ bind(loop_start);
4658 
4659     __ lxv(input->to_vsr(), 0, in);
4660 
4661     ENCODE_CORE
4662 
4663     __ stxv(expanded->to_vsr(), 0, out);
4664     __ addi(in, in, 12);
4665     __ addi(out, out, 16);
4666     __ bdnz(loop_start);
4667 
4668     __ bind(skip_loop);
4669 
4670     // When there are less than 16 bytes left, we need to be careful not to
4671     // read beyond the end of the src buffer, which might be in an unmapped
4672     // page.
4673     // Load the remaining bytes using lxvl.
4674     __ rldicr(tmp1, remaining, 56, 7);
4675     __ lxvl(input->to_vsr(), in, tmp1);
4676 
4677     ENCODE_CORE
4678 
4679     // bytes_to_write = ((remaining * 4) + 2) / 3
4680     __ li(three, 3);
4681     __ rlwinm(bytes_to_write, remaining, 2, 0, 29); // remaining * 4
4682     __ addi(bytes_to_write, bytes_to_write, 2);
4683     __ divwu(bytes_to_write, bytes_to_write, three);
4684 
4685     __ cmpwi(CR7, bytes_to_write, 16);
4686     __ ble_predict_taken(CR7, le_16_to_write);
4687     __ stxv(expanded->to_vsr(), 0, out);
4688 
4689     // We've processed 12 of the 13-15 data bytes, so advance the pointers,
4690     // and do one final pass for the remaining 1-3 bytes.
4691     __ addi(in, in, 12);
4692     __ addi(out, out, 16);
4693     __ subi(remaining, remaining, 12);
4694     __ subi(bytes_to_write, bytes_to_write, 16);
4695     __ rldicr(tmp1, bytes_to_write, 56, 7);
4696     __ lxvl(input->to_vsr(), in, tmp1);
4697 
4698     ENCODE_CORE
4699 
4700     __ bind(le_16_to_write);
4701     // shift bytes_to_write into the upper 8 bits of tmp1 for use by stxvl
4702     __ rldicr(tmp1, bytes_to_write, 56, 7);
4703     __ stxvl(expanded->to_vsr(), out, tmp1);
4704     __ add(out, out, bytes_to_write);
4705 
4706     __ li(pad_char, '=');
4707     __ rlwinm_(modulo_chars, bytes_to_write, 0, 30, 31); // bytes_to_write % 4, set CR0
4708     // Examples:
4709     //    remaining  bytes_to_write  modulo_chars  num pad chars
4710     //        0            0               0            0
4711     //        1            2               2            2
4712     //        2            3               3            1
4713     //        3            4               0            0
4714     //        4            6               2            2
4715     //        5            7               3            1
4716     //        ...
4717     //       12           16               0            0
4718     //       13           18               2            2
4719     //       14           19               3            1
4720     //       15           20               0            0
4721     __ beq(CR0, no_pad);
4722     __ cmpwi(CR7, modulo_chars, 3);
4723     __ beq(CR7, one_pad_char);
4724 
4725     // two pad chars
4726     __ stb(pad_char, out);
4727     __ addi(out, out, 1);
4728 
4729     __ bind(one_pad_char);
4730     __ stb(pad_char, out);
4731 
4732     __ bind(no_pad);
4733 
4734     __ blr();
4735     return start;
4736   }
4737 
4738 #endif // VM_LITTLE_ENDIAN
4739 
4740 void generate_lookup_secondary_supers_table_stub() {
         // Emits one secondary-supers-table lookup stub per table slot and records
         // the entry address of each one in
         // StubRoutines::_lookup_secondary_supers_table_stubs[slot].
         // Nothing is returned; the entry points are published through that array.
4741     StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_id;
4742     StubCodeMark mark(this, stub_id);
4743 
         // Register assignment passed unchanged to the macro assembler for every slot.
4744     const Register
4745       r_super_klass  = R4_ARG2,
4746       r_array_base   = R3_ARG1,
4747       r_array_length = R7_ARG5,
4748       r_array_index  = R6_ARG4,
4749       r_sub_klass    = R5_ARG3,
4750       r_bitmap       = R11_scratch1,
4751       result         = R8_ARG6;
4752 
         // The stubs are emitted back to back; only the slot constant differs.
4753     for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
           // The stub for this slot begins at the current code position.
4754       StubRoutines::_lookup_secondary_supers_table_stubs[slot] = __ pc();
4755       __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass,
4756                                              r_array_base, r_array_length, r_array_index,
4757                                              r_bitmap, result, slot);
           // Each stub ends with its own return so it never falls into the next slot.
4758       __ blr();
4759     }
4760   }
4761 
4762   // Slow path implementation for UseSecondarySupersTable.
       // Emits the macro assembler's lookup_secondary_supers_table_slow_path
       // sequence followed by a return (blr).
       // Returns the address of the first instruction of the emitted stub.
4763   address generate_lookup_secondary_supers_table_slow_path_stub() {
4764     StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_slow_path_id;
4765     StubCodeMark mark(this, stub_id);
4766 
4767     address start = __ pc();
         // Register roles use the same registers as the per-slot lookup stubs;
         // R7_ARG5 serves as a temp here.
4768     const Register
4769       r_super_klass  = R4_ARG2,
4770       r_array_base   = R3_ARG1,
4771       temp1          = R7_ARG5,
4772       r_array_index  = R6_ARG4,
4773       r_bitmap       = R11_scratch1,
4774       result         = R8_ARG6;
4775 
4776     __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, result, temp1);
4777     __ blr();
4778 
4779     return start;
4780   }
4781 
4782   address generate_cont_thaw(StubId stub_id) {
         // Emits the continuation thaw stub variant selected by stub_id and returns
         // its entry address. Accepted ids: stubgen_cont_thaw_id,
         // stubgen_cont_returnBarrier_id and stubgen_cont_returnBarrierExc_id; any
         // other id hits ShouldNotReachHere(). Returns nullptr, emitting nothing,
         // when continuations are disabled.
4783     if (!Continuations::enabled()) return nullptr;
4784 
4785     Continuation::thaw_kind kind;
4786     bool return_barrier;
4787     bool return_barrier_exception;
4788 
         // Translate the stub id into the thaw kind and the two flags that steer
         // the code emission below.
4789     switch (stub_id) {
4790     case StubId::stubgen_cont_thaw_id:
4791       kind = Continuation::thaw_top;
4792       return_barrier = false;
4793       return_barrier_exception = false;
4794       break;
4795     case StubId::stubgen_cont_returnBarrier_id:
4796       kind = Continuation::thaw_return_barrier;
4797       return_barrier = true;
4798       return_barrier_exception = false;
4799       break;
4800     case StubId::stubgen_cont_returnBarrierExc_id:
4801       kind = Continuation::thaw_return_barrier_exception;
4802       return_barrier = true;
4803       return_barrier_exception = true;
4804       break;
4805     default:
4806       ShouldNotReachHere();
4807     }
4808     StubCodeMark mark(this, stub_id);
4809 
4810     Register tmp1 = R10_ARG8;
4811     Register tmp2 = R9_ARG7;
4812     Register tmp3 = R8_ARG6;
4813     Register nvtmp = R15_esp;   // nonvolatile tmp register
4814     FloatRegister nvftmp = F20; // nonvolatile fp tmp register
4815 
4816     address start = __ pc();
4817 
4818     if (kind == Continuation::thaw_top) {
4819       __ clobber_nonvolatile_registers(); // Except R16_thread and R29_TOC
4820     }
4821 
4822     if (return_barrier) {
4823       assert(!InlineTypeReturnedAsFields, "unsupported");
4824       __ mr(nvtmp, R3_RET); __ fmr(nvftmp, F1_RET); // preserve possible return value from a method returning to the return barrier
4825       DEBUG_ONLY(__ ld_ptr(tmp1, _abi0(callers_sp), R1_SP);)
           // Reset SP to the value stored at JavaThread::cont_entry_offset().
4826       __ ld_ptr(R1_SP, JavaThread::cont_entry_offset(), R16_thread);
4827 #ifdef ASSERT
           // Debug check: the callers_sp slot read before and after the SP reset must match.
4828       __ ld_ptr(tmp2, _abi0(callers_sp), R1_SP);
4829       __ cmpd(CR0, tmp1, tmp2);
4830       __ asm_assert_eq(FILE_AND_LINE ": callers sp is corrupt");
4831 #endif
4832     }
4833 #ifdef ASSERT
         // Debug check: SP must equal the thread's continuation entry for all variants.
4834     __ ld_ptr(tmp1, JavaThread::cont_entry_offset(), R16_thread);
4835     __ cmpd(CR0, R1_SP, tmp1);
4836     __ asm_assert_eq(FILE_AND_LINE ": incorrect R1_SP");
4837 #endif
4838 
         // Call Continuation::prepare_thaw(thread, return_barrier); its result in
         // R3_RET is examined below.
4839     __ li(R4_ARG2, return_barrier ? 1 : 0);
4840     __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), R16_thread, R4_ARG2);
4841 
4842 #ifdef ASSERT
         // Same SP check again after the runtime call. The DEBUG_ONLY wrappers are
         // redundant inside this #ifdef ASSERT region.
4843     DEBUG_ONLY(__ ld_ptr(tmp1, JavaThread::cont_entry_offset(), R16_thread));
4844     DEBUG_ONLY(__ cmpd(CR0, R1_SP, tmp1));
4845     __ asm_assert_eq(FILE_AND_LINE ": incorrect R1_SP");
4846 #endif
4847 
4848     // R3_RET contains the size of the frames to thaw, 0 if overflow or no more frames
4849     Label thaw_success;
4850     __ cmpdi(CR0, R3_RET, 0);
4851     __ bne(CR0, thaw_success);
         // Zero result: jump (not call) to the StackOverflowError throwing entry.
4852     __ load_const_optimized(tmp1, (SharedRuntime::throw_StackOverflowError_entry()), R0);
4853     __ mtctr(tmp1); __ bctr();
4854     __ bind(thaw_success);
4855 
         // Frame delta = -(size + frame::native_abi_reg_args_size), then aligned.
4856     __ addi(R3_RET, R3_RET, frame::native_abi_reg_args_size); // Large abi required for C++ calls.
4857     __ neg(R3_RET, R3_RET);
4858     // align down resulting in a smaller negative offset
4859     __ clrrdi(R3_RET, R3_RET, exact_log2(frame::alignment_in_bytes));
4860     DEBUG_ONLY(__ mr(tmp1, R1_SP);)
4861     __ resize_frame(R3_RET, tmp2);  // make room for the thawed frames
4862 
         // Perform the thaw; the second argument selects the variant.
4863     __ li(R4_ARG2, kind);
4864     __ call_VM_leaf(Continuation::thaw_entry(), R16_thread, R4_ARG2);
4865     __ mr(R1_SP, R3_RET); // R3_RET contains the SP of the thawed top frame
4866 
4867     if (return_barrier) {
4868       assert(!InlineTypeReturnedAsFields, "unsupported");
4869       // we're now in the caller of the frame that returned to the barrier
4870       __ mr(R3_RET, nvtmp); __ fmr(F1_RET, nvftmp); // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
4871     } else {
4872       // we're now on the yield frame (which is in an address above us b/c rsp has been pushed down)
4873       __ li(R3_RET, 0); // return 0 (success) from doYield
4874     }
4875 
4876     if (return_barrier_exception) {
           // Look up the exception handler for the return address found in the
           // thawed frame's LR slot and install it in LR, so that the final blr
           // transfers control to that handler.
4877       Register ex_pc = R17_tos;   // nonvolatile register
4878       __ ld(ex_pc, _abi0(lr), R1_SP); // LR
4879       __ mr(nvtmp, R3_RET); // save return value containing the exception oop
4880       // The thawed top frame has got a frame::java_abi. This is not sufficient for the runtime call.
4881       __ push_frame_reg_args(0, tmp1);
4882       __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), R16_thread, ex_pc);
4883       __ mtlr(R3_RET); // the exception handler
4884       __ pop_frame();
4885       // See OptoRuntime::generate_exception_blob for register arguments
4886       __ mr(R3_ARG1, nvtmp); // exception oop
4887       __ mr(R4_ARG2, ex_pc); // exception pc
4888     } else {
4889       // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
4890       __ ld(R0, _abi0(lr), R1_SP); // LR
4891       __ mtlr(R0);
4892     }
         // Branch to whatever address was installed in LR by the branch taken above.
4893     __ blr();
4894 
4895     return start;
4896   }
4897 
4898   address generate_cont_thaw() {
         // Convenience overload: emits the variant for StubId::stubgen_cont_thaw_id.
         // The result may be nullptr when continuations are disabled.
4899     return generate_cont_thaw(StubId::stubgen_cont_thaw_id);
4900   }
4901 
4902   // TODO: will probably need multiple return barriers depending on return type
4903 
4904   address generate_cont_returnBarrier() {
         // Emits the thaw stub variant for StubId::stubgen_cont_returnBarrier_id.
         // The result may be nullptr when continuations are disabled.
4905     return generate_cont_thaw(StubId::stubgen_cont_returnBarrier_id);
4906   }
4907 
4908   address generate_cont_returnBarrier_exception() {
         // Emits the thaw stub variant for StubId::stubgen_cont_returnBarrierExc_id.
         // The result may be nullptr when continuations are disabled.
4909     return generate_cont_thaw(StubId::stubgen_cont_returnBarrierExc_id);
4910   }
4911 
4912   address generate_cont_preempt_stub() {
         // Emits the continuation preempt stub and returns its entry address, or
         // nullptr when continuations are disabled.
         // The emitted code resets SP to the continuation entry and then either
         //   - removes the enterSpecial frame and returns to its caller, or
         //   - if JavaThread::preemption_cancelled is set, clears that flag and
         //     jumps to the address stored at ContinuationEntry::thaw_call_pc_address().
4913     if (!Continuations::enabled()) return nullptr;
4914     StubId stub_id = StubId::stubgen_cont_preempt_id;
4915     StubCodeMark mark(this, stub_id);
4916     address start = __ pc();
4917 
4918     __ clobber_nonvolatile_registers(); // Except R16_thread and R29_TOC
4919 
4920     __ reset_last_Java_frame(false /*check_last_java_sp*/);
4921 
4922     // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap.
4923     __ ld_ptr(R1_SP, JavaThread::cont_entry_offset(), R16_thread);
4924 
         // A non-zero preemption_cancelled byte selects the thaw path below.
4925     Label preemption_cancelled;
4926     __ lbz(R11_scratch1, in_bytes(JavaThread::preemption_cancelled_offset()), R16_thread);
4927     __ cmpwi(CR0, R11_scratch1, 0);
4928     __ bne(CR0, preemption_cancelled);
4929 
4930     // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount.
4931     SharedRuntime::continuation_enter_cleanup(_masm);
4932     __ pop_frame();
4933     __ restore_LR(R11_scratch1);
4934     __ blr();
4935 
4936     // We acquired the monitor after freezing the frames so call thaw to continue execution.
4937     __ bind(preemption_cancelled);
4938     __ li(R11_scratch1, 0); // false
4939     __ stb(R11_scratch1, in_bytes(JavaThread::preemption_cancelled_offset()), R16_thread);
         // The value returned by load_const_optimized is used as the displacement
         // of the following ld, which fetches the thaw call pc from memory.
4940     int simm16_offs = __ load_const_optimized(R11_scratch1, ContinuationEntry::thaw_call_pc_address(), R0, true);
4941     __ ld(R11_scratch1, simm16_offs, R11_scratch1);
         // Jump (no link) to the loaded pc via CTR.
4942     __ mtctr(R11_scratch1);
4943     __ bctr();
4944 
4945     return start;
4946   }
4947 
4948   // exception handler for upcall stubs
       // Emits code that verifies the oop in R3_ARG1 and calls
       // UpcallLinker::handle_uncaught_exception. Returns the stub's entry address.
4949   address generate_upcall_stub_exception_handler() {
4950     StubId stub_id = StubId::stubgen_upcall_stub_exception_handler_id;
4951     StubCodeMark mark(this, stub_id);
4952     address start = __ pc();
4953 
4954     // Native caller has no idea how to handle exceptions,
4955     // so we just crash here. Up to callee to catch exceptions.
4956     __ verify_oop(R3_ARG1);
4957     __ load_const_optimized(R12_scratch2, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception), R0);
4958     __ call_c(R12_scratch2);
         // The call above is not expected to return; trap if it does.
4959     __ should_not_reach_here();
4960 
4961     return start;
4962   }
4963 
4964   // load Method* target of MethodHandle
4965   // R3_ARG1 = jobject receiver
4966   // R19_method = result Method*
       // The emitted code follows the chain
       //   MethodHandle.form -> LambdaForm.vmentry -> MemberName.method
       //   -> ResolvedMethodName.vmtarget
       // and also stores the result in JavaThread::callee_target.
       // R22_tmp2 and R23_tmp3 are passed as temps to the oop loads.
       // Returns the stub's entry address.
4967   address generate_upcall_stub_load_target() {
4968 
4969     StubId stub_id = StubId::stubgen_upcall_stub_load_target_id;
4970     StubCodeMark mark(this, stub_id);
4971     address start = __ pc();
4972 
         // Resolve the global jobject handle in R3_ARG1 (result stays in R3_ARG1).
4973     __ resolve_global_jobject(R3_ARG1, R22_tmp2, R23_tmp3, MacroAssembler::PRESERVATION_FRAME_LR_GP_FP_REGS);
4974     // Load target method from receiver
4975     __ load_heap_oop(R19_method, java_lang_invoke_MethodHandle::form_offset(), R3_ARG1,
4976                      R22_tmp2, R23_tmp3, MacroAssembler::PRESERVATION_FRAME_LR_GP_FP_REGS, IS_NOT_NULL);
4977     __ load_heap_oop(R19_method, java_lang_invoke_LambdaForm::vmentry_offset(), R19_method,
4978                      R22_tmp2, R23_tmp3, MacroAssembler::PRESERVATION_FRAME_LR_GP_FP_REGS, IS_NOT_NULL);
4979     __ load_heap_oop(R19_method, java_lang_invoke_MemberName::method_offset(), R19_method,
4980                      R22_tmp2, R23_tmp3, MacroAssembler::PRESERVATION_FRAME_LR_GP_FP_REGS, IS_NOT_NULL);
         // The last hop is a plain 64-bit load (ld), not an oop load.
4981     __ ld(R19_method, java_lang_invoke_ResolvedMethodName::vmtarget_offset(), R19_method);
4982     __ std(R19_method, in_bytes(JavaThread::callee_target_offset()), R16_thread); // just in case callee is deoptimized
4983 
4984     __ blr();
4985 
4986     return start;
4987   }
4988 
4989   // Initialization
4990   void generate_preuniverse_stubs() {
4991     // preuniverse stubs are not needed for ppc
         // Intentionally empty; the StubGenerator constructor still calls this
         // for BlobId::stubgen_preuniverse_id.
4992   }
4993 
4994   void generate_initial_stubs() {
4995     // Generates all stubs and initializes the entry points
         // Called by the StubGenerator constructor for BlobId::stubgen_initial_id.
         // The statement order below is the order in which the stubs are emitted.
4996 
4997     // Entry points that exist in all platforms.
4998     // Note: This is code that could be shared among different platforms - however the
4999     // benefit seems to be smaller than the disadvantage of having a
5000     // much more complicated generator structure. See also comment in
5001     // stubRoutines.hpp.
5002 
5003     StubRoutines::_forward_exception_entry          = generate_forward_exception();
5004     StubRoutines::_call_stub_entry                  = generate_call_stub(StubRoutines::_call_stub_return_address);
5005     StubRoutines::_catch_exception_entry            = generate_catch_exception();
5006 
         // Create the UnsafeMemoryAccess table only if it does not exist yet.
5007     if (UnsafeMemoryAccess::_table == nullptr) {
5008       UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
5009     }
5010 
         // Both CRC variants come from the same generator, selected by stub id.
5011     // CRC32 Intrinsics.
5012     if (UseCRC32Intrinsics) {
5013       StubRoutines::_updateBytesCRC32 = generate_CRC32_updateBytes(StubId::stubgen_updateBytesCRC32_id);
5014     }
5015 
5016     // CRC32C Intrinsics.
5017     if (UseCRC32CIntrinsics) {
5018       StubRoutines::_updateBytesCRC32C = generate_CRC32_updateBytes(StubId::stubgen_updateBytesCRC32C_id);
5019     }
5020 
5021     if (VM_Version::supports_float16()) {
5022       // For results consistency both intrinsics should be enabled.
5023       StubRoutines::_hf2f = generate_float16ToFloat();
5024       StubRoutines::_f2hf = generate_floatToFloat16();
5025     }
5026   }
5027 
5028   void generate_continuation_stubs() {
         // Called by the StubGenerator constructor for BlobId::stubgen_continuation_id.
         // Each generator below returns nullptr when continuations are disabled,
         // so the corresponding StubRoutines entries are nullptr in that case.
5029     // Continuation stubs:
5030     StubRoutines::_cont_thaw          = generate_cont_thaw();
5031     StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
5032     StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
5033     StubRoutines::_cont_preempt_stub  = generate_cont_preempt_stub();
5034   }
5035 
5036   void generate_final_stubs() {
5037     // Generates all stubs and initializes the entry points
         // Called by the StubGenerator constructor for BlobId::stubgen_final_id.
5038 
5039     // support for verify_oop (must happen after universe_init)
5040     StubRoutines::_verify_oop_subroutine_entry             = generate_verify_oop();
5041 
5042     // nmethod entry barriers for concurrent class unloading
5043     StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
5044 
5045     // arraycopy stubs used by compilers
5046     generate_arraycopy_stubs();
5047 
5048 #ifdef COMPILER2
         // The slow-path stub is generated whenever UseSecondarySupersTable is set.
         // The per-slot lookup stubs are generated only when
         // InlineSecondarySupersTest is off.
5049     if (UseSecondarySupersTable) {
5050       StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
5051       if (!InlineSecondarySupersTest) {
5052         generate_lookup_secondary_supers_table_stub();
5053       }
5054     }
5055 #endif // COMPILER2
5056 
         // Upcall support stubs are generated unconditionally.
5057     StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
5058     StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();
5059   }
5060 
5061   void generate_compiler_stubs() {
         // Called by the StubGenerator constructor for BlobId::stubgen_compiler_id.
         // The whole body is compiled only with COMPILER2; without it this function
         // is empty. Each stub is generated only when its flag or CPU feature
         // check is true.
5062 #ifdef COMPILER2
5063 
5064     if (UseMultiplyToLenIntrinsic) {
5065       StubRoutines::_multiplyToLen = generate_multiplyToLen();
5066     }
5067     if (UseSquareToLenIntrinsic) {
5068       StubRoutines::_squareToLen = generate_squareToLen();
5069     }
5070     if (UseMulAddIntrinsic) {
5071       StubRoutines::_mulAdd = generate_mulAdd();
5072     }
         // The two Montgomery entries are not generated code: they point directly
         // at the SharedRuntime C++ functions.
5073     if (UseMontgomeryMultiplyIntrinsic) {
5074       StubRoutines::_montgomeryMultiply
5075         = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_multiply);
5076     }
5077     if (UseMontgomerySquareIntrinsic) {
5078       StubRoutines::_montgomerySquare
5079         = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square);
5080     }
5081 
5082     // data cache line writeback
5083     if (VM_Version::supports_data_cache_line_flush()) {
5084       StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
5085       StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
5086     }
5087 
5088     if (UseGHASHIntrinsics) {
5089       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
5090     }
5091 
5092     if (UseAESIntrinsics) {
5093       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
5094       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
5095     }
5096 
         // For each SHA flavour the same generator emits both the single-block
         // and the MB variant, selected by stub id.
5097     if (UseSHA256Intrinsics) {
5098       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(StubId::stubgen_sha256_implCompress_id);
5099       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(StubId::stubgen_sha256_implCompressMB_id);
5100     }
5101     if (UseSHA512Intrinsics) {
5102       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(StubId::stubgen_sha512_implCompress_id);
5103       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(StubId::stubgen_sha512_implCompressMB_id);
5104     }
5105 
5106 #ifdef VM_LITTLE_ENDIAN
5107     // Currently supported on PPC64LE only
5108     if (UseBASE64Intrinsics) {
5109       StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
5110       StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
5111     }
5112 #endif
5113 #endif // COMPILER2
5114   }
5115 
5116  public:
5117   StubGenerator(CodeBuffer* code, BlobId blob_id, AOTStubData *stub_data) : StubCodeGenerator(code, blob_id, stub_data) {
         // Constructing a StubGenerator performs the generation: after forwarding
         // all arguments to the StubCodeGenerator base, the constructor dispatches
         // on blob_id to the matching generate_*_stubs() function. An id not
         // listed below is a fatal error.
5118     switch(blob_id) {
5119     case BlobId::stubgen_preuniverse_id:
5120       generate_preuniverse_stubs();
5121       break;
5122     case BlobId::stubgen_initial_id:
5123       generate_initial_stubs();
5124       break;
5125     case BlobId::stubgen_continuation_id:
5126       generate_continuation_stubs();
5127       break;
5128     case BlobId::stubgen_compiler_id:
5129       generate_compiler_stubs();
5130       break;
5131     case BlobId::stubgen_final_id:
5132       generate_final_stubs();
5133       break;
5134     default:
5135       fatal("unexpected blob id: %s", StubInfo::name(blob_id));
5136       break;
5137     };
5138   }
5139 };
5140 
5141 void StubGenerator_generate(CodeBuffer* code, BlobId blob_id, AOTStubData *stub_data) {
       // Free-function entry point: all work happens in the StubGenerator
       // constructor, which emits the stubs for blob_id. The local object exists
       // only for that side effect and is destroyed on return.
5142   StubGenerator g(code, blob_id, stub_data);
5143 }
5144