Old src/hotspot/cpu/ppc/stubGenerator

   1 /*
   2  * Copyright (c) 1997, 2026, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2012, 2025 SAP SE. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "asm/macroAssembler.inline.hpp"
  27 #include "compiler/oopMap.hpp"
  28 #include "gc/shared/barrierSet.hpp"
  29 #include "gc/shared/barrierSetAssembler.hpp"
  30 #include "gc/shared/barrierSetNMethod.hpp"
  31 #include "interpreter/interpreter.hpp"
  32 #include "nativeInst_ppc.hpp"
  33 #include "oops/instanceOop.hpp"
  34 #include "oops/method.hpp"
  35 #include "oops/objArrayKlass.hpp"
  36 #include "oops/oop.inline.hpp"
  37 #include "prims/methodHandles.hpp"
  38 #include "prims/upcallLinker.hpp"
  39 #include "runtime/continuation.hpp"
  40 #include "runtime/continuationEntry.inline.hpp"
  41 #include "runtime/frame.inline.hpp"
  42 #include "runtime/handles.inline.hpp"
  43 #include "runtime/javaThread.hpp"
  44 #include "runtime/sharedRuntime.hpp"
  45 #include "runtime/stubCodeGenerator.hpp"
  46 #include "runtime/stubRoutines.hpp"
  47 #include "runtime/vm_version.hpp"
  48 #include "utilities/align.hpp"
  49 #include "utilities/powerOfTwo.hpp"
  50 #if INCLUDE_ZGC
  51 #include "gc/z/zBarrierSetAssembler.hpp"
  52 #endif
  53 
  54 // Declaration and definition of StubGenerator (no .hpp file).
  55 // For a more detailed description of the stub routine structure
  56 // see the comment in stubRoutines.hpp.
  57 
  58 #define __ _masm->
  59 
  60 #ifdef PRODUCT
  61 #define BLOCK_COMMENT(str) // nothing
  62 #else
  63 #define BLOCK_COMMENT(str) __ block_comment(str)
  64 #endif
  65 
  66 #if defined(ABI_ELFv2)
  67 #define STUB_ENTRY(name) StubRoutines::name
  68 #else
  69 #define STUB_ENTRY(name) ((FunctionDescriptor*)StubRoutines::name)->entry()
  70 #endif
  71 
  72 class StubGenerator: public StubCodeGenerator {
  73  private:
  74 
  75   // Call stubs are used to call Java from C
  76   //
  77   // Arguments:
  78   //
  79   //   R3  - call wrapper address     : address
  80   //   R4  - result                   : intptr_t*
  81   //   R5  - result type              : BasicType
  82   //   R6  - method                   : Method
  83   //   R7  - frame mgr entry point    : address
  84   //   R8  - parameter block          : intptr_t*
  85   //   R9  - parameter count in words : int
  86   //   R10 - thread                   : Thread*
  87   //
  88   address generate_call_stub(address& return_address) {
  89     // Setup a new c frame, copy java arguments, call template interpreter or
  90     // native_entry, and process result.
         //
         // Returns the entry address of the generated stub. The out-parameter
         // return_address is set to the value returned by call_stub() below
         // (the runtime value of LR for the call to the entry point).
  91 
  92     StubId stub_id = StubId::stubgen_call_stub_id;
  93     StubCodeMark mark(this, stub_id);
  94 
  95     address start = __ function_entry();
  96 
         // Size of the non-volatile register save area; used below both for the
         // frame size computation and as offset for save/restore.
  97     int save_nonvolatile_registers_size = __ save_nonvolatile_registers_size(true, SuperwordUseVSX);
  98 
  99     // some sanity checks
 100     STATIC_ASSERT(StackAlignmentInBytes == 16);
 101     assert((sizeof(frame::native_abi_minframe) % 16) == 0,    "unaligned");
 102     assert((sizeof(frame::native_abi_reg_args) % 16) == 0,    "unaligned");
 103     assert((save_nonvolatile_registers_size % 16) == 0,       "unaligned");
 104     assert((sizeof(frame::parent_ijava_frame_abi) % 16) == 0, "unaligned");
 105     assert((sizeof(frame::entry_frame_locals) % 16) == 0,     "unaligned");
 106 
         // Incoming arguments (see the register list in the comment above this function).
 107     Register r_arg_call_wrapper_addr        = R3;
 108     Register r_arg_result_addr              = R4;
 109     Register r_arg_result_type              = R5;
 110     Register r_arg_method                   = R6;
 111     Register r_arg_entry                    = R7;
 112     Register r_arg_argument_addr            = R8;
 113     Register r_arg_argument_count           = R9;
 114     Register r_arg_thread                   = R10;
 115 
 116     Register r_entryframe_fp                = R2; // volatile
 117     Register r_argument_size                = R11_scratch1; // volatile
 118     Register r_top_of_arguments_addr        = R21_tmp1;
 119 
 120     {
 121       // Stack on entry to call_stub:
 122       //
 123       //      F1      [C_FRAME]
 124       //              ...
 125       Register r_frame_size  = R12_scratch2; // volatile
 126       Label arguments_copied;
 127 
 128       // Save LR/CR to caller's C_FRAME.
 129       __ save_LR_CR(R0);
 130 
 131       // Keep copy of our frame pointer (caller's SP).
 132       __ mr(r_entryframe_fp, R1_SP);
 133 
 134       // calculate frame size
 135       STATIC_ASSERT(Interpreter::logStackElementSize == 3);
 136 
 137       // space for arguments aligned up: ((arg_count + 1) * 8) &~ 15
 138       __ addi(r_frame_size, r_arg_argument_count, 1);
 139       __ rldicr(r_frame_size, r_frame_size, 3, 63 - 4);
 140 
 141       // this is the pure space for arguments (excluding alignment padding)
 142       __ sldi(r_argument_size, r_arg_argument_count, 3);
 143 
           // Add the fixed parts of the frame: non-volatile save area, entry
           // frame locals and the top interpreter frame ABI.
 144       __ addi(r_frame_size, r_frame_size,
 145               save_nonvolatile_registers_size + frame::entry_frame_locals_size + frame::top_ijava_frame_abi_size);
 146 
 147       // push ENTRY_FRAME
 148       __ push_frame(r_frame_size, R0);
 149 
 150       // Save non-volatiles registers to ENTRY_FRAME.
 151       __ save_nonvolatile_registers(r_entryframe_fp, -(frame::entry_frame_locals_size + save_nonvolatile_registers_size),
 152                                     true, SuperwordUseVSX);
 153 
 154       BLOCK_COMMENT("Push ENTRY_FRAME including arguments");
 155       // Push ENTRY_FRAME including arguments:
 156       //
 157       //      F0      [TOP_IJAVA_FRAME_ABI]
 158       //              alignment (optional)
 159       //              [outgoing Java arguments]
 160       //              [non-volatiles]
 161       //              [ENTRY_FRAME_LOCALS]
 162       //      F1      [C_FRAME]
 163       //              ...
 164 
 165       // initialize call_stub locals (step 1)
 166       __ std(r_arg_call_wrapper_addr, _entry_frame_locals_neg(call_wrapper_address), r_entryframe_fp);
 167       __ std(r_arg_result_addr, _entry_frame_locals_neg(result_address), r_entryframe_fp);
 168       __ std(r_arg_result_type, _entry_frame_locals_neg(result_type), r_entryframe_fp);
 169       // we will save arguments_tos_address later
 170 
 171       BLOCK_COMMENT("Copy Java arguments");
 172       // copy Java arguments
 173 
 174       // Calculate top_of_arguments_addr which will be R17_tos (not prepushed) later.
 175       __ addi(r_top_of_arguments_addr, r_entryframe_fp,
 176               -(save_nonvolatile_registers_size + frame::entry_frame_locals_size));
 177       __ sub(r_top_of_arguments_addr, r_top_of_arguments_addr, r_argument_size);
 178 
 179       // any arguments to copy?
 180       __ cmpdi(CR0, r_arg_argument_count, 0);
 181       __ beq(CR0, arguments_copied);
 182 
 183       // prepare loop and copy arguments in reverse order
 184       {
 185         Register r_argument_addr     = R22_tmp2;
 186         Register r_argumentcopy_addr = R23_tmp3;
 187         // init CTR with arg_argument_count
 188         __ mtctr(r_arg_argument_count);
 189 
 190         // let r_argumentcopy_addr point to last outgoing Java arguments P
 191         __ mr(r_argumentcopy_addr, r_top_of_arguments_addr);
 192 
 193         // let r_argument_addr point to last incoming java argument
 194         __ add(r_argument_addr, r_arg_argument_addr, r_argument_size);
 195         __ addi(r_argument_addr, r_argument_addr, -BytesPerWord);
 196 
 197         // now loop while CTR > 0 and copy arguments
             // (source pointer walks downwards, destination pointer walks upwards)
 198         {
 199           Label next_argument;
 200           __ bind(next_argument);
 201 
 202           __ ld(R0, 0, r_argument_addr);
 203           // argument_addr--;
 204           __ addi(r_argument_addr, r_argument_addr, -BytesPerWord);
 205           __ std(R0, 0, r_argumentcopy_addr);
 206           // argumentcopy_addr++;
 207           __ addi(r_argumentcopy_addr, r_argumentcopy_addr, BytesPerWord);
 208 
 209           __ bdnz(next_argument);
 210         }
 211       }
 212 
 213       // Arguments copied, continue.
 214       __ bind(arguments_copied);
 215     }
 216 
 217     {
 218       BLOCK_COMMENT("Call template interpreter or native entry.");
 219       assert_different_registers(r_arg_entry, r_top_of_arguments_addr, r_arg_method, r_arg_thread);
 220 
 221       // Register state on entry to template interpreter / native entry:
 222       //
 223       //   tos         -  intptr_t*    sender tos (prepushed) Lesp = (SP) + copied_arguments_offset - 8
 224       //   R19_method  -  Method
 225       //   R16_thread  -  JavaThread*
 226 
 227       // Tos must point to last argument - element_size.
 228       const Register tos = R15_esp;
 229 
 230       __ addi(tos, r_top_of_arguments_addr, -Interpreter::stackElementSize);
 231 
 232       // initialize call_stub locals (step 2)
 233       // now save tos as arguments_tos_address
 234       __ std(tos, _entry_frame_locals_neg(arguments_tos_address), r_entryframe_fp);
 235 
 236       // load argument registers for call
 237       __ mr(R19_method, r_arg_method);
 238       __ mr(R16_thread, r_arg_thread);
 239       assert(tos != r_arg_method, "trashed r_arg_method");
 240       assert(tos != r_arg_thread && R19_method != r_arg_thread, "trashed r_arg_thread");
 241 
 242       // Load the interpreter dispatch table address for TosState 0 into R25_templateTableBase.
 243       __ load_const_optimized(R25_templateTableBase, (address)Interpreter::dispatch_table((TosState)0), R0);
 244       // Stack on entry to template interpreter / native entry:
 245       //
 246       //      F0      [TOP_IJAVA_FRAME_ABI]
 247       //              alignment (optional)
 248       //              [outgoing Java arguments]
 249       //              [non-volatiles]
 250       //              [ENTRY_FRAME_LOCALS]
 251       //      F1      [C_FRAME]
 252       //              ...
 253       //
 254 
 255       // global toc register
 256       __ load_const_optimized(R29_TOC, MacroAssembler::global_toc(), R0);
 257       // Remember the senderSP so the interpreter can pop c2i arguments off of the stack
 258       // when called via a c2i.
 259 
 260       // Pass initial_caller_sp to framemanager.
 261       __ mr(R21_sender_SP, R1_SP);
 262 
 263       // Do a light-weight C-call here, r_arg_entry holds the address
 264       // of the interpreter entry point (template interpreter or native entry)
 265       // and save runtime-value of LR in return_address.
 266       assert(r_arg_entry != tos && r_arg_entry != R19_method && r_arg_entry != R16_thread,
 267              "trashed r_arg_entry");
 268       return_address = __ call_stub(r_arg_entry);
 269     }
 270 
 271     {
 272       BLOCK_COMMENT("Returned from template interpreter or native entry.");
 273       // Now pop frame, process result, and return to caller.
 274 
 275       // Stack on exit from template interpreter / native entry:
 276       //
 277       //      F0      [ABI]
 278       //              ...
 279       //              [non-volatiles]
 280       //              [ENTRY_FRAME_LOCALS]
 281       //      F1      [C_FRAME]
 282       //              ...
 283       //
 284       // Just pop the topmost frame ...
 285       //
 286 
 287       Label ret_is_object;
 288       Label ret_is_long;
 289       Label ret_is_float;
 290       Label ret_is_double;
 291 
 292       Register r_lr = R11_scratch1;
 293       Register r_cr = R12_scratch2;
 294 
 295       // Reload some volatile registers which we've spilled before the call
 296       // to template interpreter / native entry.
 297       // Access all locals via frame pointer, because we know nothing about
 298       // the topmost frame's size.
 299       __ ld(r_entryframe_fp, _abi0(callers_sp), R1_SP); // restore after call
 300       assert_different_registers(r_entryframe_fp, R3_RET, r_arg_result_addr, r_arg_result_type, r_cr, r_lr);
 301       __ ld(r_arg_result_addr, _entry_frame_locals_neg(result_address), r_entryframe_fp);
 302       __ ld(r_arg_result_type, _entry_frame_locals_neg(result_type), r_entryframe_fp);
 303       __ ld(r_cr, _abi0(cr), r_entryframe_fp);
 304       __ ld(r_lr, _abi0(lr), r_entryframe_fp);
 305       __ mtcr(r_cr); // restore CR
 306       __ mtlr(r_lr); // restore LR
 307 
 308       // Store result depending on type. Everything that is not
 309       // T_OBJECT, T_LONG, T_FLOAT, or T_DOUBLE is treated as T_INT.
 310       // Using volatile CRs.
           // The comparisons are done here, while r_arg_result_type is loaded;
           // the conditional branches follow after the frame has been popped.
           // CR0 is not used for them (pop_cont_fastpath below is noted as killing CR0).
 311       __ cmpwi(CR1, r_arg_result_type, T_OBJECT);
 312       __ cmpwi(CR5, r_arg_result_type, T_LONG);
 313       __ cmpwi(CR6, r_arg_result_type, T_FLOAT);
 314       __ cmpwi(CR7, r_arg_result_type, T_DOUBLE);
 315 
 316       __ pop_cont_fastpath(); // kills CR0, uses R16_thread
 317 
 318       // restore non-volatile registers
 319       __ restore_nonvolatile_registers(r_entryframe_fp, -(frame::entry_frame_locals_size + save_nonvolatile_registers_size),
 320                                        true, SuperwordUseVSX);
 321 
 322       // pop frame
 323       __ mr(R1_SP, r_entryframe_fp);
 324 
 325       // Stack on exit from call_stub:
 326       //
 327       //      0       [C_FRAME]
 328       //              ...
 329       //
 330       //  no call_stub frames left.
 331 
           // Dispatch on the result type using the condition fields set above.
 332       __ beq(CR1, ret_is_object);
 333       __ beq(CR5, ret_is_long);
 334       __ beq(CR6, ret_is_float);
 335       __ beq(CR7, ret_is_double);
 336 
 337       // default:
           // store R3_RET as a 32-bit word (T_INT treatment)
 338       __ stw(R3_RET, 0, r_arg_result_addr);
 339       __ blr(); // return to caller
 340 
 341       // case T_OBJECT:
 342       // case T_LONG:
           // both labels share one 64-bit store of R3_RET
 343       __ bind(ret_is_object);
 344       __ bind(ret_is_long);
 345       __ std(R3_RET, 0, r_arg_result_addr);
 346       __ blr(); // return to caller
 347 
 348       // case T_FLOAT:
 349       __ bind(ret_is_float);
 350       __ stfs(F1_RET, 0, r_arg_result_addr);
 351       __ blr(); // return to caller
 352 
 353       // case T_DOUBLE:
 354       __ bind(ret_is_double);
 355       __ stfd(F1_RET, 0, r_arg_result_addr);
 356       __ blr(); // return to caller
 357     }
 358 
 359     return start;
 360   }
 361 
 362   // Return point for a Java call if there's an exception thrown in
 363   // Java code.  The exception is caught and transformed into a
 364   // pending exception stored in JavaThread that can be tested from
 365   // within the VM.
 366   //
 367   address generate_catch_exception() {
         // Emits a stub that records the exception passed in R3_ARG1 as the
         // thread's pending exception (together with this file's name and a
         // line number) and then jumps to the address passed in R4_ARG2.
         // Returns the entry address of the generated code.
 368     StubId stub_id = StubId::stubgen_catch_exception_id;
 369     StubCodeMark mark(this, stub_id);
 370 
 371     address start = __ pc();
 372 
 373     // Registers alive
 374     //
 375     //  R16_thread
 376     //  R3_ARG1 - address of pending exception
 377     //  R4_ARG2 - return address in call stub
 378 
 379     const Register exception_file = R21_tmp1;
 380     const Register exception_line = R22_tmp2;
 381 
         // __FILE__ / __LINE__ are evaluated when this generator is compiled,
         // i.e. they identify this source location, not the throwing Java code.
 382     __ load_const(exception_file, (void*)__FILE__);
 383     __ load_const(exception_line, (void*)__LINE__);
 384 
 385     __ std(R3_ARG1, in_bytes(JavaThread::pending_exception_offset()), R16_thread);
 386     // store into `char *'
 387     __ std(exception_file, in_bytes(JavaThread::exception_file_offset()), R16_thread);
 388     // store into `int'
 389     __ stw(exception_line, in_bytes(JavaThread::exception_line_offset()), R16_thread);
 390 
 391     // complete return to VM
 392     assert(StubRoutines::_call_stub_return_address != nullptr, "must have been generated before");
 393 
         // Branch to the address in R4_ARG2 via LR.
 394     __ mtlr(R4_ARG2);
 395     // continue in call stub
 396     __ blr();
 397 
 398     return start;
 399   }
 400 
 401   // Continuation point for runtime calls returning with a pending
 402   // exception.  The pending exception check happened in the runtime
 403   // or native call stub.  The pending exception in Thread is
 404   // converted into a Java-level exception.
 405   //
 406   // Read:
 407   //
 408   //   LR:     The pc the runtime library callee wants to return to.
 409   //           Since the exception occurred in the callee, the return pc
 410   //           from the point of view of Java is the exception pc.
 411   //   thread: Needed for method handles.
 412   //
 413   // Invalidate:
 414   //
 415   //   volatile registers (except below).
 416   //
 417   // Update:
 418   //
 419   //   R4_ARG2: exception
 420   //
 421   // (LR is unchanged and is live out).
 422   //
 423   address generate_forward_exception() {
         // Emits a stub that looks up the exception handler for the return
         // address in LR, loads the pending exception into R3_ARG1 and the
         // exception pc into R4_ARG2, clears the pending exception field and
         // jumps to the handler. Returns the entry address of the generated code.
 424     StubId stub_id = StubId::stubgen_forward_exception_id;
 425     StubCodeMark mark(this, stub_id);
 426     address start = __ pc();
 427 
 428     if (VerifyOops) {
 429       // Get pending exception oop.
 430       __ ld(R3_ARG1,
 431                 in_bytes(Thread::pending_exception_offset()),
 432                 R16_thread);
 433       // Make sure that this code is only executed if there is a pending exception.
 434       {
 435         Label L;
 436         __ cmpdi(CR0, R3_ARG1, 0);
 437         __ bne(CR0, L);
 438         __ stop("StubRoutines::forward exception: no pending exception (1)");
 439         __ bind(L);
 440       }
 441       __ verify_oop(R3_ARG1, "StubRoutines::forward exception: not an oop");
 442     }
 443 
 444     // Save LR/CR and copy exception pc (LR) into R4_ARG2.
         // NOTE(review): only save_LR() is called here; the "LR/CR" wording
         // above may be stale - confirm against MacroAssembler::save_LR.
 445     __ save_LR(R4_ARG2);
 446     __ push_frame_reg_args(0, R0);
 447     // Find exception handler.
 448     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 449                      SharedRuntime::exception_handler_for_return_address),
 450                     R16_thread,
 451                     R4_ARG2);
 452     // Copy handler's address.
         // (kept in CTR so that the final bctr can jump to it)
 453     __ mtctr(R3_RET);
 454     __ pop_frame();
 455     __ restore_LR(R0);
 456 
 457     // Set up the arguments for the exception handler:
 458     //  - R3_ARG1: exception oop
 459     //  - R4_ARG2: exception pc.
 460 
 461     // Load pending exception oop.
 462     __ ld(R3_ARG1,
 463               in_bytes(Thread::pending_exception_offset()),
 464               R16_thread);
 465 
 466     // The exception pc is the return address in the caller.
 467     // Must load it into R4_ARG2.
 468     __ mflr(R4_ARG2);
 469 
 470 #ifdef ASSERT
 471     // Make sure exception is set.
 472     {
 473       Label L;
 474       __ cmpdi(CR0, R3_ARG1, 0);
 475       __ bne(CR0, L);
 476       __ stop("StubRoutines::forward exception: no pending exception (2)");
 477       __ bind(L);
 478     }
 479 #endif
 480 
 481     // Clear the pending exception.
 482     __ li(R0, 0);
 483     __ std(R0,
 484                in_bytes(Thread::pending_exception_offset()),
 485                R16_thread);
 486     // Jump to exception handler.
 487     __ bctr();
 488 
 489     return start;
 490   }
 491 
 492 #undef __
 493 #define __ _masm->
 494 
 495 #if !defined(PRODUCT)
 496   // Wrapper which calls oopDesc::is_oop_or_null()
 497   // Only called by MacroAssembler::verify_oop
 498   static void verify_oop_helper(const char* message, oopDesc* o) {
 499     const bool is_valid = oopDesc::is_oop_or_null(o);
 500     if (!is_valid) { fatal("%s. oop: " PTR_FORMAT, message, p2i(o)); }
 501     // Bump the global counter of performed oop verifications.
 502     StubRoutines::_verify_oop_count++;
 503   }
 504 #endif
 505 
 506   // Return address of code to be called from code generated by
 507   // MacroAssembler::verify_oop.
 508   //
 509   // Don't generate, rather use C++ code.
 510   address generate_verify_oop() {
 511     // No code is generated here. The result is actually a `FunctionDescriptor*'
 512     // for the C++ helper, or nullptr when the helper is compiled out (PRODUCT).
 513 #if !defined(PRODUCT)
 514     address helper_entry = CAST_FROM_FN_PTR(address, verify_oop_helper);
 515 #else
 516     address helper_entry = nullptr;
 517 #endif
 518     return helper_entry;
 519   }
 520 
 521   // Computes the Galois/Counter Mode (GCM) product and reduction.
 522   //
 523   // This function performs polynomial multiplication of the subkey H with
 524   // the current GHASH state using vectorized polynomial multiplication (`vpmsumd`).
 525   // The subkey H is divided into lower, middle, and higher halves.
 526   // The multiplication results are reduced using `vConstC2` to stay within GF(2^128).
 527   // The final computed value is stored back into `vState`.
 528   static void computeGCMProduct(MacroAssembler* _masm,
 529                                 VectorRegister vLowerH, VectorRegister vH, VectorRegister vHigherH,
 530                                 VectorRegister vConstC2, VectorRegister vZero, VectorRegister vState,
 531                                 VectorRegister vLowProduct, VectorRegister vMidProduct, VectorRegister vHighProduct,
 532                                 VectorRegister vReducedLow, VectorRegister vTmp8, VectorRegister vTmp9,
 533                                 VectorRegister vCombinedResult, VectorRegister vSwappedH) {
         // Emits code through _masm (the `__' macro expands to `_masm->').
         // Register usage of the emitted sequence:
         //   read only:   vLowerH, vHigherH, vSwappedH, vConstC2, vZero
         //   overwritten: vH (XORed with vState first), vLowProduct, vMidProduct,
         //                vHighProduct, vReducedLow, vTmp8, vTmp9, vCombinedResult
         //   result:      vState
 534     __ vxor(vH, vH, vState);
 535     __ vpmsumd(vLowProduct, vLowerH, vH);                          // L : Lower Half of subkey H
 536     __ vpmsumd(vMidProduct, vSwappedH, vH);                        // M : Combined halves of subkey H
 537     __ vpmsumd(vHighProduct, vHigherH, vH);                        // H : Higher Half of subkey H
 538     __ vpmsumd(vReducedLow, vLowProduct, vConstC2);                // Reduction
 539     __ vsldoi(vTmp8, vMidProduct, vZero, 8);                       // mL : Extract the lower 64 bits of M
 540     __ vsldoi(vTmp9, vZero, vMidProduct, 8);                       // mH : Extract the higher 64 bits of M
 541     __ vxor(vLowProduct, vLowProduct, vTmp8);                      // LL + mL : Partial result for lower half
 542     __ vxor(vHighProduct, vHighProduct, vTmp9);                    // HH + mH : Partial result for upper half
 543     __ vsldoi(vLowProduct, vLowProduct, vLowProduct, 8);           // Swap
 544     __ vxor(vLowProduct, vLowProduct, vReducedLow);
 545     __ vsldoi(vCombinedResult, vLowProduct, vLowProduct, 8);       // Swap
 546     __ vpmsumd(vLowProduct, vLowProduct, vConstC2);                // Reduction using constant
 547     __ vxor(vCombinedResult, vCombinedResult, vHighProduct);       // Combine reduced Low & High products
 548     __ vxor(vState, vLowProduct, vCombinedResult);
 549   }
 550 
 551   // Generate stub for ghash process blocks.
 552   //
 553   // Arguments for generated stub:
 554   //      state:    R3_ARG1 (long[] state)
 555   //      subkeyH:  R4_ARG2 (long[] subH)
 556   //      data:     R5_ARG3 (byte[] data)
 557   //      blocks:   R6_ARG4 (number of 16-byte blocks to process)
 558   //
 559   // The polynomials are processed in bit-reflected order for efficiency reasons.
 560   // This optimization leverages the structure of the Galois field arithmetic
 561   // to minimize the number of bit manipulations required during multiplication.
 562   // For an explanation of how this works, refer to:
 563   // Vinodh Gopal, Erdinc Ozturk, Wajdi Feghali, Jim Guilford, Gil Wolrich,
 564   // Martin Dixon. "Optimized Galois-Counter-Mode Implementation on Intel®
 565   // Architecture Processor"
 566   // http://web.archive.org/web/20130609111954/http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/communications-ia-galois-counter-mode-paper.pdf
 567   //
 568   //
 569   address generate_ghash_processBlocks() {
         // Emits the GHASH block-processing stub and returns its entry address.
         // The emitted code loads the state and subkey H, derives the H halves
         // once, then folds `blocks' 16-byte blocks of `data' into the state via
         // computeGCMProduct and stores the state back.
         //
         // NOTE(review): this stub uses the (group, name) string form of
         // StubCodeMark while the other generators in this file pass a StubId -
         // confirm whether a StubId exists for this stub and should be used.
 570     StubCodeMark mark(this, "StubRoutines", "ghash");
 571     address start = __ function_entry();
 572 
 573     // Registers for parameters
 574     Register state = R3_ARG1;                     // long[] state
 575     Register subkeyH = R4_ARG2;                   // long[] subH
 576     Register data = R5_ARG3;                      // byte[] data
 577     Register blocks = R6_ARG4;                    // number of 16-byte blocks
 578     Register temp1 = R8;
 579     // Vector Registers
 580     VectorRegister vZero = VR0;
 581     VectorRegister vH = VR1;
 582     VectorRegister vLowerH = VR2;
 583     VectorRegister vHigherH = VR3;
 584     VectorRegister vLowProduct = VR4;
 585     VectorRegister vMidProduct = VR5;
 586     VectorRegister vHighProduct = VR6;
 587     VectorRegister vReducedLow = VR7;
 588     VectorRegister vTmp8 = VR8;
 589     VectorRegister vTmp9 = VR9;
 590     VectorRegister vTmp10 = VR10;
 591     VectorRegister vSwappedH = VR11;
 592     VectorRegister vTmp12 = VR12;
 593     VectorRegister loadOrder = VR13;
 594     VectorRegister vHigh = VR14;
 595     VectorRegister vLow = VR15;
 596     VectorRegister vState = VR16;
 597     VectorRegister vPerm = VR17;
 598     VectorRegister vCombinedResult = VR18;
 599     VectorRegister vConstC2 = VR19;
 600 
         // Build the reduction constant: temp1 = 0xc2 << 56, moved into vConstC2.
 601     __ li(temp1, 0xc2);
 602     __ sldi(temp1, temp1, 56);
 603     __ vspltisb(vZero, 0);
 604     __ mtvrd(vConstC2, temp1);
 605     __ lxvd2x(vH->to_vsr(), subkeyH);
 606     __ lxvd2x(vState->to_vsr(), state);
 607     // Operations to obtain lower and higher bytes of subkey H.
 608     __ vspltisb(vReducedLow, 1);
 609     __ vspltisb(vTmp10, 7);
 610     __ vsldoi(vTmp8, vZero, vReducedLow, 1);            // 0x1
 611     __ vor(vTmp8, vConstC2, vTmp8);                     // 0xC2...1
 612     __ vsplt(vTmp9, 0, vH);                             // MSB of H
 613     __ vsl(vH, vH, vReducedLow);                        // Carry = H<<7
 614     __ vsrab(vTmp9, vTmp9, vTmp10);
 615     __ vand(vTmp9, vTmp9, vTmp8);                       // Carry
 616     __ vxor(vTmp10, vH, vTmp9);
 617     __ vsldoi(vConstC2, vZero, vConstC2, 8);
 618     __ vsldoi(vSwappedH, vTmp10, vTmp10, 8);            // swap Lower and Higher Halves of subkey H
 619     __ vsldoi(vLowerH, vZero, vSwappedH, 8);            // H.L
 620     __ vsldoi(vHigherH, vSwappedH, vZero, 8);           // H.H
 621 #ifdef ASSERT
 622     __ cmpwi(CR0, blocks, 0);                           // Compare 'blocks' (R6_ARG4) with zero
 623     __ asm_assert_ne("blocks should NOT be zero");
 624 #endif
         // Clear the upper 32 bits of `blocks' and use it as loop count in CTR.
 625     __ clrldi(blocks, blocks, 32);
 626     __ mtctr(blocks);
 627     __ lvsl(loadOrder, temp1);
         // LE_swap_bytes(x) permutes x with loadOrder on little-endian builds and
         // expands to nothing otherwise. The macro is not #undef'ed in this function.
 628 #ifdef VM_LITTLE_ENDIAN
 629     __ vspltisb(vTmp12, 0xf);
 630     __ vxor(loadOrder, loadOrder, vTmp12);
 631 #define LE_swap_bytes(x) __ vec_perm(x, x, x, loadOrder)
 632 #else
 633 #define LE_swap_bytes(x)
 634 #endif
 635 
 636     // This code performs Karatsuba multiplication in Galois fields to compute the GHASH operation.
 637     //
 638     // The Karatsuba method breaks the multiplication of two 128-bit numbers into smaller parts,
 639     // performing three 128-bit multiplications and combining the results efficiently.
 640     //
 641     // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
 642     // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
 643     //
 644     // Inputs:
 645     // - vH:       The data vector (state), containing both B0 (lower half) and B1 (higher half).
 646     // - vLowerH:  Lower half of the subkey H (A0).
 647     // - vHigherH: Higher half of the subkey H (A1).
 648     // - vConstC2: Constant used for reduction (for final processing).
 649     //
 650     // References:
 651     // Shay Gueron, Michael E. Kounavis.
 652     // "Intel® Carry-Less Multiplication Instruction and its Usage for Computing the GCM Mode"
 653     // https://web.archive.org/web/20110609115824/https://software.intel.com/file/24918
 654     //
 655     Label L_aligned_loop, L_store, L_unaligned_loop, L_initialize_unaligned_loop;
         // Select the loop: temp1 = data & 15; non-zero means data is not 16-byte aligned.
 656     __ andi(temp1, data, 15);
 657     __ cmpwi(CR0, temp1, 0);
 658     __ bne(CR0, L_initialize_unaligned_loop);
 659 
         // Aligned path: temp1 is 0 here, so each lvx loads the block at `data'.
 660     __ bind(L_aligned_loop);
 661       __ lvx(vH, temp1, data);
 662       LE_swap_bytes(vH);
 663       computeGCMProduct(_masm, vLowerH, vH, vHigherH, vConstC2, vZero, vState,
 664                     vLowProduct, vMidProduct, vHighProduct, vReducedLow, vTmp8, vTmp9, vCombinedResult, vSwappedH);
 665       __ addi(data, data, 16);
 666     __ bdnz(L_aligned_loop);
 667     __ b(L_store);
 668 
         // Unaligned path: load one vector up front into vHigh; every iteration
         // loads the next vector into vLow and combines both through vPerm.
 669     __ bind(L_initialize_unaligned_loop);
 670     __ li(temp1, 0);
 671     __ lvsl(vPerm, temp1, data);
 672     __ lvx(vHigh, temp1, data);
 673 #ifdef VM_LITTLE_ENDIAN
 674     __ vspltisb(vTmp12, -1);
 675     __ vxor(vPerm, vPerm, vTmp12);
 676 #endif
 677     __ bind(L_unaligned_loop);
 678       __ addi(data, data, 16);
 679       __ lvx(vLow, temp1, data);
 680       __ vec_perm(vH, vHigh, vLow, vPerm);
 681       computeGCMProduct(_masm, vLowerH, vH, vHigherH, vConstC2, vZero, vState,
 682                     vLowProduct, vMidProduct, vHighProduct, vReducedLow, vTmp8, vTmp9, vCombinedResult, vSwappedH);
           // The vector just loaded becomes the leading vector of the next iteration.
 683       __ vmr(vHigh, vLow);
 684     __ bdnz(L_unaligned_loop);
 685 
         // Both paths end here: write the updated state back and return.
 686     __ bind(L_store);
 687     __ stxvd2x(vState->to_vsr(), state);
 688     __ blr();
 689 
 690     return start;
 691   }
 692   // -XX:+OptimizeFill : convert fill/copy loops into intrinsic
 693   //
 694   // The code is implemented (ported from sparc) as we believe it benefits JVM98; however,
 695   // tracing (-XX:+TraceOptimizeFill) shows the intrinsic replacement doesn't happen at all!
 696   //
 697   // Source code in function is_range_check_if() shows that OptimizeFill relaxed the condition
 698   // for turning on loop predication optimization, and hence the behavior of "array range check"
 699   // and "loop invariant check" could be influenced, which potentially boosted JVM98.
 700   //
 701   // Generate stub for disjoint short fill. If "aligned" is true, the
 702   // "to" address is assumed to be heapword aligned.
 703   //
 704   // Arguments for generated stub:
 705   //   to:    R3_ARG1
 706   //   value: R4_ARG2
 707   //   count: R5_ARG3 treated as signed
 708   //
 709   address generate_fill(StubId stub_id) {
         // Emits the fill stub selected by stub_id and returns its entry address.
         // stub_id determines the element type (byte, short or int) and whether
         // the "to" address may be assumed aligned (arrayof_* ids); for aligned
         // stubs the alignment prologue below is not emitted.
         // The emitted code stores "value" into "count" elements starting at "to".
 710     BasicType t;
 711     bool aligned;
 712
 713     switch (stub_id) {
 714     case StubId::stubgen_jbyte_fill_id:
 715       t = T_BYTE;
 716       aligned = false;
 717       break;
 718     case StubId::stubgen_jshort_fill_id:
 719       t = T_SHORT;
 720       aligned = false;
 721       break;
 722     case StubId::stubgen_jint_fill_id:
 723       t = T_INT;
 724       aligned = false;
 725       break;
 726     case StubId::stubgen_arrayof_jbyte_fill_id:
 727       t = T_BYTE;
 728       aligned = true;
 729       break;
 730     case StubId::stubgen_arrayof_jshort_fill_id:
 731       t = T_SHORT;
 732       aligned = true;
 733       break;
 734     case StubId::stubgen_arrayof_jint_fill_id:
 735       t = T_INT;
 736       aligned = true;
 737       break;
 738     default:
 739       ShouldNotReachHere();
 740     }
 741
 742     StubCodeMark mark(this, stub_id);
 743     address start = __ function_entry();
 744
 745     const Register to    = R3_ARG1;   // destination array address
 746     const Register value = R4_ARG2;   // fill value
 747     const Register count = R5_ARG3;   // elements count
 748     const Register temp  = R6_ARG4;   // temp register
 749
 750     //assert_clean_int(count, O3);    // Make sure 'count' is clean int.
 751
 752     Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte;
 753     Label L_fill_2_bytes, L_fill_4_bytes, L_fill_elements, L_fill_32_bytes;
 754
         // shift is log2 of the number of elements per 4 bytes, so that
         // (1 << shift), (2 << shift) and (8 << shift) are the element counts
         // of 4, 8 and 32 bytes respectively.
 755     int shift = -1;
 756     switch (t) {
 757        case T_BYTE:
 758         shift = 2;
 759         // Clone bytes (zero extend not needed because store instructions below ignore high order bytes).
 760         __ rldimi(value, value, 8, 48);     // 8 bit -> 16 bit
 761         __ cmpdi(CR0, count, 2<<shift);    // Short arrays (< 8 bytes) fill by element.
 762         __ blt(CR0, L_fill_elements);
 763         __ rldimi(value, value, 16, 32);    // 16 bit -> 32 bit
 764         break;
 765        case T_SHORT:
 766         shift = 1;
 767         // Clone bytes (zero extend not needed because store instructions below ignore high order bytes).
 768         __ rldimi(value, value, 16, 32);    // 16 bit -> 32 bit
 769         __ cmpdi(CR0, count, 2<<shift);    // Short arrays (< 8 bytes) fill by element.
 770         __ blt(CR0, L_fill_elements);
 771         break;
 772       case T_INT:
 773         shift = 0;
 774         __ cmpdi(CR0, count, 2<<shift);    // Short arrays (< 8 bytes) fill by element.
 775         __ blt(CR0, L_fill_4_bytes);
 776         break;
 777       default: ShouldNotReachHere();
 778     }
 779
 780     if (!aligned && (t == T_BYTE || t == T_SHORT)) {
 781       // Align destination address at 4 bytes address boundary.
 782       if (t == T_BYTE) {
 783         // One byte misalignment happens only for byte arrays.
 784         __ andi_(temp, to, 1);
 785         __ beq(CR0, L_skip_align1);
 786         __ stb(value, 0, to);
 787         __ addi(to, to, 1);
 788         __ addi(count, count, -1);
 789         __ bind(L_skip_align1);
 790       }
 791       // Two bytes misalignment happens only for byte and short (char) arrays.
 792       __ andi_(temp, to, 2);
 793       __ beq(CR0, L_skip_align2);
 794       __ sth(value, 0, to);
 795       __ addi(to, to, 2);
 796       __ addi(count, count, -(1 << (shift - 1)));
 797       __ bind(L_skip_align2);
 798     }
 799
 800     if (!aligned) {
 801       // Align to 8 bytes, we know we are 4 byte aligned to start.
 802       __ andi_(temp, to, 7);
 803       __ beq(CR0, L_fill_32_bytes);
 804       __ stw(value, 0, to);
 805       __ addi(to, to, 4);
 806       __ addi(count, count, -(1 << shift));
 807       __ bind(L_fill_32_bytes);
 808     }
 809
 810     __ li(temp, 8<<shift);                  // Prepare for 32 byte loop.
 811     // Clone bytes int->long as above.
 812     __ rldimi(value, value, 32, 0);         // 32 bit -> 64 bit
 813
 814     Label L_check_fill_8_bytes;
 815     // Fill 32-byte chunks.
         // count is biased by -(8 << shift) while inside the loop so that the
         // record-form subtract can drive the loop branch; the bias is removed
         // again at L_check_fill_8_bytes.
 816     __ subf_(count, temp, count);
 817     __ blt(CR0, L_check_fill_8_bytes);
 818
 819     Label L_fill_32_bytes_loop;
 820     __ align(32);
 821     __ bind(L_fill_32_bytes_loop);
 822
 823     __ std(value, 0, to);
 824     __ std(value, 8, to);
 825     __ subf_(count, temp, count);           // Update count.
 826     __ std(value, 16, to);
 827     __ std(value, 24, to);
 828
 829     __ addi(to, to, 32);
 830     __ bge(CR0, L_fill_32_bytes_loop);
 831
 832     __ bind(L_check_fill_8_bytes);
 833     __ add_(count, temp, count);
 834     __ beq(CR0, L_exit);
 835     __ addic_(count, count, -(2 << shift));
 836     __ blt(CR0, L_fill_4_bytes);
 837
 838     //
 839     // Length is too short, just fill 8 bytes at a time.
 840     //
 841     Label L_fill_8_bytes_loop;
 842     __ bind(L_fill_8_bytes_loop);
 843     __ std(value, 0, to);
 844     __ addic_(count, count, -(2 << shift));
 845     __ addi(to, to, 8);
 846     __ bge(CR0, L_fill_8_bytes_loop);
 847
 848     // Fill trailing 4 bytes.
         // Only multiples of (2 << shift) have been subtracted from count, so the
         // bits tested below still equal those of the remaining element count.
 849     __ bind(L_fill_4_bytes);
 850     __ andi_(temp, count, 1<<shift);
 851     __ beq(CR0, L_fill_2_bytes);
 852
 853     __ stw(value, 0, to);
 854     if (t == T_BYTE || t == T_SHORT) {
 855       __ addi(to, to, 4);
 856       // Fill trailing 2 bytes.
 857       __ bind(L_fill_2_bytes);
 858       __ andi_(temp, count, 1<<(shift-1));
 859       __ beq(CR0, L_fill_byte);
 860       __ sth(value, 0, to);
 861       if (t == T_BYTE) {
 862         __ addi(to, to, 2);
 863         // Fill trailing byte.
 864         __ bind(L_fill_byte);
 865         __ andi_(count, count, 1);
 866         __ beq(CR0, L_exit);
 867         __ stb(value, 0, to);
 868       } else {
 869         __ bind(L_fill_byte);
 870       }
 871     } else {
 872       __ bind(L_fill_2_bytes);
 873     }
 874     __ bind(L_exit);
 875     __ blr();
 876
 877     // Handle fills of less than 8 bytes. Int is handled elsewhere.
         // Each set bit of count (1, 2, 4 for bytes; 1, 2 for shorts) is handled
         // by storing that many elements.
 878     if (t == T_BYTE) {
 879       __ bind(L_fill_elements);
 880       Label L_fill_2, L_fill_4;
 881       __ andi_(temp, count, 1);
 882       __ beq(CR0, L_fill_2);
 883       __ stb(value, 0, to);
 884       __ addi(to, to, 1);
 885       __ bind(L_fill_2);
 886       __ andi_(temp, count, 2);
 887       __ beq(CR0, L_fill_4);
 888       __ stb(value, 0, to);
           // The second byte of the pair belongs at offset 1; storing to offset 0
           // again would leave that byte unfilled before "to" advances by 2.
 889       __ stb(value, 1, to);
 890       __ addi(to, to, 2);
 891       __ bind(L_fill_4);
 892       __ andi_(temp, count, 4);
 893       __ beq(CR0, L_exit);
 894       __ stb(value, 0, to);
 895       __ stb(value, 1, to);
 896       __ stb(value, 2, to);
 897       __ stb(value, 3, to);
 898       __ blr();
 899     }
 900
 901     if (t == T_SHORT) {
 902       Label L_fill_2;
 903       __ bind(L_fill_elements);
 904       __ andi_(temp, count, 1);
 905       __ beq(CR0, L_fill_2);
 906       __ sth(value, 0, to);
 907       __ addi(to, to, 2);
 908       __ bind(L_fill_2);
 909       __ andi_(temp, count, 2);
 910       __ beq(CR0, L_exit);
 911       __ sth(value, 0, to);
 912       __ sth(value, 2, to);
 913       __ blr();
 914     }
 915     return start;
 916   }
 917 
 918   inline void assert_positive_int(Register count) {
         // In ASSERT builds, emits code that shifts "count" right by 31 bits into
         // R0 and asserts on the outcome with the message "missing zero extend",
         // i.e. it checks that no bit from bit 31 upwards is set in "count".
         // Emits nothing in other builds. Overwrites R0.
         // NOTE(review): presumably asm_assert_eq tests the CR0 result of the
         // record-form shift - confirm against the MacroAssembler.
 919 #ifdef ASSERT
 920     __ srdi_(R0, count, 31);
 921     __ asm_assert_eq("missing zero extend");
 922 #endif
 923   }
 924 
 925   // Generate overlap test for array copy stubs.
 926   //
 927   // Input:
 928   //   R3_ARG1    -  from
 929   //   R4_ARG2    -  to
 930   //   R5_ARG3    -  element count
 931   //
 932   void array_overlap_test(address no_overlap_target, int log2_elem_size) {
         // Emits the overlap check used at the start of the conjoint copy stubs.
         //   no_overlap_target: address branched to when a forward copy is safe.
         //   log2_elem_size:    log2 of the element size in bytes.
         // Falls through (so the caller can emit a backward copy) only when
         // from < to and (to - from) < count << log2_elem_size, both unsigned.
         // Uses R6_ARG4 and R7_ARG5 as temporaries and writes CR0 and CR1.
 933     Register tmp1 = R6_ARG4;
 934     Register tmp2 = R7_ARG5;
 935
 936     assert_positive_int(R5_ARG3);
 937
 938     __ subf(tmp1, R3_ARG1, R4_ARG2); // distance in bytes
 939     __ sldi(tmp2, R5_ARG3, log2_elem_size); // size in bytes
 940     __ cmpld(CR0, R3_ARG1, R4_ARG2); // Use unsigned comparison!
 941     __ cmpld(CR1, tmp1, tmp2);
         // CR0.less := !(CR0.less && CR1.less), i.e. set when there is no overlap.
 942     __ crnand(CR0, Assembler::less, CR1, Assembler::less);
 943     // Overlaps if Src before dst and distance smaller than size.
 944     // Branch to forward copy routine otherwise (within range of 32kB).
 945     __ bc(Assembler::bcondCRbiIs1, Assembler::bi0(CR0, Assembler::less), no_overlap_target);
 946
 947     // need to copy backwards
 948   }
 949 
 950   // This is common errorexit stub for UnsafeMemoryAccess.
 951   address generate_unsafecopy_common_error_exit() {
         // Emits the shared exit sequence and returns the pc at which it starts.
         // The emitted code restores DSCR to VM_Version::_dscr_val (when the CPU
         // has mfdscr), sets the return register to 0 and returns.
 952     address start_pc = __ pc();
 953     Register tmp1 = R6_ARG4;
 954     // The copy stub may have changed the DSCR value; reset it.
 955     if (VM_Version::has_mfdscr()) {
 956       __ load_const_optimized(tmp1, VM_Version::_dscr_val);
 957       __ mtdscr(tmp1);
 958     }
 959     __ li(R3_RET, 0); // return 0
 960     __ blr();
 961     return start_pc;
 962   }
 963 
 964   // The guideline in the implementations of generate_disjoint_xxx_copy
 965   // (xxx=byte,short,int,long,oop) is to copy as many elements as possible with
 966   // single instructions, but to avoid alignment interrupts (see subsequent
 967   // comment). Furthermore, we try to minimize misaligned access, even
 968   // though they cause no alignment interrupt.
 969   //
 970   // In Big-Endian mode, the PowerPC architecture requires implementations to
 971   // handle automatically misaligned integer halfword and word accesses,
 972   // word-aligned integer doubleword accesses, and word-aligned floating-point
 973   // accesses. Other accesses may or may not generate an Alignment interrupt
 974   // depending on the implementation.
 975   // Alignment interrupt handling may require on the order of hundreds of cycles,
 976   // so every effort should be made to avoid misaligned memory values.
 977   //
 978   //
 979   // Generate stub for disjoint byte copy.  If "aligned" is true, the
 980   // "from" and "to" addresses are assumed to be heapword aligned.
 981   //
 982   // Arguments for generated stub:
 983   //      from:  R3_ARG1
 984   //      to:    R4_ARG2
 985   //      count: R5_ARG3 treated as signed
 986   //
 987   address generate_disjoint_byte_copy(StubId stub_id) {
         // Emits the forward (disjoint) byte copy stub selected by stub_id and
         // returns its entry address. The arrayof_* id selects the variant that
         // skips the 4-byte alignment prologue. The emitted code copies in three
         // tiers: 32 bytes per iteration with VSX, then 4 bytes per iteration,
         // then single bytes, and finally returns 0 in R3_RET.
 988     bool aligned;
 989     switch (stub_id) {
 990     case StubId::stubgen_jbyte_disjoint_arraycopy_id:
 991       aligned = false;
 992       break;
 993     case StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id:
 994       aligned = true;
 995       break;
 996     default:
 997       ShouldNotReachHere();
 998     }
 999
1000     StubCodeMark mark(this, stub_id);
1001     address start = __ function_entry();
1002     assert_positive_int(R5_ARG3);
1003
1004     Register tmp1 = R6_ARG4;
1005     Register tmp2 = R7_ARG5;
1006     Register tmp3 = R8_ARG6;   // loaded with 0 below and used as index register
1007     Register tmp4 = R9_ARG7;   // not referenced below
1008
1009     VectorSRegister tmp_vsr1  = VSR1;
1010     VectorSRegister tmp_vsr2  = VSR2;
1011
         // l_8 is declared but never bound or branched to in this function.
1012     Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9, l_10;
1013     {
1014       // UnsafeMemoryAccess page error: continue at UnsafeMemoryAccess common_error_exit
1015       UnsafeMemoryAccessMark umam(this, !aligned, false);
1016
1017       // Don't try anything fancy if arrays don't have many elements.
1018       __ li(tmp3, 0);
1019       __ cmpwi(CR0, R5_ARG3, 17);
1020       __ ble(CR0, l_6); // copy 4 at a time
1021
1022       if (!aligned) {
1023         __ xorr(tmp1, R3_ARG1, R4_ARG2);
1024         __ andi_(tmp1, tmp1, 3);
1025         __ bne(CR0, l_6); // If arrays don't have the same alignment mod 4, do 4 element copy.
1026
1027         // Copy elements if necessary to align to 4 bytes.
1028         __ neg(tmp1, R3_ARG1); // Compute distance to alignment boundary.
1029         __ andi_(tmp1, tmp1, 3);
1030         __ beq(CR0, l_2);
1031
             // tmp1 (1..3) bytes are copied one at a time; count is reduced up front.
1032         __ subf(R5_ARG3, tmp1, R5_ARG3);
1033         __ bind(l_9);
1034         __ lbz(tmp2, 0, R3_ARG1);
1035         __ addic_(tmp1, tmp1, -1);
1036         __ stb(tmp2, 0, R4_ARG2);
1037         __ addi(R3_ARG1, R3_ARG1, 1);
1038         __ addi(R4_ARG2, R4_ARG2, 1);
1039         __ bne(CR0, l_9);
1040
1041         __ bind(l_2);
1042       }
1043
1044       // copy 8 elements at a time
1045       __ xorr(tmp2, R3_ARG1, R4_ARG2); // skip if src & dest have differing alignment mod 8
1046       __ andi_(tmp1, tmp2, 7);
1047       __ bne(CR0, l_7); // not same alignment -> to or from is aligned -> copy 8
1048
1049       // copy a 4-element word if necessary to align to 8 bytes
1050       __ andi_(R0, R3_ARG1, 7);
1051       __ beq(CR0, l_7);
1052
1053       __ lwzx(tmp2, R3_ARG1, tmp3);
1054       __ addi(R5_ARG3, R5_ARG3, -4);
1055       __ stwx(tmp2, R4_ARG2, tmp3);
1056       { // FasterArrayCopy
1057         __ addi(R3_ARG1, R3_ARG1, 4);
1058         __ addi(R4_ARG2, R4_ARG2, 4);
1059       }
1060       __ bind(l_7);
1061
1062       { // FasterArrayCopy
1063         __ cmpwi(CR0, R5_ARG3, 31);
1064         __ ble(CR0, l_6); // copy 4 at a time if less than 32 elements remain
1065
             // CTR = number of 32-byte chunks; count keeps the remainder (0..31).
1066         __ srdi(tmp1, R5_ARG3, 5);
1067         __ andi_(R5_ARG3, R5_ARG3, 31);
1068         __ mtctr(tmp1);
1069
1070
1071         // Prefetch the data into the L2 cache.
1072         __ dcbt(R3_ARG1, 0);
1073
1074         // If supported set DSCR pre-fetch to deepest.
1075         if (VM_Version::has_mfdscr()) {
1076           __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
1077           __ mtdscr(tmp2);
1078         }
             // tmp1 is reused as the constant offset 16 for the second vector access.
1079         __ li(tmp1, 16);
1080
1081         // Backbranch target aligned to 32-byte. Not 16-byte align as
1082         // loop contains < 8 instructions that fit inside a single
1083         // i-cache sector.
1084         __ align(32);
1085
1086         __ bind(l_10);
1087         // Use loop with VSX load/store instructions to
1088         // copy 32 elements a time.
1089         __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load src
1090         __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst
1091         __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src + 16
1092         __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
1093         __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32
1094         __ addi(R4_ARG2, R4_ARG2, 32);       // Update dsc+=32
1095         __ bdnz(l_10);                       // Dec CTR and loop if not zero.
1096
1097         // Restore DSCR pre-fetch value.
1098         if (VM_Version::has_mfdscr()) {
1099           __ load_const_optimized(tmp2, VM_Version::_dscr_val);
1100           __ mtdscr(tmp2);
1101         }
1102
1103      } // FasterArrayCopy
1104
1105       __ bind(l_6);
1106
1107       // copy 4 elements at a time
1108       __ cmpwi(CR0, R5_ARG3, 4);
1109       __ blt(CR0, l_1);
1110       __ srdi(tmp1, R5_ARG3, 2);
1111       __ mtctr(tmp1); // is > 0
1112       __ andi_(R5_ARG3, R5_ARG3, 3);
1113
1114       { // FasterArrayCopy
             // Pre-decrement both pointers because the update-form load/store
             // below add 4 before accessing memory; undo the bias after the loop.
1115         __ addi(R3_ARG1, R3_ARG1, -4);
1116         __ addi(R4_ARG2, R4_ARG2, -4);
1117         __ bind(l_3);
1118         __ lwzu(tmp2, 4, R3_ARG1);
1119         __ stwu(tmp2, 4, R4_ARG2);
1120         __ bdnz(l_3);
1121         __ addi(R3_ARG1, R3_ARG1, 4);
1122         __ addi(R4_ARG2, R4_ARG2, 4);
1123       }
1124
1125       // do single element copy
1126       __ bind(l_1);
1127       __ cmpwi(CR0, R5_ARG3, 0);
1128       __ beq(CR0, l_4);
1129
1130       { // FasterArrayCopy
1131         __ mtctr(R5_ARG3);
1132         __ addi(R3_ARG1, R3_ARG1, -1);
1133         __ addi(R4_ARG2, R4_ARG2, -1);
1134
1135         __ bind(l_5);
1136         __ lbzu(tmp2, 1, R3_ARG1);
1137         __ stbu(tmp2, 1, R4_ARG2);
1138         __ bdnz(l_5);
1139       }
1140     }
1141
1142     __ bind(l_4);
1143     __ li(R3_RET, 0); // return 0
1144     __ blr();
1145
1146     return start;
1147   }
1148 
1149   // Generate stub for conjoint byte copy.  If "aligned" is true, the
1150   // "from" and "to" addresses are assumed to be heapword aligned.
1151   //
1152   // Arguments for generated stub:
1153   //      from:  R3_ARG1
1154   //      to:    R4_ARG2
1155   //      count: R5_ARG3 treated as signed
1156   //
1157   address generate_conjoint_byte_copy(StubId stub_id) {
         // Emits the conjoint (possibly overlapping) byte copy stub selected by
         // stub_id and returns its entry address. The emitted code first runs the
         // overlap test, which branches to the matching disjoint stub when a
         // forward copy is safe; otherwise it copies backwards one byte at a
         // time and returns 0 in R3_RET.
1158     bool aligned;
1159     switch (stub_id) {
1160     case StubId::stubgen_jbyte_arraycopy_id:
1161       aligned = false;
1162       break;
1163     case StubId::stubgen_arrayof_jbyte_arraycopy_id:
1164       aligned = true;
1165       break;
1166     default:
1167       ShouldNotReachHere();
1168     }
1169
1170     StubCodeMark mark(this, stub_id);
1171     address start = __ function_entry();
1172     assert_positive_int(R5_ARG3);
1173
1174     Register tmp1 = R6_ARG4;
1175     Register tmp2 = R7_ARG5;   // not referenced below
1176     Register tmp3 = R8_ARG6;   // not referenced below
1177
1178     address nooverlap_target = aligned ?
1179       STUB_ENTRY(arrayof_jbyte_disjoint_arraycopy()) :
1180       STUB_ENTRY(jbyte_disjoint_arraycopy());
1181
1182     array_overlap_test(nooverlap_target, 0);
1183     // Do reverse copy. We assume the case of actual overlap is rare enough
1184     // that we don't have to optimize it.
1185     Label l_1, l_2;
1186     {
1187       // UnsafeMemoryAccess page error: continue at UnsafeMemoryAccess common_error_exit
1188       UnsafeMemoryAccessMark umam(this, !aligned, false);
           // The loop is entered at l_2: count is decremented and used as the
           // byte index, the byte is loaded, and it is stored (at l_1) only if
           // the decremented index was still >= 0. On the last pass the load at
           // index -1 is still issued before the branch falls through; its
           // result is not stored.
1189       __ b(l_2);
1190       __ bind(l_1);
1191       __ stbx(tmp1, R4_ARG2, R5_ARG3);
1192       __ bind(l_2);
1193       __ addic_(R5_ARG3, R5_ARG3, -1);
1194       __ lbzx(tmp1, R3_ARG1, R5_ARG3);
1195       __ bge(CR0, l_1);
1196     }
1197     __ li(R3_RET, 0); // return 0
1198     __ blr();
1199
1200     return start;
1201   }
1202 
1203   // Generate stub for disjoint short copy.  If "aligned" is true, the
1204   // "from" and "to" addresses are assumed to be heapword aligned.
1205   //
1206   // Arguments for generated stub:
1207   //      from:  R3_ARG1
1208   //      to:    R4_ARG2
1209   //  elm.count: R5_ARG3 treated as signed
1210   //
1211   // Strategy for aligned==true:
1212   //
1213   //  If length <= 9:
1214   //     1. copy 2 elements at a time (l_6)
1215   //     2. copy last element if original element count was odd (l_1)
1216   //
1217   //  If length > 9:
1218   //     1. copy 4 elements at a time until less than 4 elements are left (l_7)
1219   //     2. copy 2 elements at a time until less than 2 elements are left (l_6)
1220   //     3. copy last element if one was left in step 2. (l_1)
1221   //
1222   //
1223   // Strategy for aligned==false:
1224   //
1225   //  If length <= 9: same as aligned==true case, but NOTE: load/stores
1226   //                  can be unaligned (see comment below)
1227   //
1228   //  If length > 9:
1229   //     1. continue with step 6. if the alignment of from and to mod 4
1230   //        is different.
1231   //     2. align from and to to 4 bytes by copying 1 element if necessary
1232   //     3. at l_2 from and to are 4 byte aligned; continue with
1233   //        5. if they cannot be aligned to 8 bytes because they have
1234   //        got different alignment mod 8.
1235   //     4. at this point we know that both, from and to, have the same
1236   //        alignment mod 8, now copy one element if necessary to get
1237   //        8 byte alignment of from and to.
1238   //     5. copy 4 elements at a time until less than 4 elements are
1239   //        left; depending on step 3. all load/stores are aligned or
1240   //        either all loads or all stores are unaligned.
1241   //     6. copy 2 elements at a time until less than 2 elements are
1242   //        left (l_6); arriving here from step 1., there is a chance
1243   //        that all accesses are unaligned.
1244   //     7. copy last element if one was left in step 6. (l_1)
1245   //
1246   //  There are unaligned data accesses using integer load/store
1247   //  instructions in this stub. POWER allows such accesses.
1248   //
1249   //  According to the manuals (PowerISA_V2.06_PUBLIC, Book II,
1250   //  Chapter 2: Effect of Operand Placement on Performance) unaligned
1251   //  integer load/stores have good performance. Only unaligned
1252   //  floating point load/stores can have poor performance.
1253   //
1254   //  TODO:
1255   //
1256   //  1. check if aligning the backbranch target of loops is beneficial
1257   //
1258   address generate_disjoint_short_copy(StubId stub_id) {
         // Emits the forward (disjoint) short copy stub selected by stub_id and
         // returns its entry address. The arrayof_* id selects the variant that
         // skips the alignment prologue. The emitted code copies 16 elements per
         // iteration with VSX, then 2 elements per iteration, then a final single
         // element, and returns 0 in R3_RET.
1259     bool aligned;
1260     switch (stub_id) {
1261     case StubId::stubgen_jshort_disjoint_arraycopy_id:
1262       aligned = false;
1263       break;
1264     case StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id:
1265       aligned = true;
1266       break;
1267     default:
1268       ShouldNotReachHere();
1269     }
1270
1271     StubCodeMark mark(this, stub_id);
1272
1273     Register tmp1 = R6_ARG4;
1274     Register tmp2 = R7_ARG5;
1275     Register tmp3 = R8_ARG6;   // loaded with 0 below and used as index register
1276     Register tmp4 = R9_ARG7;   // not referenced below
1277
1278     VectorSRegister tmp_vsr1  = VSR1;
1279     VectorSRegister tmp_vsr2  = VSR2;
1280
1281     address start = __ function_entry();
1282     assert_positive_int(R5_ARG3);
1283
         // l_8 is declared but never bound or branched to in this function.
1284     Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9;
1285     {
1286       // UnsafeMemoryAccess page error: continue at UnsafeMemoryAccess common_error_exit
1287       UnsafeMemoryAccessMark umam(this, !aligned, false);
1288       // don't try anything fancy if arrays don't have many elements
1289       __ li(tmp3, 0);
1290       __ cmpwi(CR0, R5_ARG3, 9);
1291       __ ble(CR0, l_6); // copy 2 at a time
1292
1293       if (!aligned) {
1294         __ xorr(tmp1, R3_ARG1, R4_ARG2);
1295         __ andi_(tmp1, tmp1, 3);
1296         __ bne(CR0, l_6); // if arrays don't have the same alignment mod 4, do 2 element copy
1297
1298         // At this point it is guaranteed that both, from and to have the same alignment mod 4.
1299
1300         // Copy 1 element if necessary to align to 4 bytes.
1301         __ andi_(tmp1, R3_ARG1, 3);
1302         __ beq(CR0, l_2);
1303
1304         __ lhz(tmp2, 0, R3_ARG1);
1305         __ addi(R3_ARG1, R3_ARG1, 2);
1306         __ sth(tmp2, 0, R4_ARG2);
1307         __ addi(R4_ARG2, R4_ARG2, 2);
1308         __ addi(R5_ARG3, R5_ARG3, -1);
1309         __ bind(l_2);
1310
1311         // At this point the positions of both, from and to, are at least 4 byte aligned.
1312
1313         // Copy 4 elements at a time.
1314         // Align to 8 bytes, but only if both, from and to, have same alignment mod 8.
1315         __ xorr(tmp2, R3_ARG1, R4_ARG2);
1316         __ andi_(tmp1, tmp2, 7);
1317         __ bne(CR0, l_7); // not same alignment mod 8 -> copy 4, either from or to will be unaligned
1318
1319         // Copy a 2-element word if necessary to align to 8 bytes.
1320         __ andi_(R0, R3_ARG1, 7);
1321         __ beq(CR0, l_7);
1322
1323         __ lwzx(tmp2, R3_ARG1, tmp3);
1324         __ addi(R5_ARG3, R5_ARG3, -2);
1325         __ stwx(tmp2, R4_ARG2, tmp3);
1326         { // FasterArrayCopy
1327           __ addi(R3_ARG1, R3_ARG1, 4);
1328           __ addi(R4_ARG2, R4_ARG2, 4);
1329         }
1330       }
1331
1332       __ bind(l_7);
1333
1334       // Copy 4 elements at a time; either the loads or the stores can
1335       // be unaligned if aligned == false.
1336
1337       { // FasterArrayCopy
1338         __ cmpwi(CR0, R5_ARG3, 15);
1339         __ ble(CR0, l_6); // copy 2 at a time if less than 16 elements remain
1340
             // CTR = number of 16-element (32-byte) chunks; count keeps the remainder.
1341         __ srdi(tmp1, R5_ARG3, 4);
1342         __ andi_(R5_ARG3, R5_ARG3, 15);
1343         __ mtctr(tmp1);
1344
1345
1346         // Processor supports VSX, so use it to mass copy.
1347
1348           // Prefetch src data into L2 cache.
1349           __ dcbt(R3_ARG1, 0);
1350
1351           // If supported set DSCR pre-fetch to deepest.
1352           if (VM_Version::has_mfdscr()) {
1353             __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
1354             __ mtdscr(tmp2);
1355           }
               // tmp1 is reused as the constant offset 16 for the second vector access.
1356           __ li(tmp1, 16);
1357
1358           // Backbranch target aligned to 32-byte. It's not aligned 16-byte
1359           // as loop contains < 8 instructions that fit inside a single
1360           // i-cache sector.
1361           __ align(32);
1362
1363           __ bind(l_9);
1364           // Use loop with VSX load/store instructions to
1365           // copy 16 elements a time.
1366           __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load from src.
1367           __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst.
1368           __ lxvd2x(tmp_vsr2, R3_ARG1, tmp1);  // Load from src + 16.
1369           __ stxvd2x(tmp_vsr2, R4_ARG2, tmp1); // Store to dst + 16.
1370           __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32.
1371           __ addi(R4_ARG2, R4_ARG2, 32);       // Update dsc+=32.
1372           __ bdnz(l_9);                        // Dec CTR and loop if not zero.
1373
1374           // Restore DSCR pre-fetch value.
1375           if (VM_Version::has_mfdscr()) {
1376             __ load_const_optimized(tmp2, VM_Version::_dscr_val);
1377             __ mtdscr(tmp2);
1378           }
1379
1380       } // FasterArrayCopy
1381       __ bind(l_6);
1382
1383       // copy 2 elements at a time
1384       { // FasterArrayCopy
1385         __ cmpwi(CR0, R5_ARG3, 2);
1386         __ blt(CR0, l_1);
1387         __ srdi(tmp1, R5_ARG3, 1);
1388         __ andi_(R5_ARG3, R5_ARG3, 1);
1389
             // Pre-decrement both pointers because the update-form load/store
             // below add 4 before accessing memory; undo the bias after the loop.
1390         __ addi(R3_ARG1, R3_ARG1, -4);
1391         __ addi(R4_ARG2, R4_ARG2, -4);
1392         __ mtctr(tmp1);
1393
1394         __ bind(l_3);
1395         __ lwzu(tmp2, 4, R3_ARG1);
1396         __ stwu(tmp2, 4, R4_ARG2);
1397         __ bdnz(l_3);
1398
1399         __ addi(R3_ARG1, R3_ARG1, 4);
1400         __ addi(R4_ARG2, R4_ARG2, 4);
1401       }
1402
1403       // do single element copy
1404       __ bind(l_1);
1405       __ cmpwi(CR0, R5_ARG3, 0);
1406       __ beq(CR0, l_4);
1407
1408       { // FasterArrayCopy
1409         __ mtctr(R5_ARG3);
1410         __ addi(R3_ARG1, R3_ARG1, -2);
1411         __ addi(R4_ARG2, R4_ARG2, -2);
1412
1413         __ bind(l_5);
1414         __ lhzu(tmp2, 2, R3_ARG1);
1415         __ sthu(tmp2, 2, R4_ARG2);
1416         __ bdnz(l_5);
1417       }
1418     }
1419
1420     __ bind(l_4);
1421     __ li(R3_RET, 0); // return 0
1422     __ blr();
1423
1424     return start;
1425   }
1426 
1427   // Generate stub for conjoint short copy.  If "aligned" is true, the
1428   // "from" and "to" addresses are assumed to be heapword aligned.
1429   //
1430   // Arguments for generated stub:
1431   //      from:  R3_ARG1
1432   //      to:    R4_ARG2
1433   //      count: R5_ARG3 treated as signed
1434   //
1435   address generate_conjoint_short_copy(StubId stub_id) {
         // Emits the conjoint (possibly overlapping) short copy stub selected by
         // stub_id and returns its entry address. The emitted code first runs the
         // overlap test, which branches to the matching disjoint stub when a
         // forward copy is safe; otherwise it copies backwards one halfword at a
         // time and returns 0 in R3_RET.
1436     bool aligned;
1437     switch (stub_id) {
1438     case StubId::stubgen_jshort_arraycopy_id:
1439       aligned = false;
1440       break;
1441     case StubId::stubgen_arrayof_jshort_arraycopy_id:
1442       aligned = true;
1443       break;
1444     default:
1445       ShouldNotReachHere();
1446     }
1447
1448     StubCodeMark mark(this, stub_id);
1449     address start = __ function_entry();
1450     assert_positive_int(R5_ARG3);
1451
1452     Register tmp1 = R6_ARG4;
1453     Register tmp2 = R7_ARG5;
1454     Register tmp3 = R8_ARG6;   // not referenced below
1455
1456     address nooverlap_target = aligned ?
1457       STUB_ENTRY(arrayof_jshort_disjoint_arraycopy()) :
1458       STUB_ENTRY(jshort_disjoint_arraycopy());
1459
1460     array_overlap_test(nooverlap_target, 1);
1461
1462     Label l_1, l_2;
1463     {
1464       // UnsafeMemoryAccess page error: continue at UnsafeMemoryAccess common_error_exit
1465       UnsafeMemoryAccessMark umam(this, !aligned, false);
           // tmp1 is the byte offset (count * 2). The loop is entered at l_2: the
           // offset is decremented by 2, the halfword is loaded, and it is stored
           // (at l_1) only if the decremented offset was still >= 0.
1466       __ sldi(tmp1, R5_ARG3, 1);
1467       __ b(l_2);
1468       __ bind(l_1);
1469       __ sthx(tmp2, R4_ARG2, tmp1);
1470       __ bind(l_2);
1471       __ addic_(tmp1, tmp1, -2);
1472       __ lhzx(tmp2, R3_ARG1, tmp1);
1473       __ bge(CR0, l_1);
1474     }
1475     __ li(R3_RET, 0); // return 0
1476     __ blr();
1477
1478     return start;
1479   }
1480 
1481   // Generate core code for disjoint int copy (and oop copy on 32-bit).  If "aligned"
1482   // is true, the "from" and "to" addresses are assumed to be heapword aligned.
1483   //
1484   // Arguments:
1485   //      from:  R3_ARG1
1486   //      to:    R4_ARG2
1487   //      count: R5_ARG3 treated as signed
1488   //
1489   void generate_disjoint_int_copy_core(bool aligned) {
         // Emits the body of a forward (disjoint) int copy: an optional one-element
         // step to reach 8-byte alignment (only when !aligned), a VSX loop copying
         // 8 elements per iteration, and a tail loop copying one element at a time.
         // No return instruction is emitted; control falls out at l_1 so the caller
         // can append its own epilogue.
1490     Register tmp1 = R6_ARG4;
1491     Register tmp2 = R7_ARG5;
1492     Register tmp3 = R8_ARG6;   // loaded with 0 below and used as index register
1493     Register tmp4 = R0;        // the name tmp4 is not referenced below
1494
1495     VectorSRegister tmp_vsr1  = VSR1;
1496     VectorSRegister tmp_vsr2  = VSR2;
1497
         // l_5 and l_6 are declared but never bound or branched to in this function.
1498     Label l_1, l_2, l_3, l_4, l_5, l_6, l_7;
1499
1500     // for short arrays, just do single element copy
1501     __ li(tmp3, 0);
1502     __ cmpwi(CR0, R5_ARG3, 5);
1503     __ ble(CR0, l_2);
1504
1505     if (!aligned) {
1506         // check if arrays have same alignment mod 8.
1507         __ xorr(tmp1, R3_ARG1, R4_ARG2);
1508         __ andi_(R0, tmp1, 7);
1509         // Not the same alignment, but ld and std just need to be 4 byte aligned.
1510         __ bne(CR0, l_4); // to OR from is 8 byte aligned -> copy 2 at a time
1511
1512         // copy 1 element to align to and from on an 8 byte boundary
1513         __ andi_(R0, R3_ARG1, 7);
1514         __ beq(CR0, l_4);
1515
1516         __ lwzx(tmp2, R3_ARG1, tmp3);
1517         __ addi(R5_ARG3, R5_ARG3, -1);
1518         __ stwx(tmp2, R4_ARG2, tmp3);
1519         { // FasterArrayCopy
1520           __ addi(R3_ARG1, R3_ARG1, 4);
1521           __ addi(R4_ARG2, R4_ARG2, 4);
1522         }
1523         __ bind(l_4);
1524       }
1525
1526     { // FasterArrayCopy
1527       __ cmpwi(CR0, R5_ARG3, 7);
1528       __ ble(CR0, l_2); // copy 1 at a time if less than 8 elements remain
1529
           // CTR = number of 8-element (32-byte) chunks; count keeps the remainder.
1530       __ srdi(tmp1, R5_ARG3, 3);
1531       __ andi_(R5_ARG3, R5_ARG3, 7);
1532       __ mtctr(tmp1);
1533
1534     // Processor supports VSX, so use it to mass copy.
1535
1536     // Prefetch the data into the L2 cache.
1537     __ dcbt(R3_ARG1, 0);
1538
1539     // Set DSCR pre-fetch to deepest.
1540     if (VM_Version::has_mfdscr()) {
1541       __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
1542       __ mtdscr(tmp2);
1543     }
         // tmp1 is reused as the constant offset 16 for the second vector access.
1544     __ li(tmp1, 16);
1545
1546     // Backbranch target aligned to 32-byte. Not 16-byte align as
1547     // loop contains < 8 instructions that fit inside a single
1548     // i-cache sector.
1549     __ align(32);
1550
1551     __ bind(l_7);
1552     // Use loop with VSX load/store instructions to
1553     // copy 8 elements a time.
1554     __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load src
1555     __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst
1556     __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src + 16
1557     __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
1558     __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32
1559     __ addi(R4_ARG2, R4_ARG2, 32);       // Update dsc+=32
1560     __ bdnz(l_7);                        // Dec CTR and loop if not zero.
1561
1562     // Restore DSCR pre-fetch value.
1563     if (VM_Version::has_mfdscr()) {
1564       __ load_const_optimized(tmp2, VM_Version::_dscr_val);
1565       __ mtdscr(tmp2);
1566     }
1567
1568    } // FasterArrayCopy
1569
1570     // copy 1 element at a time
1571     __ bind(l_2);
1572     __ cmpwi(CR0, R5_ARG3, 0);
1573     __ beq(CR0, l_1);
1574
1575     { // FasterArrayCopy
           // Pre-decrement both pointers because the update-form load/store
           // below add 4 before accessing memory.
1576       __ mtctr(R5_ARG3);
1577       __ addi(R3_ARG1, R3_ARG1, -4);
1578       __ addi(R4_ARG2, R4_ARG2, -4);
1579
1580       __ bind(l_3);
1581       __ lwzu(tmp2, 4, R3_ARG1);
1582       __ stwu(tmp2, 4, R4_ARG2);
1583       __ bdnz(l_3);
1584     }
1585
1586     __ bind(l_1);
1587     return;
1588   }
1589 
1590   // Generate stub for disjoint int copy.  If "aligned" is true, the
1591   // "from" and "to" addresses are assumed to be heapword aligned.
1592   //
1593   // Arguments for generated stub:
1594   //      from:  R3_ARG1
1595   //      to:    R4_ARG2
1596   //      count: R5_ARG3 treated as signed
1597   //
1598   address generate_disjoint_int_copy(StubId stub_id) {
         // Emits the forward (disjoint) int copy stub selected by stub_id and
         // returns its entry address. The copy itself is produced by
         // generate_disjoint_int_copy_core(aligned); this wrapper adds the stub
         // entry, the count assertion, the UnsafeMemoryAccessMark scope around the
         // copy code, and the "return 0" epilogue.
1599     bool aligned;
1600     switch (stub_id) {
1601     case StubId::stubgen_jint_disjoint_arraycopy_id:
1602       aligned = false;
1603       break;
1604     case StubId::stubgen_arrayof_jint_disjoint_arraycopy_id:
1605       aligned = true;
1606       break;
1607     default:
1608       ShouldNotReachHere();
1609     }
1610
1611     StubCodeMark mark(this, stub_id);
1612     address start = __ function_entry();
1613     assert_positive_int(R5_ARG3);
1614     {
1615       // UnsafeMemoryAccess page error: continue at UnsafeMemoryAccess common_error_exit
1616       UnsafeMemoryAccessMark umam(this, !aligned, false);
1617       generate_disjoint_int_copy_core(aligned);
1618     }
1619     __ li(R3_RET, 0); // return 0
1620     __ blr();
1621     return start;
1622   }
1623 
1624   // Generate core code for conjoint int copy (also used for the oop copy
1625   // when UseCompressedOops is set).  The elements are copied backwards,
1626   // starting at the end of the arrays.  If "aligned" is true, the "from"
       // and "to" addresses are assumed to be heapword aligned and the
       // single-element pre-alignment step is not emitted.
1627   //
1628   // Arguments:
1629   //      from:  R3_ARG1
1630   //      to:    R4_ARG2
1631   //      count: R5_ARG3 treated as signed
1632   //
       // The emitted code modifies all three argument registers and uses
       // R0, R6_ARG4, R7_ARG5, VSR1, VSR2, CR0 and CTR as scratch.
       //
1633   void generate_conjoint_int_copy_core(bool aligned) {
1634     // Do reverse copy.  We assume the case of actual overlap is rare enough
1635     // that we don't have to optimize it.
1636
1637     Label L_done;      // nothing (left) to copy
         Label L_bulk;      // start of the bulk copy, behind the optional pre-alignment
         Label L_vsx_loop;  // copies 8 elements per iteration
         Label L_tail;      // at most 7 elements remain
         Label L_tail_loop; // copies 1 element per iteration
1638
1639     Register tmp1 = R6_ARG4;
1640     Register tmp2 = R7_ARG5;
1643
1644     VectorSRegister tmp_vsr1  = VSR1;
1645     VectorSRegister tmp_vsr2  = VSR2;
1646
1647     { // FasterArrayCopy
1648       __ cmpwi(CR0, R5_ARG3, 0);
1649       __ beq(CR0, L_done);
1650
           // Let both pointers point behind the last element, the copy runs downwards.
1651       __ sldi(R5_ARG3, R5_ARG3, 2);       // count -> size in bytes
1652       __ add(R3_ARG1, R3_ARG1, R5_ARG3);
1653       __ add(R4_ARG2, R4_ARG2, R5_ARG3);
1654       __ srdi(R5_ARG3, R5_ARG3, 2);       // size in bytes -> count
1655
1656       if (!aligned) {
1657         // check if arrays have same alignment mod 8.
1658         __ xorr(tmp1, R3_ARG1, R4_ARG2);
1659         __ andi_(R0, tmp1, 7);
1660         // Not the same alignment, but ld and std just need to be 4 byte aligned.
1661         __ bne(CR0, L_bulk); // to OR from is 8 byte aligned -> copy 2 at a time
1662
1663         // copy 1 element to align to and from on an 8 byte boundary
1664         __ andi_(R0, R3_ARG1, 7);
1665         __ beq(CR0, L_bulk);
1666
1667         __ addi(R3_ARG1, R3_ARG1, -4);
1668         __ addi(R4_ARG2, R4_ARG2, -4);
1669         __ addi(R5_ARG3, R5_ARG3, -1);
1670         __ lwzx(tmp2, R3_ARG1);
1671         __ stwx(tmp2, R4_ARG2);
1672         __ bind(L_bulk);
1673       }
1674
1675       __ cmpwi(CR0, R5_ARG3, 7);
1676       __ ble(CR0, L_tail); // copy 1 at a time if less than 8 elements remain
1677
           // CTR = number of 8-element groups, R5_ARG3 = remaining elements (0..7).
1678       __ srdi(tmp1, R5_ARG3, 3);
1679       __ andi(R5_ARG3, R5_ARG3, 7);
1680       __ mtctr(tmp1);
1681
1682       // Processor supports VSX, so use it to mass copy.
1683       // Prefetch the data into the L2 cache.
1684       __ dcbt(R3_ARG1, 0);
1685
1686       // Set DSCR pre-fetch to deepest.
1687       if (VM_Version::has_mfdscr()) {
1688         __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
1689         __ mtdscr(tmp2);
1690       }
1691       __ li(tmp1, 16); // index for the second 16-byte half of each 32-byte group
1692
1693       // Backbranch target aligned to 32-byte. Not 16-byte align as
1694       // loop contains < 8 instructions that fit inside a single
1695       // i-cache sector.
1696       __ align(32);
1697
1698       __ bind(L_vsx_loop);
1699       // Use loop with VSX load/store instructions to
1700       // copy 8 elements a time.
1701       __ addi(R3_ARG1, R3_ARG1, -32);      // Update src-=32
1702       __ addi(R4_ARG2, R4_ARG2, -32);      // Update dsc-=32
1703       __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src+16
1704       __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load src
1705       __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst+16
1706       __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst
1707       __ bdnz(L_vsx_loop);
1708
1709       // Restore DSCR pre-fetch value.
1710       if (VM_Version::has_mfdscr()) {
1711         __ load_const_optimized(tmp2, VM_Version::_dscr_val);
1712         __ mtdscr(tmp2);
1713       }
1714
1715       __ cmpwi(CR0, R5_ARG3, 0);
1716       __ beq(CR0, L_done);
1717
           // Copy the remaining elements one at a time, still downwards.
1718       __ bind(L_tail);
1719       __ mtctr(R5_ARG3);
1720       __ bind(L_tail_loop);
1721       __ lwz(R0, -4, R3_ARG1);
1722       __ stw(R0, -4, R4_ARG2);
1723       __ addi(R3_ARG1, R3_ARG1, -4);
1724       __ addi(R4_ARG2, R4_ARG2, -4);
1725       __ bdnz(L_tail_loop);
1726
1727       __ bind(L_done);
1728     }
1729   }
1730 
1731   // Emit the conjoint jint arraycopy stub selected by stub_id and return
1732   // its entry point.  For the arrayof_ variant the "from" and "to"
1733   // addresses are assumed to be heapword aligned.
1734   //
1735   // Register usage of the generated stub:
1736   //      R3_ARG1: from
1737   //      R4_ARG2: to
1738   //      R5_ARG3: count, treated as signed; R3_RET is 0 on return
1739   address generate_conjoint_int_copy(StubId stub_id) {
1740     bool aligned;
1741     if (stub_id == StubId::stubgen_jint_arraycopy_id) {
1743       aligned = false;
1745     } else if (stub_id == StubId::stubgen_arrayof_jint_arraycopy_id) {
1746       aligned = true;
1748     } else {
1749       ShouldNotReachHere();
1750     }
1751
1752     StubCodeMark mark(this, stub_id);
1753     const address entry = __ function_entry();
1754     assert_positive_int(R5_ARG3);
1755
         // The matching disjoint stub is handed to array_overlap_test as the
         // target for arrays that do not overlap.
1755     address nooverlap_target;
         if (aligned) {
1756       nooverlap_target = STUB_ENTRY(arrayof_jint_disjoint_arraycopy());
         } else {
1757       nooverlap_target = STUB_ENTRY(jint_disjoint_arraycopy());
         }
1759     array_overlap_test(nooverlap_target, 2);
1758
1760     {
1761       // A page error raised inside this scope continues at the
           // UnsafeMemoryAccess common_error_exit.
1762       UnsafeMemoryAccessMark umam(this, !aligned, false);
1763       generate_conjoint_int_copy_core(aligned);
1764     }
1765
1766     __ li(R3_RET, 0); // the stub always returns 0
1767     __ blr();
1768
1769     return entry;
1770   }
1771 
1772   // Generate core code for disjoint long copy (also used for the
1773   // uncompressed oop copy).  The elements are copied upwards.  The
1774   // "aligned" argument is accepted for symmetry with the other copy
       // cores; this routine does not consult it.
1775   //
1776   // Arguments:
1777   //      from:  R3_ARG1
1778   //      to:    R4_ARG2
1779   //      count: R5_ARG3 treated as signed
1780   //
       // The emitted code modifies all three argument registers and uses
       // R0, R6_ARG4, R7_ARG5, VSR1, VSR2, CR0 and CTR as scratch.
       //
1781   void generate_disjoint_long_copy_core(bool aligned) {
1782     Register tmp1 = R6_ARG4;
1783     Register tmp2 = R7_ARG5;
1786
1787     Label L_done;        // nothing (left) to copy
         Label L_tail;        // at most 3 elements remain
         Label L_single_loop; // copies 1 element per iteration
         Label L_vsx_loop;    // copies 4 elements per iteration
1788
1789     VectorSRegister tmp_vsr1  = VSR1;
1790     VectorSRegister tmp_vsr2  = VSR2;
1791
1792     { // FasterArrayCopy
1793       __ cmpwi(CR0, R5_ARG3, 3);
1794       __ ble(CR0, L_tail); // copy 1 at a time if less than 4 elements remain
1795
           // CTR = number of 4-element groups, R5_ARG3 = remaining elements (0..3).
1796       __ srdi(tmp1, R5_ARG3, 2);
1797       __ andi_(R5_ARG3, R5_ARG3, 3);
1798       __ mtctr(tmp1);
1799
1800       // Processor supports VSX, so use it to mass copy.
1801
1802       // Prefetch the data into the L2 cache.
1803       __ dcbt(R3_ARG1, 0);
1804
1805       // Set DSCR pre-fetch to deepest.
1806       if (VM_Version::has_mfdscr()) {
1807         __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
1808         __ mtdscr(tmp2);
1809       }
1810       __ li(tmp1, 16); // index for the second 16-byte half of each 32-byte group
1811
1812       // Backbranch target aligned to 32-byte. Not 16-byte align as
1813       // loop contains < 8 instructions that fit inside a single
1814       // i-cache sector.
1815       __ align(32);
1816
1817       __ bind(L_vsx_loop);
1818       // Use loop with VSX load/store instructions to
1819       // copy 4 elements a time.
1820       __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load src
1821       __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst
1822       __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src + 16
1823       __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
1824       __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32
1825       __ addi(R4_ARG2, R4_ARG2, 32);       // Update dsc+=32
1826       __ bdnz(L_vsx_loop);                 // Dec CTR and loop if not zero.
1827
1828       // Restore DSCR pre-fetch value.
1829       if (VM_Version::has_mfdscr()) {
1830         __ load_const_optimized(tmp2, VM_Version::_dscr_val);
1831         __ mtdscr(tmp2);
1832       }
1833
1834     } // FasterArrayCopy
1835
1836     // copy 1 element at a time
1837     __ bind(L_tail);
1838     __ cmpwi(CR0, R5_ARG3, 0);
1839     __ beq(CR0, L_done);
1840
1841     { // single element loop
1842       __ mtctr(R5_ARG3);
           // Bias both pointers by -8 so that the update forms ldu/stdu can be used.
1843       __ addi(R3_ARG1, R3_ARG1, -8);
1844       __ addi(R4_ARG2, R4_ARG2, -8);
1845
1846       __ bind(L_single_loop);
1847       __ ldu(R0, 8, R3_ARG1);
1848       __ stdu(R0, 8, R4_ARG2);
1849       __ bdnz(L_single_loop);
1850
1851     }
1852     __ bind(L_done);
1853   }
1854 
1855   // Emit the disjoint jlong arraycopy stub selected by stub_id and return
1856   // its entry point.  For the arrayof_ variant the "from" and "to"
1857   // addresses are assumed to be heapword aligned.
1858   //
1859   // Register usage of the generated stub:
1860   //      R3_ARG1: from
1861   //      R4_ARG2: to
1862   //      R5_ARG3: count, treated as signed; R3_RET is 0 on return
1863   address generate_disjoint_long_copy(StubId stub_id) {
1864     // Only the two jlong disjoint stub ids are valid here.
1865     const bool aligned = (stub_id == StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id);
1866     if (!aligned && stub_id != StubId::stubgen_jlong_disjoint_arraycopy_id) {
1873       ShouldNotReachHere();
1874     }
1875
1876     StubCodeMark mark(this, stub_id);
1877     const address entry = __ function_entry();
1878     assert_positive_int(R5_ARG3);
1879     {
1880       // A page error raised inside this scope continues at the
           // UnsafeMemoryAccess common_error_exit.
1881       UnsafeMemoryAccessMark umam(this, !aligned, false);
1882       generate_disjoint_long_copy_core(aligned);
1883     }
1884     __ li(R3_RET, 0); // the stub always returns 0
1885     __ blr();
1886
1887     return entry;
1888   }
1889 
1890   // Generate core code for conjoint long copy (also used for the
1891   // uncompressed oop copy).  The elements are copied backwards, starting
1892   // at the end of the arrays.  The "aligned" argument is accepted for
       // symmetry with the other copy cores; this routine does not consult it.
1893   //
1894   // Arguments:
1895   //      from:  R3_ARG1
1896   //      to:    R4_ARG2
1897   //      count: R5_ARG3 treated as signed
1898   //
       // The emitted code modifies all three argument registers and uses
       // R0, R6_ARG4, R7_ARG5, VSR1, VSR2, CR0 and CTR as scratch.
       //
1899   void generate_conjoint_long_copy_core(bool aligned) {
1900     Register tmp1 = R6_ARG4;
1901     Register tmp2 = R7_ARG5;
1904
1905     VectorSRegister tmp_vsr1  = VSR1;
1906     VectorSRegister tmp_vsr2  = VSR2;
1907
1908     Label L_done;        // nothing (left) to copy
         Label L_tail;        // at most 3 elements remain
         Label L_single_loop; // copies 1 element per iteration
         Label L_vsx_loop;    // copies 4 elements per iteration
1909
1910     __ cmpwi(CR0, R5_ARG3, 0);
1911     __ beq(CR0, L_done);
1912
1913     { // FasterArrayCopy
           // Let both pointers point behind the last element, the copy runs downwards.
1914       __ sldi(R5_ARG3, R5_ARG3, 3);       // count -> size in bytes
1915       __ add(R3_ARG1, R3_ARG1, R5_ARG3);
1916       __ add(R4_ARG2, R4_ARG2, R5_ARG3);
1917       __ srdi(R5_ARG3, R5_ARG3, 3);       // size in bytes -> count
1918
1919       __ cmpwi(CR0, R5_ARG3, 3);
1920       __ ble(CR0, L_tail); // copy 1 at a time if less than 4 elements remain
1921
           // CTR = number of 4-element groups, R5_ARG3 = remaining elements (0..3).
1922       __ srdi(tmp1, R5_ARG3, 2);
1923       __ andi(R5_ARG3, R5_ARG3, 3);
1924       __ mtctr(tmp1);
1925
1926       // Processor supports VSX, so use it to mass copy.
1927       // Prefetch the data into the L2 cache.
1928       __ dcbt(R3_ARG1, 0);
1929
1930       // Set DSCR pre-fetch to deepest.
1931       if (VM_Version::has_mfdscr()) {
1932         __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
1933         __ mtdscr(tmp2);
1934       }
1935       __ li(tmp1, 16); // index for the second 16-byte half of each 32-byte group
1936
1937       // Backbranch target aligned to 32-byte. Not 16-byte align as
1938       // loop contains < 8 instructions that fit inside a single
1939       // i-cache sector.
1940       __ align(32);
1941
1942       __ bind(L_vsx_loop);
1943       // Use loop with VSX load/store instructions to
1944       // copy 4 elements a time.
1945       __ addi(R3_ARG1, R3_ARG1, -32);      // Update src-=32
1946       __ addi(R4_ARG2, R4_ARG2, -32);      // Update dsc-=32
1947       __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src+16
1948       __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load src
1949       __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst+16
1950       __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst
1951       __ bdnz(L_vsx_loop);
1952
1953       // Restore DSCR pre-fetch value.
1954       if (VM_Version::has_mfdscr()) {
1955         __ load_const_optimized(tmp2, VM_Version::_dscr_val);
1956         __ mtdscr(tmp2);
1957       }
1958
1959       __ cmpwi(CR0, R5_ARG3, 0);
1960       __ beq(CR0, L_done);
1961
           // Copy the remaining elements one at a time, still downwards.
1962       __ bind(L_tail);
1963       __ mtctr(R5_ARG3);
1964       __ bind(L_single_loop);
1965       __ ld(R0, -8, R3_ARG1);
1966       __ std(R0, -8, R4_ARG2);
1967       __ addi(R3_ARG1, R3_ARG1, -8);
1968       __ addi(R4_ARG2, R4_ARG2, -8);
1969       __ bdnz(L_single_loop);
1970
1971     }
1972     __ bind(L_done);
1973   }
1974 
1975   // Emit the conjoint jlong arraycopy stub selected by stub_id and return
1976   // its entry point.  For the arrayof_ variant the "from" and "to"
1977   // addresses are assumed to be heapword aligned.
1978   //
1979   // Register usage of the generated stub:
1980   //      R3_ARG1: from
1981   //      R4_ARG2: to
1982   //      R5_ARG3: count, treated as signed; R3_RET is 0 on return
1983   address generate_conjoint_long_copy(StubId stub_id) {
1984     // Only the two jlong conjoint stub ids are valid here.
1985     const bool aligned = (stub_id == StubId::stubgen_arrayof_jlong_arraycopy_id);
1986     if (!aligned && stub_id != StubId::stubgen_jlong_arraycopy_id) {
1993       ShouldNotReachHere();
1994     }
1995
1996     StubCodeMark mark(this, stub_id);
1997     const address entry = __ function_entry();
1998     assert_positive_int(R5_ARG3);
1999
         // The matching disjoint stub is handed to array_overlap_test as the
         // target for arrays that do not overlap.
1999     address nooverlap_target;
         if (aligned) {
2000       nooverlap_target = STUB_ENTRY(arrayof_jlong_disjoint_arraycopy());
         } else {
2001       nooverlap_target = STUB_ENTRY(jlong_disjoint_arraycopy());
         }
2003     array_overlap_test(nooverlap_target, 3);
2002
2004     {
2005       // A page error raised inside this scope continues at the
           // UnsafeMemoryAccess common_error_exit.
2006       UnsafeMemoryAccessMark umam(this, !aligned, false);
2007       generate_conjoint_long_copy_core(aligned);
2008     }
2009     __ li(R3_RET, 0); // the stub always returns 0
2010     __ blr();
2011
2012     return entry;
2013   }
2014 
2015   // Emit the conjoint oop arraycopy stub selected by stub_id and return its
2016   // entry point.  The stub id determines two properties:
2017   //   - arrayof_ variants: "from" and "to" are assumed to be heapword aligned
2018   //   - _uninit variants:  the copy is decorated with IS_DEST_UNINITIALIZED
       //
       // Register usage of the generated stub:
2019   //      R3_ARG1: from
2020   //      R4_ARG2: to
2021   //      R5_ARG3: count, treated as signed
2022   //      R3_RET:  0 on return
2023   //
2024   address generate_conjoint_oop_copy(StubId stub_id) {
2025     bool aligned = false;
2026     bool dest_uninitialized = false;
2027     switch (stub_id) {
2028     case StubId::stubgen_oop_arraycopy_id:
2031       break;
2032     case StubId::stubgen_arrayof_oop_arraycopy_id:
2033       aligned = true;
2035       break;
2036     case StubId::stubgen_oop_arraycopy_uninit_id:
2038       dest_uninitialized = true;
2039       break;
2040     case StubId::stubgen_arrayof_oop_arraycopy_uninit_id:
2041       aligned = true;
2042       dest_uninitialized = true;
2043       break;
2044     default:
2045       ShouldNotReachHere();
2046     }
2047
2048     StubCodeMark mark(this, stub_id);
2049     const address entry = __ function_entry();
2050     assert_positive_int(R5_ARG3);
2051
         // The matching disjoint stub is handed to array_overlap_test as the
         // target for arrays that do not overlap.
2051     address nooverlap_target;
         if (aligned) {
2052       nooverlap_target = STUB_ENTRY(arrayof_oop_disjoint_arraycopy(dest_uninitialized));
         } else {
2053       nooverlap_target = STUB_ENTRY(oop_disjoint_arraycopy(dest_uninitialized));
         }
2055     array_overlap_test(nooverlap_target, UseCompressedOops ? 2 : 3);
2056
2057     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
2058     if (dest_uninitialized) {
2059       decorators |= IS_DEST_UNINITIALIZED;
2060     }
2061     if (aligned) {
2062       decorators |= ARRAYCOPY_ALIGNED;
2063     }
2064
         // The GC specific prologue and epilogue bracket the actual copy.
2065     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2066     bs->arraycopy_prologue(_masm, decorators, T_OBJECT, R3_ARG1, R4_ARG2, R5_ARG3, noreg, noreg);
2067
2068     if (UseCompressedOops) {
           // Compressed oops are copied with the int copy core.
2069       generate_conjoint_int_copy_core(aligned);
2070     } else {
2071 #if INCLUDE_ZGC
2072       if (UseZGC) {
             // ZGC supplies its own oop copy code.
2073         ZBarrierSetAssembler *zbs = (ZBarrierSetAssembler*)bs;
2074         zbs->generate_conjoint_oop_copy(_masm, dest_uninitialized);
2075       } else
2076 #endif
           {
2077         generate_conjoint_long_copy_core(aligned);
           }
2078     }
2079
2080     bs->arraycopy_epilogue(_masm, decorators, T_OBJECT, R4_ARG2, R5_ARG3, noreg);
2081     __ li(R3_RET, 0); // the stub always returns 0
2082     __ blr();
2083     return entry;
2084   }
2085 
2086   // Emit the disjoint oop arraycopy stub selected by stub_id and return its
2087   // entry point.  The stub id determines two properties:
2088   //   - arrayof_ variants: "from" and "to" are assumed to be heapword aligned
2089   //   - _uninit variants:  the copy is decorated with IS_DEST_UNINITIALIZED
       //
       // Register usage of the generated stub:
2090   //      R3_ARG1: from
2091   //      R4_ARG2: to
2092   //      R5_ARG3: count, treated as signed
2093   //      R3_RET:  0 on return
2094   //
2095   address generate_disjoint_oop_copy(StubId stub_id) {
2096     bool aligned = false;
2097     bool dest_uninitialized = false;
2098     switch (stub_id) {
2099     case StubId::stubgen_oop_disjoint_arraycopy_id:
2102       break;
2103     case StubId::stubgen_arrayof_oop_disjoint_arraycopy_id:
2104       aligned = true;
2106       break;
2107     case StubId::stubgen_oop_disjoint_arraycopy_uninit_id:
2109       dest_uninitialized = true;
2110       break;
2111     case StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id:
2112       aligned = true;
2113       dest_uninitialized = true;
2114       break;
2115     default:
2116       ShouldNotReachHere();
2117     }
2118
2119     StubCodeMark mark(this, stub_id);
2120     const address entry = __ function_entry();
2121     assert_positive_int(R5_ARG3);
2122
2123     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
2124     if (dest_uninitialized) {
2125       decorators |= IS_DEST_UNINITIALIZED;
2126     }
2127     if (aligned) {
2128       decorators |= ARRAYCOPY_ALIGNED;
2129     }
2130
         // The GC specific prologue and epilogue bracket the actual copy.
2131     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2132     bs->arraycopy_prologue(_masm, decorators, T_OBJECT, R3_ARG1, R4_ARG2, R5_ARG3, noreg, noreg);
2133
2134     if (UseCompressedOops) {
           // Compressed oops are copied with the int copy core.
2135       generate_disjoint_int_copy_core(aligned);
2136     } else {
2137 #if INCLUDE_ZGC
2138       if (UseZGC) {
             // ZGC supplies its own oop copy code.
2139         ZBarrierSetAssembler *zbs = (ZBarrierSetAssembler*)bs;
2140         zbs->generate_disjoint_oop_copy(_masm, dest_uninitialized);
2141       } else
2142 #endif
           {
2143         generate_disjoint_long_copy_core(aligned);
           }
2144     }
2145
2146     bs->arraycopy_epilogue(_masm, decorators, T_OBJECT, R4_ARG2, R5_ARG3, noreg);
2147     __ li(R3_RET, 0); // the stub always returns 0
2148     __ blr();
2149
2150     return entry;
2151   }
2152 
2153 
2154   // Helper for generating a dynamic type check.
2155   // Smashes only the given temp registers.
       //
       // Emits the fast-path subtype check followed by the slow-path check.
       // The generated code branches to L_success when the check succeeds;
       // on failure execution falls through to whatever the caller emits next.
       //
       // Parameters:
       //      sub_klass:          register holding the klass to be checked
       //      super_check_offset: register passed to the fast path as super check offset
       //      super_klass:        register holding the klass to check against
       //      temp1, temp2:       scratch registers handed to both checks
       //      L_success:          branch target for a successful check
2156   void generate_type_check(Register sub_klass,
2157                            Register super_check_offset,
2158                            Register super_klass,
2159                            Register temp1,
2160                            Register temp2,
2161                            Label& L_success) {
2162     assert_different_registers(sub_klass, super_check_offset, super_klass);
2163
2164     BLOCK_COMMENT("type_check:");
2165
2166     Label L_miss;
2167
         // The fast path gets L_success and L_miss as its success and failure
         // targets; no slow-path label is supplied (nullptr) -- presumably the
         // undecided case falls through into the slow path emitted right below.
2168     __ check_klass_subtype_fast_path(sub_klass, super_klass, temp1, temp2, &L_success, &L_miss, nullptr,
2169                                      super_check_offset);
2170     __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success);
2171
2172     // Fall through on failure!
2173     __ bind(L_miss);
2174   }
2175 
2176 
2177   //  Generate stub for checked oop copy.
2178   //
2179   // Arguments for generated stub:
2180   //      from:  R3
2181   //      to:    R4
2182   //      count: R5 treated as signed
2183   //      ckoff: R6 (super_check_offset)
2184   //      ckval: R7 (super_klass)
2185   //      ret:   R3 zero for success; (-1^K) where K is partial transfer count
2186   //
       // The stub copies element by element in ascending order.  Every non-null
       // element is type checked against ckval before it is stored; the copy
       // stops at the first element that fails the check.
       //
       // stub_id selects between the plain stub and the _uninit stub, which adds
       // IS_DEST_UNINITIALIZED to the decorators.  Returns the stub's entry point.
       //
2187   address generate_checkcast_copy(StubId stub_id) {
2188     const Register R3_from   = R3_ARG1;      // source array address
2189     const Register R4_to     = R4_ARG2;      // destination array address
2190     const Register R5_count  = R5_ARG3;      // elements count
2191     const Register R6_ckoff  = R6_ARG4;      // super_check_offset
2192     const Register R7_ckval  = R7_ARG5;      // super_klass
2193
2194     const Register R8_offset = R8_ARG6;      // loop var, with stride wordSize
2195     const Register R9_remain = R9_ARG7;      // loop var, with stride -1
2196     const Register R10_oop   = R10_ARG8;     // actual oop copied
2197     const Register R11_klass = R11_scratch1; // oop._klass
2198     const Register R12_tmp   = R12_scratch2;
2199     const Register R2_tmp    = R2;
2200
2201     bool dest_uninitialized;
2202     switch (stub_id) {
2203     case StubId::stubgen_checkcast_arraycopy_id:
2204       dest_uninitialized = false;
2205       break;
2206     case StubId::stubgen_checkcast_arraycopy_uninit_id:
2207       dest_uninitialized = true;
2208       break;
2209     default:
2210       ShouldNotReachHere();
2211     }
2212     //__ align(CodeEntryAlignment);
2213     StubCodeMark mark(this, stub_id);
2214     address start = __ function_entry();
2215
2216     // Assert that int is 64 bit sign extended and arrays are not conjoint.
2217 #ifdef ASSERT
2218     {
2219     assert_positive_int(R5_ARG3);
2220     const Register tmp1 = R11_scratch1, tmp2 = R12_scratch2;
2221     Label no_overlap;
2222     __ subf(tmp1, R3_ARG1, R4_ARG2); // distance in bytes
2223     __ sldi(tmp2, R5_ARG3, LogBytesPerHeapOop); // size in bytes
2224     __ cmpld(CR0, R3_ARG1, R4_ARG2); // Use unsigned comparison!
2225     __ cmpld(CR1, tmp1, tmp2);
2226     __ crnand(CR0, Assembler::less, CR1, Assembler::less);
2227     // Overlaps if Src before dst and distance smaller than size.
2228     // Continue at no_overlap otherwise; an overlap runs into the stop below.
2229     __ blt(CR0, no_overlap);
2230     __ stop("overlap in checkcast_copy");
2231     __ bind(no_overlap);
2232     }
2233 #endif
2234
2235     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST;
2236     if (dest_uninitialized) {
2237       decorators |= IS_DEST_UNINITIALIZED;
2238     }
2239
2240     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2241     bs->arraycopy_prologue(_masm, decorators, T_OBJECT, R3_from, R4_to, R5_count, /* preserve: */ R6_ckoff, R7_ckval);
2242
2243     //inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr, R12_tmp, R3_RET);
2244
2245     Label load_element, store_element, store_null, success, do_epilogue;
2246     __ or_(R9_remain, R5_count, R5_count); // Initialize loop index, and test it.
2247     __ li(R8_offset, 0);                   // Offset from start of arrays.
2248     __ bne(CR0, load_element);
2249
2250     // Empty array: Nothing to do.
         // This path returns directly, without running the arraycopy epilogue.
2251     __ li(R3_RET, 0);           // Return 0 on (trivial) success.
2252     __ blr();
2253
2254     // ======== begin loop ========
2255     // (Entry is load_element.)
         // store_element is reached after a successful type check, store_null is
         // passed to load_heap_oop below (presumably taken for a null element,
         // which needs neither type check nor encoding).
2256     __ align(OptoLoopAlignment);
2257     __ bind(store_element);
2258     if (UseCompressedOops) {
2259       __ encode_heap_oop_not_null(R10_oop);
2260       __ bind(store_null);
2261       __ stw(R10_oop, R8_offset, R4_to);
2262     } else {
2263       __ bind(store_null);
2264 #if INCLUDE_ZGC
2265       if (UseZGC) {
2266         __ store_heap_oop(R10_oop, R8_offset, R4_to, R11_scratch1, R12_tmp, noreg,
2267                           MacroAssembler::PRESERVATION_FRAME_LR_GP_REGS,
2268                           dest_uninitialized ? IS_DEST_UNINITIALIZED : 0);
2269       } else
2270 #endif
2271       __ std(R10_oop, R8_offset, R4_to);
2272     }
2273
2274     __ addi(R8_offset, R8_offset, heapOopSize);   // Step to next offset.
2275     __ addic_(R9_remain, R9_remain, -1);          // Decrement the count.
2276     __ beq(CR0, success);
2277
2278     // ======== loop entry is here ========
2279     __ bind(load_element);
2280 #if INCLUDE_ZGC
2281     if (UseZGC) {
2282       __ load_heap_oop(R10_oop, R8_offset, R3_from,
2283                        R11_scratch1, R12_tmp,
2284                        MacroAssembler::PRESERVATION_FRAME_LR_GP_REGS,
2285                        0, &store_null);
2286     } else
2287 #endif
2288     __ load_heap_oop(R10_oop, R8_offset, R3_from,
2289                      R11_scratch1, R12_tmp,
2290                      MacroAssembler::PRESERVATION_FRAME_LR_GP_REGS,
2291                      AS_RAW, &store_null);
2292
2293     __ load_klass(R11_klass, R10_oop); // Query the object klass.
2294
2295     generate_type_check(R11_klass, R6_ckoff, R7_ckval, R12_tmp, R2_tmp,
2296                         // Branch to this on success:
2297                         store_element);
2298     // ======== end loop ========
2299
2300     // It was a real error; we must depend on the caller to finish the job.
2301     // Register R9_remain has number of *remaining* oops, R5_count number of *total* oops.
2302     // Emit GC store barriers for the oops we have copied (R5_count minus R9_remain),
2303     // and report their number to the caller.
         // subf_ also sets CR0: if no oop was copied at all (K == 0) the stub
         // returns right away with R3_RET == -1 and skips the epilogue.
2304     __ subf_(R5_count, R9_remain, R5_count);
2305     __ nand(R3_RET, R5_count, R5_count);   // report (-1^K) to caller
2306     __ bne(CR0, do_epilogue);
2307     __ blr();
2308
         // All elements were copied: R5_count still holds the total count.
2309     __ bind(success);
2310     __ li(R3_RET, 0);
2311
2312     __ bind(do_epilogue);
2313     bs->arraycopy_epilogue(_masm, decorators, T_OBJECT, R4_to, R5_count, /* preserve */ R3_RET);
2314
2315     __ blr();
2316     return start;
2317   }
2318 
2319 
2320   //  Generate 'unsafe' array copy stub.
2321   //  Though just as safe as the other stubs, it takes an unscaled
2322   //  size_t argument instead of an element count.
2323   //
2324   // Arguments for generated stub:
2325   //      from:  R3
2326   //      to:    R4
2327   //      count: R5 byte count, treated as ssize_t, can be zero
2328   //
2329   // Examines the alignment of the operands and dispatches
2330   // to a long, int, short, or byte copy loop.
2331   //
       // The four parameters are the entry points of the copy stubs the
       // generated code branches to.  The stub does not return by itself: it
       // scales the count and branches (b, not a call) to the selected entry.
       // Returns the entry point of the generated stub.
       //
2332   address generate_unsafe_copy(address byte_copy_entry,
2333                                address short_copy_entry,
2334                                address int_copy_entry,
2335                                address long_copy_entry) {
2336
2337     const Register R3_from   = R3_ARG1;      // source array address
2338     const Register R4_to     = R4_ARG2;      // destination array address
2339     const Register R5_count  = R5_ARG3;      // elements count (as long on PPC64)
2340
2341     const Register R6_bits   = R6_ARG4;      // test copy of low bits
2342     const Register R7_tmp    = R7_ARG5;      // only referenced by the disabled counter code below
2343
2344     //__ align(CodeEntryAlignment);
2345     StubId stub_id = StubId::stubgen_unsafe_arraycopy_id;
2346     StubCodeMark mark(this, stub_id);
2347     address start = __ function_entry();
2348
2349     // Bump this on entry, not on exit:
2350     //inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr, R6_bits, R7_tmp);
2351
2352     Label short_copy, int_copy, long_copy;
2353
         // OR the two addresses and the byte count: a low bit of the result is
         // clear only if it is clear in all three operands.
2354     __ orr(R6_bits, R3_from, R4_to);
2355     __ orr(R6_bits, R6_bits, R5_count);
2356     __ andi_(R0, R6_bits, (BytesPerLong-1));
2357     __ beq(CR0, long_copy);
2358
2359     __ andi_(R0, R6_bits, (BytesPerInt-1));
2360     __ beq(CR0, int_copy);
2361
2362     __ andi_(R0, R6_bits, (BytesPerShort-1));
2363     __ beq(CR0, short_copy);
2364
2365     // byte_copy:
         // The byte count already is the element count.
2366     __ b(byte_copy_entry);
2367
         // The remaining cases convert the byte count into an element count.
         // NOTE(review): srwi shifts the low 32 bits only, whereas the header
         // describes the count as ssize_t -- confirm callers never pass 4G or more.
2368     __ bind(short_copy);
2369     __ srwi(R5_count, R5_count, LogBytesPerShort);
2370     __ b(short_copy_entry);
2371
2372     __ bind(int_copy);
2373     __ srwi(R5_count, R5_count, LogBytesPerInt);
2374     __ b(int_copy_entry);
2375
2376     __ bind(long_copy);
2377     __ srwi(R5_count, R5_count, LogBytesPerLong);
2378     __ b(long_copy_entry);
2379
2380     return start;
2381   }
2382 
2383 
2384   // Perform range checks on the proposed arraycopy.
2385   // Kills the two temps (and CR0), but nothing else.
2386   // src_pos, dst_pos and length are only read; their sign bits are not cleaned here.
       //
       // The emitted code branches to L_failed if
       //      src_pos + length > arrayOop(src)->length()   or
       //      dst_pos + length > arrayOop(dst)->length()
       // and falls through otherwise.  Both comparisons are signed 64-bit
       // compares (cmpd) against the sign-extended array length (lwa).
2387   void arraycopy_range_checks(Register src,     // source array oop
2388                               Register src_pos, // source position
2389                               Register dst,     // destination array oop
2390                               Register dst_pos, // destination position
2391                               Register length,  // length of copy
2392                               Register temp1, Register temp2,
2393                               Label& L_failed) {
2394     BLOCK_COMMENT("arraycopy_range_checks:");
2395
2396     const Register array_length = temp1;  // scratch
2397     const Register end_pos      = temp2;  // scratch
2398
2399     //  if (src_pos + length > arrayOop(src)->length() ) FAIL;
2400     __ lwa(array_length, arrayOopDesc::length_offset_in_bytes(), src);
2401     __ add(end_pos, src_pos, length);  // src_pos + length
2402     __ cmpd(CR0, end_pos, array_length);
2403     __ bgt(CR0, L_failed);
2404
2405     //  if (dst_pos + length > arrayOop(dst)->length() ) FAIL;
2406     __ lwa(array_length, arrayOopDesc::length_offset_in_bytes(), dst);
2407     __ add(end_pos, dst_pos, length);  // dst_pos + length
2408     __ cmpd(CR0, end_pos, array_length);
2409     __ bgt(CR0, L_failed);
2410
2411     BLOCK_COMMENT("arraycopy_range_checks done");
2412   }
2413 
2414 
2415   // Helper for generate_unsafe_setmemory
2416   //
2417   // Atomically fill an array of memory using 1-, 2-, 4-, or 8-byte chunks and return.
       //
       // Parameters:
       //      elem_size: size in bytes of each store (1, 2, 4 or 8)
       //      dest:      register holding the destination address; advanced by the loop
       //      size:      register holding the byte count (treated as a 32 bit value)
       //      byteVal:   register holding the fill byte; widened in place to elem_size bytes
       //      _masm:     assembler the code is emitted into
       //
       // Every path of the emitted code ends in a return (bclr/blr), so nothing
       // emitted behind this helper is reached by falling through.
       // Uses R0, CR0 and CTR as scratch.
2418   static void do_setmemory_atomic_loop(int elem_size, Register dest, Register size, Register byteVal,
2419                                        MacroAssembler *_masm) {
2420
2421     Label L_Loop, L_Tail; // 2x unrolled loop
2422
2423     // Propagate byte to required width
         // Each step doubles the number of low-order bytes holding the fill value.
2424     if (elem_size > 1) __ rldimi(byteVal, byteVal,  8, 64 - 2 *  8);
2425     if (elem_size > 2) __ rldimi(byteVal, byteVal, 16, 64 - 2 * 16);
2426     if (elem_size > 4) __ rldimi(byteVal, byteVal, 32, 64 - 2 * 32);
2427
         // R0 = number of iterations of the 2x unrolled loop; skip it if zero.
2428     __ srwi_(R0, size, exact_log2(2 * elem_size)); // size is a 32 bit value
2429     __ beq(CR0, L_Tail);
2430     __ mtctr(R0);
2431
2432     __ align(32); // loop alignment
2433     __ bind(L_Loop);
2434     __ store_sized_value(byteVal, 0, dest, elem_size);
2435     __ store_sized_value(byteVal, elem_size, dest, elem_size);
2436     __ addi(dest, dest, 2 * elem_size);
2437     __ bdnz(L_Loop);
2438
2439     __ bind(L_Tail);
         // Return if no single element is left over, else store it and return.
2440     __ andi_(R0, size, elem_size);
2441     __ bclr(Assembler::bcondCRbiIs1, Assembler::bi0(CR0, Assembler::equal), Assembler::bhintbhBCLRisReturn);
2442     __ store_sized_value(byteVal, 0, dest, elem_size);
2443     __ blr();
2444   }
2445 
2446   //
2447   //  Generate 'unsafe' set memory stub
2448   //  Though just as safe as the other stubs, it takes an unscaled
2449   //  size_t (# bytes) argument instead of an element count.
2450   //
2451   //  Input:
2452   //    R3_ARG1   - destination array address
2453   //    R4_ARG2   - byte count (size_t)
2454   //    R5_ARG3   - byte value
2455   //
       //  The stub picks the widest store size (8, 4, 2 or 1 bytes) that both
       //  the destination address and the byte count are a multiple of, and
       //  fills the memory with stores of that size.
       //
       //  The unsafe_byte_fill parameter is not used by this implementation.
       //  Returns the entry point of the generated stub.
       //
2456   address generate_unsafe_setmemory(address unsafe_byte_fill) {
2457     __ align(CodeEntryAlignment);
2458     StubCodeMark mark(this, StubId::stubgen_unsafe_setmemory_id);
2459     address start = __ function_entry();
2460
2461     // bump this on entry, not on exit:
2462     // inc_counter_np(SharedRuntime::_unsafe_set_memory_ctr);
2463
2464     {
2465       Label L_fill8Bytes, L_fill4Bytes, L_fillBytes;
2466
2467       const Register dest = R3_ARG1;
2468       const Register size = R4_ARG2;
2469       const Register byteVal = R5_ARG3;
2470       const Register rScratch1 = R6;
2471
2472       // fill_to_memory_atomic(unsigned char*, unsigned long, unsigned char)
2473
2474       // Check for pointer & size alignment
           // A low bit of (dest | size) is clear only if it is clear in both.
2475       __ orr(rScratch1, dest, size);
2476
2477       __ andi_(R0, rScratch1, 7);
2478       __ beq(CR0, L_fill8Bytes);
2479
2480       __ andi_(R0, rScratch1, 3);
2481       __ beq(CR0, L_fill4Bytes);
2482
2483       __ andi_(R0, rScratch1, 1);
2484       __ bne(CR0, L_fillBytes);
2485
2486       // Mark remaining code as such which performs Unsafe accesses.
2487       UnsafeMemoryAccessMark umam(this, true, false);
2488
2489       // At this point, we know that dest and size are both
2490       // multiples of 2
           // Each do_setmemory_atomic_loop sequence returns to the caller by
           // itself, so the variants below are only entered via their labels.
2491       do_setmemory_atomic_loop(2, dest, size, byteVal, _masm);
2492
2493       __ align(32);
2494       __ bind(L_fill8Bytes);
2495       // At this point, we know that dest and size are both
2496       // multiples of 8
2497       do_setmemory_atomic_loop(8, dest, size, byteVal, _masm);
2498
2499       __ align(32);
2500       __ bind(L_fill4Bytes);
2501       // At this point, we know that dest and size are both
2502       // multiples of 4
2503       do_setmemory_atomic_loop(4, dest, size, byteVal, _masm);
2504
2505       __ align(32);
2506       __ bind(L_fillBytes);
           // No common alignment: fill byte by byte.
2507       do_setmemory_atomic_loop(1, dest, size, byteVal, _masm);
2508     }
2509
2510     return start;
2511   }
2512 
2513 
2514   //
2515   //  Generate generic array copy stubs
2516   //
2517   //  Input:
2518   //    R3    -  src oop
2519   //    R4    -  src_pos
2520   //    R5    -  dst oop
2521   //    R6    -  dst_pos
2522   //    R7    -  element count
2523   //
2524   //  Output:
2525   //    R3 ==  0  -  success
2526   //    R3 == -1  -  need to call System.arraycopy
2527   //
2528   address generate_generic_copy(address entry_jbyte_arraycopy,
2529                                 address entry_jshort_arraycopy,
2530                                 address entry_jint_arraycopy,
2531                                 address entry_oop_arraycopy,
2532                                 address entry_disjoint_oop_arraycopy,
2533                                 address entry_jlong_arraycopy,
2534                                 address entry_checkcast_arraycopy) {
         // Emits the generic arraycopy stub and returns its entry address.
         // The stub validates its five arguments, inspects the klass layout
         // helper of the source array and then tail-branches to one of the
         // specialised copy entries passed in as parameters. If any check
         // fails it returns -1 in R3_RET (see L_failed at the end).
2535     Label L_failed, L_objArray;
2536 
2537     // Input registers
2538     const Register src       = R3_ARG1;  // source array oop
2539     const Register src_pos   = R4_ARG2;  // source position
2540     const Register dst       = R5_ARG3;  // destination array oop
2541     const Register dst_pos   = R6_ARG4;  // destination position
2542     const Register length    = R7_ARG5;  // elements count
2543 
2544     // registers used as temp
2545     const Register src_klass = R8_ARG6;  // source array klass
2546     const Register dst_klass = R9_ARG7;  // destination array klass
2547     const Register lh        = R10_ARG8; // layout handler
2548     const Register temp      = R2;
2549 
2550     //__ align(CodeEntryAlignment);
2551     StubId stub_id = StubId::stubgen_generic_arraycopy_id;
2552     StubCodeMark mark(this, stub_id);
2553     address start = __ function_entry();
2554 
2555     // Bump this on entry, not on exit:
2556     //inc_counter_np(SharedRuntime::_generic_array_copy_ctr, lh, temp);
2557 
2558     // In principle, the int arguments could be dirty.
2559 
2560     //-----------------------------------------------------------------------
2561     // Assembler stubs will be used for this call to arraycopy
2562     // if the following conditions are met:
2563     //
2564     // (1) src and dst must not be null.
2565     // (2) src_pos must not be negative.
2566     // (3) dst_pos must not be negative.
2567     // (4) length  must not be negative.
2568     // (5) src klass and dst klass should be the same and not null.
2569     // (6) src and dst should be arrays.
2570     // (7) src_pos + length must not exceed length of src.
2571     // (8) dst_pos + length must not exceed length of dst.
2572     BLOCK_COMMENT("arraycopy initial argument checks");
2573 
         // Checks (1)-(4) are folded into the single bit CR1.eq: each record-form
         // extsw_ sign-extends an int argument in place and sets CR0, and the
         // following cror ORs the relevant CR0/CR5 bit into the accumulator, so
         // one beq at the end rejects all failing cases.
2574     __ cmpdi(CR1, src, 0);      // if (src == nullptr) return -1;
2575     __ extsw_(src_pos, src_pos); // if (src_pos < 0) return -1;
2576     __ cmpdi(CR5, dst, 0);      // if (dst == nullptr) return -1;
2577     __ cror(CR1, Assembler::equal, CR0, Assembler::less);
2578     __ extsw_(dst_pos, dst_pos); // if (dst_pos < 0) return -1;
2579     __ cror(CR5, Assembler::equal, CR0, Assembler::less);
2580     __ extsw_(length, length);   // if (length < 0) return -1;
2581     __ cror(CR1, Assembler::equal, CR5, Assembler::equal);
2582     __ cror(CR1, Assembler::equal, CR0, Assembler::less);
2583     __ beq(CR1, L_failed);
2584 
2585     BLOCK_COMMENT("arraycopy argument klass checks");
2586     __ load_klass(src_klass, src);
2587     __ load_klass(dst_klass, dst);
2588 
2589     // Load layout helper
2590     //
2591     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2592     // 32        30    24            16              8     2                 0
2593     //
2594     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2595     //
2596 
2597     int lh_offset = in_bytes(Klass::layout_helper_offset());
2598 
2599     // Load the 32-bit layout helper of the source klass. It is only compared
         // with word compares (cmpw/cmpwi) and bit-field extracted below.
2600     __ lwz(lh, lh_offset, src_klass);
2601 
2602     // Handle objArrays completely differently...
2603     jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2604     __ load_const_optimized(temp, objArray_lh, R0);
2605     __ cmpw(CR0, lh, temp);
2606     __ beq(CR0, L_objArray);
2607 
2608     __ cmpd(CR5, src_klass, dst_klass);          // if (src->klass() != dst->klass()) return -1;
2609     __ cmpwi(CR6, lh, Klass::_lh_neutral_value); // if (!src->is_Array()) return -1;
2610 
         // CR5.eq = !(klasses equal && lh < _lh_neutral_value): fail unless both hold.
2611     __ crnand(CR5, Assembler::equal, CR6, Assembler::less);
2612     __ beq(CR5, L_failed);
2613 
2614     // At this point, it is known to be a typeArray (array_tag 0x3).
2615 #ifdef ASSERT
2616     { Label L;
2617       jint lh_prim_tag_in_place = (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2618       __ load_const_optimized(temp, lh_prim_tag_in_place, R0);
2619       __ cmpw(CR0, lh, temp);
2620       __ bge(CR0, L);            // ok if lh >= type-array tag in place (signed word compare)
2621       __ stop("must be a primitive array");
2622       __ bind(L);
2623     }
2624 #endif
2625 
         // dst_klass is passed as a temp here; it is reused as 'offset' below.
2626     arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
2627                            temp, dst_klass, L_failed);
2628 
2629     // TypeArrayKlass
2630     //
2631     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2632     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2633     //
2634 
2635     const Register offset = dst_klass;    // array offset
2636     const Register elsize = src_klass;    // log2 element size
2637 
         // Rotate lh right by _lh_header_size_shift and keep the low
         // log2(_lh_header_size_mask + 1) bits: the header-size field.
2638     __ rldicl(offset, lh, 64 - Klass::_lh_header_size_shift, 64 - exact_log2(Klass::_lh_header_size_mask + 1));
2639     __ andi(elsize, lh, Klass::_lh_log2_element_size_mask);
2640     __ add(src, offset, src);       // src array offset
2641     __ add(dst, offset, dst);       // dst array offset
2642 
2643     // Next registers should be set before the jump to corresponding stub.
2644     const Register from     = R3_ARG1;  // source array address
2645     const Register to       = R4_ARG2;  // destination array address
2646     const Register count    = R5_ARG3;  // elements count
2647 
2648     // 'from', 'to', 'count' registers should be set in this order
2649     // since they are the same as 'src', 'src_pos', 'dst'.
2650 
2651     BLOCK_COMMENT("scale indexes to element size");
2652     __ sld(src_pos, src_pos, elsize);
2653     __ sld(dst_pos, dst_pos, elsize);
2654     __ add(from, src_pos, src);  // src_addr
2655     __ add(to, dst_pos, dst);    // dst_addr
2656     __ mr(count, length);        // length
2657 
2658     BLOCK_COMMENT("choose copy loop based on element size");
2659     // Using conditional branches with range 32kB.
         // bo/bi encode "branch if CR0.eq is set"; each cmpwi/bc pair dispatches
         // one element size, the remaining case (long) is an unconditional branch.
2660     const int bo = Assembler::bcondCRbiIs1, bi = Assembler::bi0(CR0, Assembler::equal);
2661     __ cmpwi(CR0, elsize, 0);
2662     __ bc(bo, bi, entry_jbyte_arraycopy);
2663     __ cmpwi(CR0, elsize, LogBytesPerShort);
2664     __ bc(bo, bi, entry_jshort_arraycopy);
2665     __ cmpwi(CR0, elsize, LogBytesPerInt);
2666     __ bc(bo, bi, entry_jint_arraycopy);
2667 #ifdef ASSERT
2668     { Label L;
2669       __ cmpwi(CR0, elsize, LogBytesPerLong);
2670       __ beq(CR0, L);
2671       __ stop("must be long copy, but elsize is wrong");
2672       __ bind(L);
2673     }
2674 #endif
2675     __ b(entry_jlong_arraycopy);
2676 
2677     // ObjArrayKlass
2678   __ bind(L_objArray);
2679     // live at this point:  src_klass, dst_klass, src[_pos], dst[_pos], length
2680 
2681     Label L_disjoint_plain_copy, L_checkcast_copy;
2682     //  test array classes for subtyping
2683     __ cmpd(CR0, src_klass, dst_klass);         // usual case is exact equality
2684     __ bne(CR0, L_checkcast_copy);
2685 
2686     // Identically typed arrays can be copied without element-wise checks.
         // lh is no longer needed on this path and serves as a temp.
2687     arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
2688                            temp, lh, L_failed);
2689 
2690     __ addi(src, src, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //src offset
2691     __ addi(dst, dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //dst offset
2692     __ sldi(src_pos, src_pos, LogBytesPerHeapOop);
2693     __ sldi(dst_pos, dst_pos, LogBytesPerHeapOop);
2694     __ add(from, src_pos, src);  // src_addr
2695     __ add(to, dst_pos, dst);    // dst_addr
2696     __ mr(count, length);        // length
2697     __ b(entry_oop_arraycopy);
2698 
2699   __ bind(L_checkcast_copy);
2700     // live at this point:  src_klass, dst_klass
2701     {
2702       // Before looking at dst.length, make sure dst is also an objArray.
           // lh still holds the objArray layout helper loaded from src_klass.
2703       __ lwz(temp, lh_offset, dst_klass);
2704       __ cmpw(CR0, lh, temp);
2705       __ bne(CR0, L_failed);
2706 
2707       // It is safe to examine both src.length and dst.length.
2708       arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
2709                              temp, lh, L_failed);
2710 
2711       // Marshal the base address arguments now, freeing registers.
2712       __ addi(src, src, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //src offset
2713       __ addi(dst, dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //dst offset
2714       __ sldi(src_pos, src_pos, LogBytesPerHeapOop);
2715       __ sldi(dst_pos, dst_pos, LogBytesPerHeapOop);
2716       __ add(from, src_pos, src);  // src_addr
2717       __ add(to, dst_pos, dst);    // dst_addr
2718       __ mr(count, length);        // length
2719 
2720       Register sco_temp = R6_ARG4;             // This register is free now.
2721       assert_different_registers(from, to, count, sco_temp,
2722                                  dst_klass, src_klass);
2723 
2724       // Generate the type check.
           // NOTE(review): the last argument is presumably the label taken when
           // src_klass is a subtype of dst_klass - confirm in generate_type_check.
           // This path is only reached when the two klasses differ (bne above),
           // so src and dst are distinct array objects and a disjoint copy fits.
2725       int sco_offset = in_bytes(Klass::super_check_offset_offset());
2726       __ lwz(sco_temp, sco_offset, dst_klass);
2727       generate_type_check(src_klass, sco_temp, dst_klass,
2728                           temp, /* temp */ R10_ARG8, L_disjoint_plain_copy);
2729 
2730       // Fetch destination element klass from the ObjArrayKlass header.
2731       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2732 
2733       // The checkcast_copy loop needs two extra arguments:
2734       __ ld(R7_ARG5, ek_offset, dst_klass);   // dest elem klass
2735       __ lwz(R6_ARG4, sco_offset, R7_ARG5);   // sco of elem klass
2736       __ b(entry_checkcast_arraycopy);
2737     }
2738 
2739     __ bind(L_disjoint_plain_copy);
2740     __ b(entry_disjoint_oop_arraycopy);
2741 
2742   __ bind(L_failed);
2743     __ li(R3_RET, -1); // return -1
2744     __ blr();
2745     return start;
2746   }
2747 
2748   // Arguments for generated stub:
2749   //   R3_ARG1   - source byte array address
2750   //   R4_ARG2   - destination byte array address
2751   //   R5_ARG3   - round key array
2752   address generate_aescrypt_encryptBlock() {
         // Emits the stub that AES-encrypts one 16-byte block and returns its
         // entry address. The stub loads the (possibly unaligned) input block,
         // XORs it with the first round key, applies vcipher once per further
         // round key and vcipherlast for the final one, then stores 16 bytes to
         // 'to'. The number of rounds is selected by the key array length
         // (44, 52 or 60 ints).
         // NOTE(review): the assert text mentions SSE, which is not a PPC
         // feature; the wording looks carried over from another platform.
2753     assert(UseAES, "need AES instructions and misaligned SSE support");
2754     StubId stub_id = StubId::stubgen_aescrypt_encryptBlock_id;
2755     StubCodeMark mark(this, stub_id);
2756 
2757     address start = __ function_entry();
2758 
2759     Label L_doLast, L_error;
2760 
2761     Register from           = R3_ARG1;  // source array address
2762     Register to             = R4_ARG2;  // destination array address
2763     Register key            = R5_ARG3;  // round key array
2764 
2765     Register keylen         = R8;
2766     Register temp           = R9;
2767     Register keypos         = R10;
2768     Register fifteen        = R12;
2769 
2770     VectorRegister vRet     = VR0;
2771 
2772     VectorRegister vKey1    = VR1;
2773     VectorRegister vKey2    = VR2;
2774     VectorRegister vKey3    = VR3;
2775     VectorRegister vKey4    = VR4;
2776 
2777     VectorRegister fromPerm = VR5;
2778     VectorRegister keyPerm  = VR6;
2779     VectorRegister toPerm   = VR7;
2780     VectorRegister fSplt    = VR8;
2781 
2782     VectorRegister vTmp1    = VR9;
2783     VectorRegister vTmp2    = VR10;
2784     VectorRegister vTmp3    = VR11;  // not used in this stub
2785     VectorRegister vTmp4    = VR12;  // not used in this stub
2786 
2787     __ li              (fifteen, 15);
2788 
2789     // load unaligned from[0-15] to vRet
         // Two quadword loads (at from and from + 15) cover the 16 input bytes;
         // vperm with the lvsl-derived control vector picks them out.
2790     __ lvx             (vRet, from);
2791     __ lvx             (vTmp1, fifteen, from);
2792     __ lvsl            (fromPerm, from);
2793 #ifdef VM_LITTLE_ENDIAN
2794     __ vspltisb        (fSplt, 0x0f);
2795     __ vxor            (fromPerm, fromPerm, fSplt);
2796 #endif
2797     __ vperm           (vRet, vRet, vTmp1, fromPerm);
2798 
2799     // load keylen (44 or 52 or 60)
         // The displacement is length_offset - base_offset, i.e. 'key' addresses
         // the first int element and the array length is read from the header.
2800     __ lwz             (keylen, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT), key);
2801 
2802     // to load keys
2803     __ load_perm       (keyPerm, key);
2804 #ifdef VM_LITTLE_ENDIAN
2805     __ vspltisb        (vTmp2, -16);
2806     __ vrld            (keyPerm, keyPerm, vTmp2);
2807     __ vrld            (keyPerm, keyPerm, vTmp2);
2808     __ vsldoi          (keyPerm, keyPerm, keyPerm, 8);
2809 #endif
2810 
         // Round-key loading scheme used from here on: each lvx fetches the next
         // 16-byte chunk of the key array and vec_perm merges it with the
         // previously fetched chunk under keyPerm. The register holding the
         // newest raw chunk is therefore always an input to the next vec_perm.
         // NOTE(review): vec_perm is a helper defined elsewhere; its 3-operand
         // form appears to use the first operand as both input and destination.
2811     // load the 1st round key to vTmp1
2812     __ lvx             (vTmp1, key);
2813     __ li              (keypos, 16);
2814     __ lvx             (vKey1, keypos, key);
2815     __ vec_perm        (vTmp1, vKey1, keyPerm);
2816 
2817     // 1st round
2818     __ vxor            (vRet, vRet, vTmp1);
2819 
2820     // load the 2nd round key to vKey1
2821     __ li              (keypos, 32);
2822     __ lvx             (vKey2, keypos, key);
2823     __ vec_perm        (vKey1, vKey2, keyPerm);
2824 
2825     // load the 3rd round key to vKey2
2826     __ li              (keypos, 48);
2827     __ lvx             (vKey3, keypos, key);
2828     __ vec_perm        (vKey2, vKey3, keyPerm);
2829 
2830     // load the 4th round key to vKey3
2831     __ li              (keypos, 64);
2832     __ lvx             (vKey4, keypos, key);
2833     __ vec_perm        (vKey3, vKey4, keyPerm);
2834 
2835     // load the 5th round key to vKey4
2836     __ li              (keypos, 80);
2837     __ lvx             (vTmp1, keypos, key);
2838     __ vec_perm        (vKey4, vTmp1, keyPerm);
2839 
2840     // 2nd - 5th rounds
2841     __ vcipher         (vRet, vRet, vKey1);
2842     __ vcipher         (vRet, vRet, vKey2);
2843     __ vcipher         (vRet, vRet, vKey3);
2844     __ vcipher         (vRet, vRet, vKey4);
2845 
2846     // load the 6th round key to vKey1
         // vTmp1 still holds the raw chunk from offset 80.
2847     __ li              (keypos, 96);
2848     __ lvx             (vKey2, keypos, key);
2849     __ vec_perm        (vKey1, vTmp1, vKey2, keyPerm);
2850 
2851     // load the 7th round key to vKey2
2852     __ li              (keypos, 112);
2853     __ lvx             (vKey3, keypos, key);
2854     __ vec_perm        (vKey2, vKey3, keyPerm);
2855 
2856     // load the 8th round key to vKey3
2857     __ li              (keypos, 128);
2858     __ lvx             (vKey4, keypos, key);
2859     __ vec_perm        (vKey3, vKey4, keyPerm);
2860 
2861     // load the 9th round key to vKey4
2862     __ li              (keypos, 144);
2863     __ lvx             (vTmp1, keypos, key);
2864     __ vec_perm        (vKey4, vTmp1, keyPerm);
2865 
2866     // 6th - 9th rounds
2867     __ vcipher         (vRet, vRet, vKey1);
2868     __ vcipher         (vRet, vRet, vKey2);
2869     __ vcipher         (vRet, vRet, vKey3);
2870     __ vcipher         (vRet, vRet, vKey4);
2871 
2872     // load the 10th round key to vKey1
2873     __ li              (keypos, 160);
2874     __ lvx             (vKey2, keypos, key);
2875     __ vec_perm        (vKey1, vTmp1, vKey2, keyPerm);
2876 
2877     // load the 11th round key to vKey2
2878     __ li              (keypos, 176);
2879     __ lvx             (vTmp1, keypos, key);
2880     __ vec_perm        (vKey2, vTmp1, keyPerm);
2881 
2882     // if all round keys are loaded, skip next 4 rounds
         // keylen == 44: vKey1/vKey2 already hold the final two round keys.
2883     __ cmpwi           (CR0, keylen, 44);
2884     __ beq             (CR0, L_doLast);
2885 
2886     // 10th - 11th rounds
2887     __ vcipher         (vRet, vRet, vKey1);
2888     __ vcipher         (vRet, vRet, vKey2);
2889 
2890     // load the 12th round key to vKey1
2891     __ li              (keypos, 192);
2892     __ lvx             (vKey2, keypos, key);
2893     __ vec_perm        (vKey1, vTmp1, vKey2, keyPerm);
2894 
2895     // load the 13th round key to vKey2
2896     __ li              (keypos, 208);
2897     __ lvx             (vTmp1, keypos, key);
2898     __ vec_perm        (vKey2, vTmp1, keyPerm);
2899 
2900     // if all round keys are loaded, skip next 2 rounds
2901     __ cmpwi           (CR0, keylen, 52);
2902     __ beq             (CR0, L_doLast);
2903 
         // Debug builds only: any length other than 44, 52 or 60 stops the VM.
2904 #ifdef ASSERT
2905     __ cmpwi           (CR0, keylen, 60);
2906     __ bne             (CR0, L_error);
2907 #endif
2908 
2909     // 12th - 13th rounds
2910     __ vcipher         (vRet, vRet, vKey1);
2911     __ vcipher         (vRet, vRet, vKey2);
2912 
2913     // load the 14th round key to vKey1
2914     __ li              (keypos, 224);
2915     __ lvx             (vKey2, keypos, key);
2916     __ vec_perm        (vKey1, vTmp1, vKey2, keyPerm);
2917 
2918     // load the 15th round key to vKey2
2919     __ li              (keypos, 240);
2920     __ lvx             (vTmp1, keypos, key);
2921     __ vec_perm        (vKey2, vTmp1, keyPerm);
2922 
         // All three key lengths arrive here with the last two round keys in
         // vKey1 and vKey2.
2923     __ bind(L_doLast);
2924 
2925     // last two rounds
2926     __ vcipher         (vRet, vRet, vKey1);
2927     __ vcipherlast     (vRet, vRet, vKey2);
2928 
2929 #ifdef VM_LITTLE_ENDIAN
2930     // toPerm = 0x0F0E0D0C0B0A09080706050403020100
         // keypos is 176, 208 or 240 on the paths reaching this point.
2931     __ lvsl            (toPerm, keypos); // keypos is a multiple of 16
2932     __ vxor            (toPerm, toPerm, fSplt);
2933 
2934     // Swap Bytes
2935     __ vperm           (vRet, vRet, vRet, toPerm);
2936 #endif
2937 
2938     // store result (unaligned)
2939     // Note: We can't use a read-modify-write sequence which touches additional Bytes.
         // The result is moved to two GPRs (one doubleword each) and written with
         // two 8-byte stores; the endian macros select which half goes to to + 8.
2940     Register lo = temp, hi = fifteen; // Reuse
2941     __ vsldoi          (vTmp1, vRet, vRet, 8);
2942     __ mfvrd           (hi, vRet);
2943     __ mfvrd           (lo, vTmp1);
2944     __ std             (hi, 0 LITTLE_ENDIAN_ONLY(+ 8), to);
2945     __ std             (lo, 0 BIG_ENDIAN_ONLY(+ 8), to);
2946 
2947     __ blr();
2948 
         // L_error is only branched to, and only bound, in ASSERT builds.
2949 #ifdef ASSERT
2950     __ bind(L_error);
2951     __ stop("aescrypt_encryptBlock: invalid key length");
2952 #endif
2953      return start;
2954   }
2955 
2956   // Arguments for generated stub:
2957   //   R3_ARG1   - source byte array address
2958   //   R4_ARG2   - destination byte array address
2959   //   R5_ARG3   - sessionKe (key) in little endian int array
2960   address generate_aescrypt_decryptBlock() {
         // Emits the stub that AES-decrypts one 16-byte block and returns its
         // entry address. Round keys are consumed from the highest offset of the
         // key array downwards. The key length (44, 52 or 60 ints) selects one of
         // three prologues that handle the extra rounds; all of them join at
         // L_doLast, where the remaining ten round keys (offsets 144 .. 0) are
         // applied and the result is stored to 'to'.
         // NOTE(review): the assert text mentions SSE, which is not a PPC
         // feature; the wording looks carried over from another platform.
2961     assert(UseAES, "need AES instructions and misaligned SSE support");
2962     StubId stub_id = StubId::stubgen_aescrypt_decryptBlock_id;
2963     StubCodeMark mark(this, stub_id);
2964 
2965     address start = __ function_entry();
2966 
2967     Label L_doLast, L_do44, L_do52, L_error;
2968 
2969     Register from           = R3_ARG1;  // source array address
2970     Register to             = R4_ARG2;  // destination array address
2971     Register key            = R5_ARG3;  // round key array
2972 
2973     Register keylen         = R8;
2974     Register temp           = R9;
2975     Register keypos         = R10;
2976     Register fifteen        = R12;
2977 
2978     VectorRegister vRet     = VR0;
2979 
2980     VectorRegister vKey1    = VR1;
2981     VectorRegister vKey2    = VR2;
2982     VectorRegister vKey3    = VR3;
2983     VectorRegister vKey4    = VR4;
2984     VectorRegister vKey5    = VR5;
2985 
2986     VectorRegister fromPerm = VR6;
2987     VectorRegister keyPerm  = VR7;
2988     VectorRegister toPerm   = VR8;
2989     VectorRegister fSplt    = VR9;
2990 
2991     VectorRegister vTmp1    = VR10;
2992     VectorRegister vTmp2    = VR11;
2993     VectorRegister vTmp3    = VR12;  // not used in this stub
2994     VectorRegister vTmp4    = VR13;  // not used in this stub
2995 
2996     __ li              (fifteen, 15);
2997 
2998     // load unaligned from[0-15] to vRet
2999     __ lvx             (vRet, from);
3000     __ lvx             (vTmp1, fifteen, from);
3001     __ lvsl            (fromPerm, from);
3002 #ifdef VM_LITTLE_ENDIAN
3003     __ vspltisb        (fSplt, 0x0f);
3004     __ vxor            (fromPerm, fromPerm, fSplt);
3005 #endif
3006     __ vperm           (vRet, vRet, vTmp1, fromPerm); // align [and byte swap in LE]
3007 
3008     // load keylen (44 or 52 or 60)
         // The displacement is length_offset - base_offset, i.e. 'key' addresses
         // the first int element and the array length is read from the header.
3009     __ lwz             (keylen, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT), key);
3010 
3011     // to load keys
3012     __ load_perm       (keyPerm, key);
3013 #ifdef VM_LITTLE_ENDIAN
         // NOTE(review): this vxor looks redundant - the vspltisb on the next
         // line writes every byte of vTmp2, and the encrypt stub has no such
         // instruction at the same place.
3014     __ vxor            (vTmp2, vTmp2, vTmp2);
3015     __ vspltisb        (vTmp2, -16);
3016     __ vrld            (keyPerm, keyPerm, vTmp2);
3017     __ vrld            (keyPerm, keyPerm, vTmp2);
3018     __ vsldoi          (keyPerm, keyPerm, keyPerm, 8);
3019 #endif
3020 
         // Dispatch on key length; the fall-through case is keylen == 60.
3021     __ cmpwi           (CR0, keylen, 44);
3022     __ beq             (CR0, L_do44);
3023 
3024     __ cmpwi           (CR0, keylen, 52);
3025     __ beq             (CR0, L_do52);
3026 
         // Debug builds only: any other length stops the VM at L_error.
3027 #ifdef ASSERT
3028     __ cmpwi           (CR0, keylen, 60);
3029     __ bne             (CR0, L_error);
3030 #endif
3031 
         // Round-key loading scheme: each lvx fetches a 16-byte chunk of the key
         // array and vec_perm merges it with the chunk fetched just before (the
         // next higher offset) under keyPerm. Every prologue leaves the raw chunk
         // from offset 160 in vTmp1 for the first merge after L_doLast.
3032     // load the 15th round key to vKey1
3033     __ li              (keypos, 240);
3034     __ lvx             (vKey1, keypos, key);
3035     __ li              (keypos, 224);
3036     __ lvx             (vKey2, keypos, key);
3037     __ vec_perm        (vKey1, vKey2, vKey1, keyPerm);
3038 
3039     // load the 14th round key to vKey2
3040     __ li              (keypos, 208);
3041     __ lvx             (vKey3, keypos, key);
3042     __ vec_perm        (vKey2, vKey3, vKey2, keyPerm);
3043 
3044     // load the 13th round key to vKey3
3045     __ li              (keypos, 192);
3046     __ lvx             (vKey4, keypos, key);
3047     __ vec_perm        (vKey3, vKey4, vKey3, keyPerm);
3048 
3049     // load the 12th round key to vKey4
3050     __ li              (keypos, 176);
3051     __ lvx             (vKey5, keypos, key);
3052     __ vec_perm        (vKey4, vKey5, vKey4, keyPerm);
3053 
3054     // load the 11th round key to vKey5
3055     __ li              (keypos, 160);
3056     __ lvx             (vTmp1, keypos, key);
3057     __ vec_perm        (vKey5, vTmp1, vKey5, keyPerm);
3058 
3059     // 1st - 5th rounds
3060     __ vxor            (vRet, vRet, vKey1);
3061     __ vncipher        (vRet, vRet, vKey2);
3062     __ vncipher        (vRet, vRet, vKey3);
3063     __ vncipher        (vRet, vRet, vKey4);
3064     __ vncipher        (vRet, vRet, vKey5);
3065 
3066     __ b               (L_doLast);
3067 
3068     __ align(32);
3069     __ bind            (L_do52);
3070 
3071     // load the 13th round key to vKey1
3072     __ li              (keypos, 208);
3073     __ lvx             (vKey1, keypos, key);
3074     __ li              (keypos, 192);
3075     __ lvx             (vKey2, keypos, key);
3076     __ vec_perm        (vKey1, vKey2, vKey1, keyPerm);
3077 
3078     // load the 12th round key to vKey2
3079     __ li              (keypos, 176);
3080     __ lvx             (vKey3, keypos, key);
3081     __ vec_perm        (vKey2, vKey3, vKey2, keyPerm);
3082 
3083     // load the 11th round key to vKey3
3084     __ li              (keypos, 160);
3085     __ lvx             (vTmp1, keypos, key);
3086     __ vec_perm        (vKey3, vTmp1, vKey3, keyPerm);
3087 
3088     // 1st - 3rd rounds
3089     __ vxor            (vRet, vRet, vKey1);
3090     __ vncipher        (vRet, vRet, vKey2);
3091     __ vncipher        (vRet, vRet, vKey3);
3092 
3093     __ b               (L_doLast);
3094 
3095     __ align(32);
3096     __ bind            (L_do44);
3097 
3098     // load the 11th round key to vKey1
3099     __ li              (keypos, 176);
3100     __ lvx             (vKey1, keypos, key);
3101     __ li              (keypos, 160);
3102     __ lvx             (vTmp1, keypos, key);
3103     __ vec_perm        (vKey1, vTmp1, vKey1, keyPerm);
3104 
3105     // 1st round
3106     __ vxor            (vRet, vRet, vKey1);
3107 
         // Common tail; the L_do44 path falls through into it.
3108     __ bind            (L_doLast);
3109 
3110     // load the 10th round key to vKey1
         // vTmp1 holds the raw chunk from offset 160 on every incoming path.
3111     __ li              (keypos, 144);
3112     __ lvx             (vKey2, keypos, key);
3113     __ vec_perm        (vKey1, vKey2, vTmp1, keyPerm);
3114 
3115     // load the 9th round key to vKey2
3116     __ li              (keypos, 128);
3117     __ lvx             (vKey3, keypos, key);
3118     __ vec_perm        (vKey2, vKey3, vKey2, keyPerm);
3119 
3120     // load the 8th round key to vKey3
3121     __ li              (keypos, 112);
3122     __ lvx             (vKey4, keypos, key);
3123     __ vec_perm        (vKey3, vKey4, vKey3, keyPerm);
3124 
3125     // load the 7th round key to vKey4
3126     __ li              (keypos, 96);
3127     __ lvx             (vKey5, keypos, key);
3128     __ vec_perm        (vKey4, vKey5, vKey4, keyPerm);
3129 
3130     // load the 6th round key to vKey5
3131     __ li              (keypos, 80);
3132     __ lvx             (vTmp1, keypos, key);
3133     __ vec_perm        (vKey5, vTmp1, vKey5, keyPerm);
3134 
3135     // last 10th - 6th rounds
3136     __ vncipher        (vRet, vRet, vKey1);
3137     __ vncipher        (vRet, vRet, vKey2);
3138     __ vncipher        (vRet, vRet, vKey3);
3139     __ vncipher        (vRet, vRet, vKey4);
3140     __ vncipher        (vRet, vRet, vKey5);
3141 
3142     // load the 5th round key to vKey1
         // vTmp1 now holds the raw chunk from offset 80.
3143     __ li              (keypos, 64);
3144     __ lvx             (vKey2, keypos, key);
3145     __ vec_perm        (vKey1, vKey2, vTmp1, keyPerm);
3146 
3147     // load the 4th round key to vKey2
3148     __ li              (keypos, 48);
3149     __ lvx             (vKey3, keypos, key);
3150     __ vec_perm        (vKey2, vKey3, vKey2, keyPerm);
3151 
3152     // load the 3rd round key to vKey3
3153     __ li              (keypos, 32);
3154     __ lvx             (vKey4, keypos, key);
3155     __ vec_perm        (vKey3, vKey4, vKey3, keyPerm);
3156 
3157     // load the 2nd round key to vKey4
3158     __ li              (keypos, 16);
3159     __ lvx             (vKey5, keypos, key);
3160     __ vec_perm        (vKey4, vKey5, vKey4, keyPerm);
3161 
3162     // load the 1st round key to vKey5
3163     __ lvx             (vTmp1, key);
3164     __ vec_perm        (vKey5, vTmp1, vKey5, keyPerm);
3165 
3166     // last 5th - 1st rounds
3167     __ vncipher        (vRet, vRet, vKey1);
3168     __ vncipher        (vRet, vRet, vKey2);
3169     __ vncipher        (vRet, vRet, vKey3);
3170     __ vncipher        (vRet, vRet, vKey4);
3171     __ vncipherlast    (vRet, vRet, vKey5);
3172 
3173 #ifdef VM_LITTLE_ENDIAN
3174     // toPerm = 0x0F0E0D0C0B0A09080706050403020100
         // keypos was last set to 16 above.
3175     __ lvsl            (toPerm, keypos); // keypos is a multiple of 16
3176     __ vxor            (toPerm, toPerm, fSplt);
3177 
3178     // Swap Bytes
3179     __ vperm           (vRet, vRet, vRet, toPerm);
3180 #endif
3181 
3182     // store result (unaligned)
3183     // Note: We can't use a read-modify-write sequence which touches additional Bytes.
         // The result is moved to two GPRs (one doubleword each) and written with
         // two 8-byte stores; the endian macros select which half goes to to + 8.
3184     Register lo = temp, hi = fifteen; // Reuse
3185     __ vsldoi          (vTmp1, vRet, vRet, 8);
3186     __ mfvrd           (hi, vRet);
3187     __ mfvrd           (lo, vTmp1);
3188     __ std             (hi, 0 LITTLE_ENDIAN_ONLY(+ 8), to);
3189     __ std             (lo, 0 BIG_ENDIAN_ONLY(+ 8), to);
3190 
3191     __ blr();
3192 
         // L_error is only branched to, and only bound, in ASSERT builds.
3193 #ifdef ASSERT
3194     __ bind(L_error);
3195     __ stop("aescrypt_decryptBlock: invalid key length");
3196 #endif
3197      return start;
3198   }
3199 
3200   address generate_sha256_implCompress(StubId stub_id) {
         // Emits the SHA-256 compress stub identified by stub_id and returns its
         // entry address. stub_id must be one of the two ids handled below; it
         // selects the single-block or the multi-block (MB) variant. The actual
         // instruction sequence is produced by the sha256() call.
3201     assert(UseSHA, "need SHA instructions");
3202     bool multi_block;
3203     switch (stub_id) {
3204     case StubId::stubgen_sha256_implCompress_id:
3205       multi_block = false;
3206       break;
3207     case StubId::stubgen_sha256_implCompressMB_id:
3208       multi_block = true;
3209       break;
3210     default:
           // Any other id is a caller error; multi_block stays unset on this path.
3211       ShouldNotReachHere();
3212     }
3213     StubCodeMark mark(this, stub_id);
3214     address start = __ function_entry();
3215 
3216     __ sha256 (multi_block);
3217     __ blr();
3218 
3219     return start;
3220   }
3221 
3222   address generate_sha512_implCompress(StubId stub_id) {
         // Emits the SHA-512 compress stub identified by stub_id and returns its
         // entry address. stub_id must be one of the two ids handled below; it
         // selects the single-block or the multi-block (MB) variant. The actual
         // instruction sequence is produced by the sha512() call.
3223     assert(UseSHA, "need SHA instructions");
3224     bool multi_block;
3225     switch (stub_id) {
3226     case StubId::stubgen_sha512_implCompress_id:
3227       multi_block = false;
3228       break;
3229     case StubId::stubgen_sha512_implCompressMB_id:
3230       multi_block = true;
3231       break;
3232     default:
           // Any other id is a caller error; multi_block stays unset on this path.
3233       ShouldNotReachHere();
3234     }
3235     StubCodeMark mark(this, stub_id);
3236     address start = __ function_entry();
3237 
3238     __ sha512 (multi_block);
3239     __ blr();
3240 
3241     return start;
3242   }
3243 
3244   address generate_data_cache_writeback() {
         // Emits the data-cache writeback stub and returns its start address.
         // The stub takes an address in R3_ARG1, issues cache_wb() for it and
         // returns. The start address is taken with pc() here, not
         // function_entry() as in the stubs above.
3245     const Register cacheline = R3_ARG1;
3246     StubId stub_id = StubId::stubgen_data_cache_writeback_id;
3247     StubCodeMark mark(this, stub_id);
3248     address start = __ pc();
3249 
3250     __ cache_wb(Address(cacheline));
3251     __ blr();
3252 
3253     return start;
3254   }
3255 
3256   address generate_data_cache_writeback_sync() {
         // Emits the writeback-sync stub and returns its start address.
         // The stub receives a flag in R3_ARG1: if bit 0 is set (pre sync) it
         // returns without doing anything, otherwise (post sync) it executes
         // cache_wbsync(false) before returning.
3257     const Register is_presync = R3_ARG1;
3258     Register temp = R4;
3259     Label SKIP;
3260     StubId stub_id = StubId::stubgen_data_cache_writeback_sync_id;
3261     StubCodeMark mark(this, stub_id);
3262     address start = __ pc();
3263 
3264     __ andi_(temp, is_presync, 1);   // record form: CR0 reflects (is_presync & 1)
3265     __ bne(CR0, SKIP);               // bit set -> pre sync, nothing to do
3266     __ cache_wbsync(false); // post sync => emit 'sync'
3267     __ bind(SKIP);          // pre sync => emit nothing
3268     __ blr();
3269 
3270     return start;
3271   }
3272 
3273   void generate_arraycopy_stubs() {
3274     // generate the common exit first so later stubs can rely on it if
3275     // they want an UnsafeMemoryAccess exit non-local to the stub
3276     StubRoutines::_unsafecopy_common_exit = generate_unsafecopy_common_error_exit();
3277     // register the stub as the default exit with class UnsafeMemoryAccess
3278     UnsafeMemoryAccess::set_common_exit_stub_pc(StubRoutines::_unsafecopy_common_exit);
3279 
3280     // Note: the disjoint stubs must be generated first, some of the
3281     //       conjoint stubs use them.
3282 
3283     // Note: chaining of stubs does not rely on branching to an
3284     //       auxiliary post-push entry because none of the stubs
3285     //       push/pop a frame.
3286 
3287     // non-aligned disjoint versions
3288     StubRoutines::_jbyte_disjoint_arraycopy       = generate_disjoint_byte_copy(StubId::stubgen_jbyte_disjoint_arraycopy_id);
3289     StubRoutines::_jshort_disjoint_arraycopy      = generate_disjoint_short_copy(StubId::stubgen_jshort_disjoint_arraycopy_id);
3290     StubRoutines::_jint_disjoint_arraycopy        = generate_disjoint_int_copy(StubId::stubgen_jint_disjoint_arraycopy_id);
3291     StubRoutines::_jlong_disjoint_arraycopy       = generate_disjoint_long_copy(StubId::stubgen_jlong_disjoint_arraycopy_id);
3292     StubRoutines::_oop_disjoint_arraycopy         = generate_disjoint_oop_copy(StubId::stubgen_oop_disjoint_arraycopy_id);
3293     StubRoutines::_oop_disjoint_arraycopy_uninit  = generate_disjoint_oop_copy(StubId::stubgen_oop_disjoint_arraycopy_uninit_id);
3294 
3295     // aligned disjoint versions
3296     StubRoutines::_arrayof_jbyte_disjoint_arraycopy      = generate_disjoint_byte_copy(StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id);
3297     StubRoutines::_arrayof_jshort_disjoint_arraycopy     = generate_disjoint_short_copy(StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id);
3298     StubRoutines::_arrayof_jint_disjoint_arraycopy       = generate_disjoint_int_copy(StubId::stubgen_arrayof_jint_disjoint_arraycopy_id);
3299     StubRoutines::_arrayof_jlong_disjoint_arraycopy      = generate_disjoint_long_copy(StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id);
3300     StubRoutines::_arrayof_oop_disjoint_arraycopy        = generate_disjoint_oop_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_id);
3301     StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy(StubId::stubgen_oop_disjoint_arraycopy_uninit_id);
3302 
3303     // non-aligned conjoint versions
3304     StubRoutines::_jbyte_arraycopy      = generate_conjoint_byte_copy(StubId::stubgen_jbyte_arraycopy_id);
3305     StubRoutines::_jshort_arraycopy     = generate_conjoint_short_copy(StubId::stubgen_jshort_arraycopy_id);
3306     StubRoutines::_jint_arraycopy       = generate_conjoint_int_copy(StubId::stubgen_jint_arraycopy_id);
3307     StubRoutines::_jlong_arraycopy      = generate_conjoint_long_copy(StubId::stubgen_jlong_arraycopy_id);
3308     StubRoutines::_oop_arraycopy        = generate_conjoint_oop_copy(StubId::stubgen_oop_arraycopy_id);
3309     StubRoutines::_oop_arraycopy_uninit = generate_conjoint_oop_copy(StubId::stubgen_oop_arraycopy_uninit_id);
3310 
3311     // aligned conjoint versions
3312     StubRoutines::_arrayof_jbyte_arraycopy      = generate_conjoint_byte_copy(StubId::stubgen_arrayof_jbyte_arraycopy_id);
3313     StubRoutines::_arrayof_jshort_arraycopy     = generate_conjoint_short_copy(StubId::stubgen_arrayof_jshort_arraycopy_id);
3314     StubRoutines::_arrayof_jint_arraycopy       = generate_conjoint_int_copy(StubId::stubgen_arrayof_jint_arraycopy_id);
3315     StubRoutines::_arrayof_jlong_arraycopy      = generate_conjoint_long_copy(StubId::stubgen_arrayof_jlong_arraycopy_id);
3316     StubRoutines::_arrayof_oop_arraycopy        = generate_conjoint_oop_copy(StubId::stubgen_arrayof_oop_arraycopy_id);
3317     StubRoutines::_arrayof_oop_arraycopy_uninit = generate_conjoint_oop_copy(StubId::stubgen_arrayof_oop_arraycopy_id);
3318 
3319     // special/generic versions
3320     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_id);
3321     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_uninit_id);
3322 
3323     StubRoutines::_unsafe_arraycopy  = generate_unsafe_copy(STUB_ENTRY(jbyte_arraycopy()),
3324                                                             STUB_ENTRY(jshort_arraycopy()),
3325                                                             STUB_ENTRY(jint_arraycopy()),
3326                                                             STUB_ENTRY(jlong_arraycopy()));
3327     StubRoutines::_generic_arraycopy = generate_generic_copy(STUB_ENTRY(jbyte_arraycopy()),
3328                                                              STUB_ENTRY(jshort_arraycopy()),
3329                                                              STUB_ENTRY(jint_arraycopy()),
3330                                                              STUB_ENTRY(oop_arraycopy()),
3331                                                              STUB_ENTRY(oop_disjoint_arraycopy()),
3332                                                              STUB_ENTRY(jlong_arraycopy()),
3333                                                              STUB_ENTRY(checkcast_arraycopy()));
3334 
3335     // fill routines
3336 #ifdef COMPILER2
3337     if (OptimizeFill) {
3338       StubRoutines::_jbyte_fill          = generate_fill(StubId::stubgen_jbyte_fill_id);
3339       StubRoutines::_jshort_fill         = generate_fill(StubId::stubgen_jshort_fill_id);
3340       StubRoutines::_jint_fill           = generate_fill(StubId::stubgen_jint_fill_id);
3341       StubRoutines::_arrayof_jbyte_fill  = generate_fill(StubId::stubgen_arrayof_jbyte_fill_id);
3342       StubRoutines::_arrayof_jshort_fill = generate_fill(StubId::stubgen_arrayof_jshort_fill_id);
3343       StubRoutines::_arrayof_jint_fill   = generate_fill(StubId::stubgen_arrayof_jint_fill_id);
3344     }
3345     StubRoutines::_unsafe_setmemory = generate_unsafe_setmemory(StubRoutines::_jbyte_fill);
3346 #endif
3347   }
3348 
3349   // Stub for BigInteger::multiplyToLen()
3350   //
3351   //  Arguments:
3352   //
3353   //  Input:
3354   //    R3 - x address
3355   //    R4 - x length
3356   //    R5 - y address
3357   //    R6 - y length
3358   //    R7 - z address
3359   //
3360   address generate_multiplyToLen() {
         // Emits the multiplyToLen stub and returns its entry address.
         //
         // Emitted sequence: zero-extend both length arguments, spill R24..R31
         // below the stack pointer without pushing a frame, emit the
         // multiply_to_len() macro sequence, reload R24..R31, return with blr.
         //
         // NOTE(review): unlike the other generators in this part of the file,
         // no `__ align(CodeEntryAlignment)` is emitted before the StubCodeMark;
         // confirm whether that is intentional.
3361 
3362     StubId stub_id = StubId::stubgen_multiplyToLen_id;
3363     StubCodeMark mark(this, stub_id);
3364 
3365     address start = __ function_entry();
3366 
         // Incoming arguments (see the register list in the header comment).
3367     const Register x     = R3;
3368     const Register xlen  = R4;
3369     const Register y     = R5;
3370     const Register ylen  = R6;
3371     const Register z     = R7;
3372 
         // Scratch registers handed to multiply_to_len().
3373     const Register tmp1  = R2; // TOC not used.
3374     const Register tmp2  = R9;
3375     const Register tmp3  = R10;
3376     const Register tmp4  = R11;
3377     const Register tmp5  = R12;
3378 
3379     // non-volatile regs
         // These are exactly the registers (R24..R31) spilled and reloaded below.
3380     const Register tmp6  = R31;
3381     const Register tmp7  = R30;
3382     const Register tmp8  = R29;
3383     const Register tmp9  = R28;
3384     const Register tmp10 = R27;
3385     const Register tmp11 = R26;
3386     const Register tmp12 = R25;
3387     const Register tmp13 = R24;
3388 
3389     BLOCK_COMMENT("Entry:");
3390 
3391     // C2 does not respect int to long conversion for stub calls.
         // Clear the upper 32 bits of both lengths before they are used.
3392     __ clrldi(xlen, xlen, 32);
3393     __ clrldi(ylen, ylen, 32);
3394 
3395     // Save non-volatile regs (frameless).
         // R24 goes to SP-8, R25 to SP-16, ... R31 to SP-64. The restore
         // sequence further down walks the same offsets in the same order.
         // NOTE(review): this presumably relies on multiply_to_len() neither
         // moving R1_SP nor storing into this area - confirm.
3396     int current_offs = 8;
3397     __ std(R24, -current_offs, R1_SP); current_offs += 8;
3398     __ std(R25, -current_offs, R1_SP); current_offs += 8;
3399     __ std(R26, -current_offs, R1_SP); current_offs += 8;
3400     __ std(R27, -current_offs, R1_SP); current_offs += 8;
3401     __ std(R28, -current_offs, R1_SP); current_offs += 8;
3402     __ std(R29, -current_offs, R1_SP); current_offs += 8;
3403     __ std(R30, -current_offs, R1_SP); current_offs += 8;
3404     __ std(R31, -current_offs, R1_SP);
3405 
         // The multiplication itself is emitted by the MacroAssembler helper.
3406     __ multiply_to_len(x, xlen, y, ylen, z, tmp1, tmp2, tmp3, tmp4, tmp5,
3407                        tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13);
3408 
3409     // Restore non-volatile regs.
3410     current_offs = 8;
3411     __ ld(R24, -current_offs, R1_SP); current_offs += 8;
3412     __ ld(R25, -current_offs, R1_SP); current_offs += 8;
3413     __ ld(R26, -current_offs, R1_SP); current_offs += 8;
3414     __ ld(R27, -current_offs, R1_SP); current_offs += 8;
3415     __ ld(R28, -current_offs, R1_SP); current_offs += 8;
3416     __ ld(R29, -current_offs, R1_SP); current_offs += 8;
3417     __ ld(R30, -current_offs, R1_SP); current_offs += 8;
3418     __ ld(R31, -current_offs, R1_SP);
3419 
3420     __ blr();  // Return to caller.
3421 
         // Entry address obtained from function_entry() above.
3422     return start;
3423   }
3424 
3425   /**
3426   *  Arguments:
3427   *
3428   *  Input:
3429   *   R3_ARG1    - out address
3430   *   R4_ARG2    - in address
3431   *   R5_ARG3    - offset
3432   *   R6_ARG4    - len
3433   *   R7_ARG5    - k
3434   *  Output:
3435   *   R3_RET     - carry
3436   */
3437   address generate_mulAdd() {
         // Emits the mulAdd stub and returns its entry address. The stub
         // normalizes the three int arguments, emits the muladd() macro
         // sequence and hands the resulting carry back in R3_RET.
3438     __ align(CodeEntryAlignment);
3439     StubId stub_id = StubId::stubgen_mulAdd_id;
3440     StubCodeMark mark(this, stub_id);
3441 
3442     address start = __ function_entry();
3443 
3444     // C2 does not sign extend signed parameters to full 64 bits registers:
         // The offset is shifted left by 2 (int index -> byte offset) and has its
         // upper word cleared by the same rldic; len and k only get the upper
         // word cleared.
3445     __ rldic (R5_ARG3, R5_ARG3, 2, 32);  // always positive
3446     __ clrldi(R6_ARG4, R6_ARG4, 32);     // force zero bits on higher word
3447     __ clrldi(R7_ARG5, R7_ARG5, 32);     // force zero bits on higher word
3448 
         // R8 and R9 are passed in the temp positions of muladd(); R10 is passed
         // in the carry position and is what gets returned below.
3449     __ muladd(R3_ARG1, R4_ARG2, R5_ARG3, R6_ARG4, R7_ARG5, R8, R9, R10);
3450 
3451     // Moves output carry to return register
3452     __ mr    (R3_RET,  R10);
3453 
3454     __ blr();
3455 
3456     return start;
3457   }
3458 
3459   /**
3460   *  Arguments:
3461   *
3462   *  Input:
3463   *   R3_ARG1    - in address
3464   *   R4_ARG2    - in length
3465   *   R5_ARG3    - out address
3466   *   R6_ARG4    - out length
3467   */
3468   address generate_squareToLen() {
         // Emits the squareToLen stub and returns its entry address.
         //
         // The emitted code runs in four phases:
         //   1. LOOP_SQUARE:       square every input word and store the 64-bit
         //                         products shifted right by one bit.
         //   2. LOOP_DIAGONAL_SUM: add the off-diagonal products via muladd(),
         //                         followed by an inlined addOne carry ripple.
         //   3. LOOP_LSHIFT:       shift the whole result left by one bit.
         //   4. tail:              OR the lowest bit of the last input word into
         //                         the last output word.
         // The stub returns the `out` address in R3_RET.
3469     __ align(CodeEntryAlignment);
3470     StubId stub_id = StubId::stubgen_squareToLen_id;
3471     StubCodeMark mark(this, stub_id);
3472 
3473     address start = __ function_entry();
3474 
3475     // args - higher word is cleaned (unsignedly) due to int to long casting
3476     const Register in        = R3_ARG1;
3477     const Register in_len    = R4_ARG2;
3478     __ clrldi(in_len, in_len, 32);
3479     const Register out       = R5_ARG3;
3480     const Register out_len   = R6_ARG4;
3481     __ clrldi(out_len, out_len, 32);
3482 
3483     // output
3484     const Register ret       = R3_RET;
3485 
3486     // temporaries
         // R7..R10 are used without being saved; R14..R27 are spilled below
         // before use and reloaded before returning.
3487     const Register lplw_s    = R7;
3488     const Register in_aux    = R8;
3489     const Register out_aux   = R9;
3490     const Register piece     = R10;
3491     const Register product   = R14;
3492     const Register lplw      = R15;
3493     const Register i_minus1  = R16;
3494     const Register carry     = R17;
3495     const Register offset    = R18;
3496     const Register off_aux   = R19;
3497     const Register t         = R20;
3498     const Register mlen      = R21;
3499     const Register len       = R22;
3500     const Register a         = R23;
3501     const Register b         = R24;
3502     const Register i         = R25;
3503     const Register c         = R26;
3504     const Register cs        = R27;
3505 
3506     // Labels
3507     Label SKIP_LSHIFT, SKIP_DIAGONAL_SUM, SKIP_ADDONE, SKIP_LOOP_SQUARE;
3508     Label LOOP_LSHIFT, LOOP_DIAGONAL_SUM, LOOP_ADDONE, LOOP_SQUARE;
3509 
3510     // Save non-volatile regs (frameless).
         // R28 goes to SP-8, R27 to SP-16, ... R14 to SP-120.
         // NOTE(review): R28 is spilled and reloaded although none of the
         // temporaries declared above maps to it - confirm whether muladd()
         // needs it or whether this is just defensive.
3511     int current_offs = -8;
3512     __ std(R28, current_offs, R1_SP); current_offs -= 8;
3513     __ std(R27, current_offs, R1_SP); current_offs -= 8;
3514     __ std(R26, current_offs, R1_SP); current_offs -= 8;
3515     __ std(R25, current_offs, R1_SP); current_offs -= 8;
3516     __ std(R24, current_offs, R1_SP); current_offs -= 8;
3517     __ std(R23, current_offs, R1_SP); current_offs -= 8;
3518     __ std(R22, current_offs, R1_SP); current_offs -= 8;
3519     __ std(R21, current_offs, R1_SP); current_offs -= 8;
3520     __ std(R20, current_offs, R1_SP); current_offs -= 8;
3521     __ std(R19, current_offs, R1_SP); current_offs -= 8;
3522     __ std(R18, current_offs, R1_SP); current_offs -= 8;
3523     __ std(R17, current_offs, R1_SP); current_offs -= 8;
3524     __ std(R16, current_offs, R1_SP); current_offs -= 8;
3525     __ std(R15, current_offs, R1_SP); current_offs -= 8;
3526     __ std(R14, current_offs, R1_SP);
3527 
3528     // Store the squares, right shifted one bit (i.e., divided by 2)
         // out_aux/in_aux start one element before the arrays because the loop
         // uses the update forms lwzu/stdu, which advance the pointer first.
3529     __ subi   (out_aux,   out,       8);
3530     __ subi   (in_aux,    in,        4);
3531     __ cmpwi  (CR0,      in_len,    0);
3532     // Initialize lplw outside of the loop
3533     __ xorr   (lplw,      lplw,      lplw);
3534     __ ble    (CR0,      SKIP_LOOP_SQUARE);    // in_len <= 0
3535     __ mtctr  (in_len);
3536 
         // One iteration per input word. lplw carries the previous product so
         // that its lowest bit can become the top bit of the current, shifted
         // product.
3537     __ bind(LOOP_SQUARE);
3538     __ lwzu   (piece,     4,         in_aux);
3539     __ mulld  (product,   piece,     piece);
3540     // shift left 63 bits and only keep the MSB
3541     __ rldic  (lplw_s,    lplw,      63, 0);
3542     __ mr     (lplw,      product);
3543     // shift right 1 bit without sign extension
3544     __ srdi   (product,   product,   1);
3545     // join them to the same register and store it
3546     __ orr    (product,   lplw_s,    product);
3547 #ifdef VM_LITTLE_ENDIAN
3548     // Swap low and high words for little endian
3549     __ rldicl (product,   product,   32, 0);
3550 #endif
3551     __ stdu   (product,   8,         out_aux);
3552     __ bdnz   (LOOP_SQUARE);
3553 
3554     __ bind(SKIP_LOOP_SQUARE);
3555 
3556     // Add in off-diagonal sums
3557     __ cmpwi  (CR0,      in_len,    0);
3558     __ ble    (CR0,      SKIP_DIAGONAL_SUM);
3559     // Avoid CTR usage here in order to use it at mulAdd
         // i_minus1 counts down from in_len - 1; offset starts at 4 and grows by
         // 8 per iteration (see the bottom of the loop).
3560     __ subi   (i_minus1,  in_len,    1);
3561     __ li     (offset,    4);
3562 
3563     __ bind(LOOP_DIAGONAL_SUM);
3564 
         // off_aux = out_len * 4 - offset
3565     __ sldi   (off_aux,   out_len,   2);
3566     __ sub    (off_aux,   off_aux,   offset);
3567 
         // len = i_minus1 words; t = the input word at byte offset i_minus1 * 4.
3568     __ mr     (len,       i_minus1);
3569     __ sldi   (mlen,      i_minus1,  2);
3570     __ lwzx   (t,         in,        mlen);
3571 
         // Same argument order as in the mulAdd stub: out, in, offset, len, k,
         // two temps, carry.
3572     __ muladd (out, in, off_aux, len, t, a, b, carry);
3573 
3574     // begin<addOne>
3575     // off_aux = out_len*4 - 4 - mlen - offset*4 - 4;
3576     __ addi   (mlen,      mlen,      4);
3577     __ sldi   (a,         out_len,   2);
3578     __ subi   (a,         a,         4);
3579     __ sub    (a,         a,         mlen);
3580     __ subi   (off_aux,   offset,    4);
3581     __ sub    (off_aux,   a,         off_aux);
3582 
         // Add the carry from muladd() into the word at off_aux.
3583     __ lwzx   (b,         off_aux,   out);
3584     __ add    (b,         b,         carry);
3585     __ stwx   (b,         off_aux,   out);
3586 
3587     // if (((uint64_t)s >> 32) != 0) {
3588     __ srdi_  (a,         b,         32);
3589     __ beq    (CR0,      SKIP_ADDONE);
3590 
3591     // while (--mlen >= 0) {
         // mlen and off_aux are byte offsets here, so each step subtracts 4.
3592     __ bind(LOOP_ADDONE);
3593     __ subi   (mlen,      mlen,      4);
3594     __ cmpwi  (CR0,      mlen,      0);
3595     __ beq    (CR0,      SKIP_ADDONE);
3596 
3597     // if (--offset_aux < 0) { // Carry out of number
3598     __ subi   (off_aux,   off_aux,   4);
3599     __ cmpwi  (CR0,      off_aux,   0);
3600     __ blt    (CR0,      SKIP_ADDONE);
3601 
3602     // } else {
         // Increment the next word; keep rippling only while the 32-bit compare
         // of the incremented value against 0 reports equal.
3603     __ lwzx   (b,         off_aux,   out);
3604     __ addi   (b,         b,         1);
3605     __ stwx   (b,         off_aux,   out);
3606     __ cmpwi  (CR0,      b,         0);
3607     __ bne    (CR0,      SKIP_ADDONE);
3608     __ b      (LOOP_ADDONE);
3609 
3610     __ bind(SKIP_ADDONE);
3611     // } } } end<addOne>
3612 
3613     __ addi   (offset,    offset,    8);
3614     __ subi   (i_minus1,  i_minus1,  1);
3615     __ cmpwi  (CR0,      i_minus1,  0);
3616     __ bge    (CR0,      LOOP_DIAGONAL_SUM);
3617 
3618     __ bind(SKIP_DIAGONAL_SUM);
3619 
3620     // Shift back up and set low bit
3621     // Shifts 1 bit left up to len positions. Assumes no leading zeros
3622     // begin<primitiveLeftShift>
3623     __ cmpwi  (CR0,      out_len,   0);
3624     __ ble    (CR0,      SKIP_LSHIFT);
         // i is the byte offset of the word being rewritten, c holds the word
         // that supplies the incoming bit; CTR = out_len - 1 iterations.
3625     __ li     (i,         0);
3626     __ lwz    (c,         0,         out);
3627     __ subi   (b,         out_len,   1);
3628     __ mtctr  (b);
3629 
3630     __ bind(LOOP_LSHIFT);
3631     __ mr     (b,         c);
3632     __ addi   (cs,        i,         4);
3633     __ lwzx   (c,         out,       cs);
3634 
         // out[i] = (out[i] << 1) | (top bit of the following word)
3635     __ sldi   (b,         b,         1);
3636     __ srwi   (cs,        c,         31);
3637     __ orr    (b,         b,         cs);
3638     __ stwx   (b,         i,         out);
3639 
3640     __ addi   (i,         i,         4);
3641     __ bdnz   (LOOP_LSHIFT);
3642 
         // The last word (byte offset out_len * 4 - 4) is shifted without an
         // incoming bit.
3643     __ sldi   (c,         out_len,   2);
3644     __ subi   (c,         c,         4);
3645     __ lwzx   (b,         out,       c);
3646     __ sldi   (b,         b,         1);
3647     __ stwx   (b,         out,       c);
3648 
3649     __ bind(SKIP_LSHIFT);
3650     // end<primitiveLeftShift>
3651 
3652     // Set low bit
         // Last output word |= (last input word & 1).
         // NOTE(review): this tail sits after SKIP_LSHIFT and has no guard of its
         // own, so it is also executed when in_len or out_len is 0, where the
         // computed byte offsets become -4. Presumably the callers guarantee
         // positive lengths - confirm.
3653     __ sldi   (i,         in_len,    2);
3654     __ subi   (i,         i,         4);
3655     __ lwzx   (i,         in,        i);
3656     __ sldi   (c,         out_len,   2);
3657     __ subi   (c,         c,         4);
3658     __ lwzx   (b,         out,       c);
3659 
3660     __ andi   (i,         i,         1);
3661     __ orr    (i,         b,         i);
3662 
3663     __ stwx   (i,         out,       c);
3664 
3665     // Restore non-volatile regs.
         // Same offsets and order as the save sequence at the top.
3666     current_offs = -8;
3667     __ ld(R28, current_offs, R1_SP); current_offs -= 8;
3668     __ ld(R27, current_offs, R1_SP); current_offs -= 8;
3669     __ ld(R26, current_offs, R1_SP); current_offs -= 8;
3670     __ ld(R25, current_offs, R1_SP); current_offs -= 8;
3671     __ ld(R24, current_offs, R1_SP); current_offs -= 8;
3672     __ ld(R23, current_offs, R1_SP); current_offs -= 8;
3673     __ ld(R22, current_offs, R1_SP); current_offs -= 8;
3674     __ ld(R21, current_offs, R1_SP); current_offs -= 8;
3675     __ ld(R20, current_offs, R1_SP); current_offs -= 8;
3676     __ ld(R19, current_offs, R1_SP); current_offs -= 8;
3677     __ ld(R18, current_offs, R1_SP); current_offs -= 8;
3678     __ ld(R17, current_offs, R1_SP); current_offs -= 8;
3679     __ ld(R16, current_offs, R1_SP); current_offs -= 8;
3680     __ ld(R15, current_offs, R1_SP); current_offs -= 8;
3681     __ ld(R14, current_offs, R1_SP);
3682 
         // The stub's return value is the output array address.
3683     __ mr(ret, out);
3684     __ blr();
3685 
3686     return start;
3687   }
3688 
3689   /**
3690    * Arguments:
3691    *
3692    * Inputs:
3693    *   R3_ARG1    - int   crc
3694    *   R4_ARG2    - byte* buf
3695    *   R5_ARG3    - int   length (of buffer)
3696    *
3697    * scratch:
3698    *   R2, R6-R12
3699    *
3700    * Output:
3701    *   R3_RET     - int   crc result
3702    */
3703   // Compute CRC32 function.
3704   address generate_CRC32_updateBytes(StubId stub_id) {
         // Emits either the CRC32 or the CRC32C updateBytes stub, depending on
         // stub_id, and returns the stub's entry address. Any other stub id ends
         // up in ShouldNotReachHere().
3705     bool is_crc32c;
3706     switch (stub_id) {
3707     case StubId::stubgen_updateBytesCRC32_id:
3708       is_crc32c = false;
3709       break;
3710     case StubId::stubgen_updateBytesCRC32C_id:
3711       is_crc32c = true;
3712       break;
3713     default:
           // is_crc32c stays unassigned on this path.
           // NOTE(review): presumably fine because ShouldNotReachHere() does not
           // return - confirm.
3714       ShouldNotReachHere();
3715     }
3716     __ align(CodeEntryAlignment);
3717     StubCodeMark mark(this, stub_id);
3718     address start = __ function_entry();  // Remember stub start address (is rtn value).
         // crc, buf and length are passed in R3_ARG1..R5_ARG3; R2 and R6..R12 are
         // the scratch registers listed in the header comment.
3719     __ crc32(R3_ARG1, R4_ARG2, R5_ARG3, R2, R6, R7, R8, R9, R10, R11, R12, is_crc32c);
3720     __ blr();
3721     return start;
3722   }
3723 
3724   address generate_floatToFloat16() {
         // Emits a stub consisting of the f2hf() macro sequence followed by blr
         // and returns its entry address. Operands: destination R3_RET, source
         // F1_ARG1, plus F0 (presumably a scratch register - confirm against
         // MacroAssembler::f2hf).
         // NOTE(review): this generator names its StubCodeMark with strings,
         // whereas the other generators in this part of the file pass a StubId -
         // confirm whether a StubId should be used here as well.
3725     __ align(CodeEntryAlignment);
3726     StubCodeMark mark(this, "StubRoutines", "floatToFloat16");
3727     address start = __ function_entry();
3728     __ f2hf(R3_RET, F1_ARG1, F0);
3729     __ blr();
3730     return start;
3731   }
3732 
3733   address generate_float16ToFloat() {
         // Emits a stub consisting of the hf2f() macro sequence followed by blr
         // and returns its entry address. Operands: destination F1_RET, source
         // R3_ARG1.
         // NOTE(review): this generator names its StubCodeMark with strings,
         // whereas the other generators in this part of the file pass a StubId -
         // confirm whether a StubId should be used here as well.
3734     __ align(CodeEntryAlignment);
3735     StubCodeMark mark(this, "StubRoutines", "float16ToFloat");
3736     address start = __ function_entry();
3737     __ hf2f(F1_RET, R3_ARG1);
3738     __ blr();
3739     return start;
3740   }
3741 
3742   address generate_method_entry_barrier() {
         // Emits the nmethod entry barrier stub and returns its address.
         //
         // Emitted sequence:
         //   1. save the volatile GPRs and LR, push a frame;
         //   2. call BarrierSetNMethod::nmethod_stub_entry_barrier with the
         //      address of the stack slot that holds the saved LR;
         //   3. undo step 1;
         //   4. if the call returned 0, return to the nmethod prologue;
         //      otherwise pop the prologue's frame, restore LR and jump to the
         //      'wrong method stub'.
3743     __ align(CodeEntryAlignment);
3744     StubId stub_id = StubId::stubgen_method_entry_barrier_id;
3745     StubCodeMark mark(this, stub_id);
3746 
         // Note: the entry is taken with pc(), not function_entry() as in the
         // other generators in this part of the file.
3747     address stub_address = __ pc();
3748 
         // The volatile GPRs are stored at negative offsets from R1_SP; the same
         // nbytes_save is then passed as the size argument of the frame push.
3749     int nbytes_save = MacroAssembler::num_volatile_regs * BytesPerWord;
3750     __ save_volatile_gprs(R1_SP, -nbytes_save, true);
3751 
3752     // Link register points to instruction in prologue of the guarded nmethod.
3753     // As the stub requires one layer of indirection (argument is of type address* and not address),
3754     // passing the link register's value directly doesn't work.
3755     // Since we have to save the link register on the stack anyway, we calculate the corresponding stack address
3756     // and pass that one instead.
3757     __ addi(R3_ARG1, R1_SP, _abi0(lr));
3758 
3759     __ save_LR(R0);
3760     __ push_frame_reg_args(nbytes_save, R0);
3761 
3762     __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSetNMethod::nmethod_stub_entry_barrier));
         // Park the result in R0: R3 is used as a temp by restore_LR below and is
         // presumably overwritten again by restore_volatile_gprs.
3763     __ mr(R0, R3_RET);
3764 
3765     __ pop_frame();
3766     __ restore_LR(R3_RET /* used as tmp register */);
3767     __ restore_volatile_gprs(R1_SP, -nbytes_save, true);
3768 
3769     __ cmpdi(CR0, R0, 0);
3770 
3771     // Return to prologue if no deoptimization is required, i.e. if the barrier
         // returned 0. The bclr below branches to LR when the EQ bit of CR0 is
         // set, so this is a beqlr (not a bnelr).
3772     __ bclr(Assembler::bcondCRbiIs1, Assembler::bi0(CR0, Assembler::equal), Assembler::bhintIsTaken);
3773 
3774     // Deoptimization required.
3775     // For actually handling the deoptimization, the 'wrong method stub' is invoked.
3776     __ load_const_optimized(R0, SharedRuntime::get_handle_wrong_method_stub());
3777     __ mtctr(R0);
3778 
3779     // Pop the frame built in the prologue.
3780     __ pop_frame();
3781 
3782     // Restore link register.  Required as the 'wrong method stub' needs the caller's frame
3783     // to properly deoptimize this method (e.g. by re-resolving the call site for compiled methods).
3784     // This method's prologue is aborted.
3785     __ restore_LR(R0);
3786 
3787     __ bctr();
3788     return stub_address;
3789   }
3790 
3791 #ifdef VM_LITTLE_ENDIAN
3792 // The following Base64 decode intrinsic is based on an algorithm outlined
3793 // in here:
3794 // http://0x80.pl/notesen/2016-01-17-sse-base64-decoding.html
3795 // in the section titled "Vector lookup (pshufb with bitmask)"
3796 //
3797 // This implementation differs in the following ways:
3798 //  * Instead of Intel SSE instructions, Power AltiVec VMX and VSX instructions
3799 //    are used instead.  It turns out that some of the vector operations
3800 //    needed in the algorithm require fewer AltiVec instructions.
3801 //  * The algorithm in the above mentioned paper doesn't handle the
3802 //    Base64-URL variant in RFC 4648.  Adjustments to both the code and to two
3803 //    lookup tables are needed for this.
3804 //  * The "Pack" section of the code is a complete rewrite for Power because we
3805 //    can utilize better instructions for this step.
3806 //
3807 
3808 // Offsets per group of Base64 characters
3809 // Uppercase
3810 #define UC  (signed char)((-'A' + 0) & 0xff)
     // Each offset macro in this group has the form
     //   (first decoded value of the group - ASCII code of the group's first
     //    character) & 0xff
     // with 0, 26, 52, 62 and 63 as the first decoded values.
3811 // Lowercase
3812 #define LC  (signed char)((-'a' + 26) & 0xff)
3813 // Digits
3814 #define DIG (signed char)((-'0' + 52) & 0xff)
3815 // Plus sign (URL = 0)
3816 #define PLS (signed char)((-'+' + 62) & 0xff)
3817 // Hyphen (URL = 1)
3818 #define HYP (signed char)((-'-' + 62) & 0xff)
3819 // Slash (URL = 0)
3820 #define SLS (signed char)((-'/' + 63) & 0xff)
3821 // Underscore (URL = 1)
3822 #define US  (signed char)((-'_' + 63) & 0xff)
3823 
3824 // For P10 (or later) only
     // VB64() ORs the VALID_B64 flag into a table entry. That lets a table
     // entry of 0 stand for "not a Base64 character" even though 0 is also
     // the decoded value of 'A'.
3825 #define VALID_B64 0x80
     // NOTE(review): the parameter is expanded without parentheses. Every use
     // in the tables below passes a plain literal, so this is harmless today;
     // an argument containing an operator of lower precedence than '|' would
     // expand incorrectly.
3826 #define VB64(x) (VALID_B64 | x)
3827 
     // Byte offset of field x inside the constant_block struct that
     // generate_base64_decodeBlock() declares; used as the displacement when
     // loading a table with lxv.
3828 #define BLK_OFFSETOF(x) (offsetof(constant_block, x))
3829 
3830 // In little-endian mode, the lxv instruction loads the element at EA into
3831 // element 15 of the vector register, EA+1 goes into element 14, and so
3832 // on.
3833 //
3834 // To make a look-up table easier to read, ARRAY_TO_LXV_ORDER reverses the
3835 // order of the elements in a vector initialization.
     // It takes exactly 16 arguments and expands to them in reverse order.
3836 #define ARRAY_TO_LXV_ORDER(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15) e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0
3837 
3838   //
3839   // Base64 decodeBlock intrinsic
3840   address generate_base64_decodeBlock() {
3841     __ align(CodeEntryAlignment);
3842     StubId stub_id = StubId::stubgen_base64_decodeBlock_id;
3843     StubCodeMark mark(this, stub_id);
3844     address start   = __ function_entry();
3845 
3846     typedef struct {
3847       signed char offsetLUT_val[16];
3848       signed char offsetLUT_URL_val[16];
3849       unsigned char maskLUT_val[16];
3850       unsigned char maskLUT_URL_val[16];
3851       unsigned char bitposLUT_val[16];
3852       unsigned char table_32_47_val[16];
3853       unsigned char table_32_47_URL_val[16];
3854       unsigned char table_48_63_val[16];
3855       unsigned char table_64_79_val[16];
3856       unsigned char table_80_95_val[16];
3857       unsigned char table_80_95_URL_val[16];
3858       unsigned char table_96_111_val[16];
3859       unsigned char table_112_127_val[16];
3860       unsigned char pack_lshift_val[16];
3861       unsigned char pack_rshift_val[16];
3862       unsigned char pack_permute_val[16];
3863     } constant_block;
3864 
3865     alignas(16) static const constant_block const_block = {
3866 
3867       .offsetLUT_val = {
3868         ARRAY_TO_LXV_ORDER(
3869         0,   0, PLS, DIG,  UC,  UC,  LC,  LC,
3870         0,   0,   0,   0,   0,   0,   0,   0 ) },
3871 
3872       .offsetLUT_URL_val = {
3873         ARRAY_TO_LXV_ORDER(
3874         0,   0, HYP, DIG,  UC,  UC,  LC,  LC,
3875         0,   0,   0,   0,   0,   0,   0,   0 ) },
3876 
3877       .maskLUT_val = {
3878         ARRAY_TO_LXV_ORDER(
3879         /* 0        */ (unsigned char)0b10101000,
3880         /* 1 .. 9   */ (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000,
3881                        (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000,
3882                        (unsigned char)0b11111000,
3883         /* 10       */ (unsigned char)0b11110000,
3884         /* 11       */ (unsigned char)0b01010100,
3885         /* 12 .. 14 */ (unsigned char)0b01010000, (unsigned char)0b01010000, (unsigned char)0b01010000,
3886         /* 15       */ (unsigned char)0b01010100 ) },
3887 
3888       .maskLUT_URL_val = {
3889         ARRAY_TO_LXV_ORDER(
3890         /* 0        */ (unsigned char)0b10101000,
3891         /* 1 .. 9   */ (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000,
3892                        (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000,
3893                        (unsigned char)0b11111000,
3894         /* 10       */ (unsigned char)0b11110000,
3895         /* 11 .. 12 */ (unsigned char)0b01010000, (unsigned char)0b01010000,
3896         /* 13       */ (unsigned char)0b01010100,
3897         /* 14       */ (unsigned char)0b01010000,
3898         /* 15       */ (unsigned char)0b01110000 ) },
3899 
3900       .bitposLUT_val = {
3901         ARRAY_TO_LXV_ORDER(
3902         0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, (unsigned char)0x80,
3903         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 ) },
3904 
3905       // In the following table_*_val constants, a 0 value means the
3906       // character is not in the Base64 character set
3907       .table_32_47_val = {
3908         ARRAY_TO_LXV_ORDER (
3909          /* space .. '*' = 0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* '+' = 62 */ VB64(62), /* ',' .. '.' = 0 */ 0, 0, 0, /* '/' = 63 */ VB64(63) ) },
3910 
3911       .table_32_47_URL_val = {
3912         ARRAY_TO_LXV_ORDER(
3913          /* space .. ',' = 0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* '-' = 62 */ VB64(62), /* '.' .. '/' */ 0, 0 ) },
3914 
3915       .table_48_63_val = {
3916         ARRAY_TO_LXV_ORDER(
3917          /* '0' .. '9' = 52 .. 61 */ VB64(52), VB64(53), VB64(54), VB64(55), VB64(56), VB64(57), VB64(58), VB64(59), VB64(60), VB64(61),
3918          /* ':' .. '?' = 0 */ 0, 0, 0, 0, 0, 0 ) },
3919 
3920       .table_64_79_val = {
3921         ARRAY_TO_LXV_ORDER(
3922          /* '@' = 0 */ 0, /* 'A' .. 'O' = 0 .. 14 */ VB64(0), VB64(1), VB64(2), VB64(3), VB64(4), VB64(5), VB64(6), VB64(7), VB64(8),
3923          VB64(9), VB64(10), VB64(11), VB64(12), VB64(13), VB64(14) ) },
3924 
3925       .table_80_95_val = {
3926         ARRAY_TO_LXV_ORDER(/* 'P' .. 'Z' = 15 .. 25 */ VB64(15), VB64(16), VB64(17), VB64(18), VB64(19), VB64(20), VB64(21), VB64(22),
3927         VB64(23), VB64(24), VB64(25), /* '[' .. '_' = 0 */ 0, 0, 0, 0, 0 ) },
3928 
3929       .table_80_95_URL_val = {
3930         ARRAY_TO_LXV_ORDER(/* 'P' .. 'Z' = 15 .. 25 */ VB64(15), VB64(16), VB64(17), VB64(18), VB64(19), VB64(20), VB64(21), VB64(22),
3931         VB64(23), VB64(24), VB64(25), /* '[' .. '^' = 0 */ 0, 0, 0, 0, /* '_' = 63 */ VB64(63) ) },
3932 
3933       .table_96_111_val = {
3934         ARRAY_TO_LXV_ORDER(/* '`' = 0 */ 0, /* 'a' .. 'o' = 26 .. 40 */ VB64(26), VB64(27), VB64(28), VB64(29), VB64(30), VB64(31),
3935         VB64(32), VB64(33), VB64(34), VB64(35), VB64(36), VB64(37), VB64(38), VB64(39), VB64(40) ) },
3936 
3937       .table_112_127_val = {
3938         ARRAY_TO_LXV_ORDER(/* 'p' .. 'z' = 41 .. 51 */ VB64(41), VB64(42), VB64(43), VB64(44), VB64(45), VB64(46), VB64(47), VB64(48),
3939         VB64(49), VB64(50), VB64(51), /* '{' .. DEL = 0 */ 0, 0, 0, 0, 0 ) },
3940 
3941       .pack_lshift_val = {
3942         ARRAY_TO_LXV_ORDER(
3943         0, 6, 4, 2, 0, 6, 4, 2, 0, 6, 4, 2, 0, 6, 4, 2 ) },
3944 
3945       .pack_rshift_val = {
3946         ARRAY_TO_LXV_ORDER(
3947         0, 2, 4, 0, 0, 2, 4, 0, 0, 2, 4, 0, 0, 2, 4, 0 ) },
3948 
3949       // The first 4 index values are "don't care" because
3950       // we only use the first 12 bytes of the vector,
3951       // which are decoded from 16 bytes of Base64 characters.
3952       .pack_permute_val = {
3953         ARRAY_TO_LXV_ORDER(
3954          0, 0, 0, 0,
3955          0,  1,  2,
3956          4,  5,  6,
3957          8,  9, 10,
3958         12, 13, 14 ) }
3959     };
3960 
3961     const unsigned block_size = 16;  // number of bytes to process in each pass through the loop
3962     const unsigned block_size_shift = 4;
3963 
3964     // According to the ELF V2 ABI, registers r3-r12 are volatile and available for use without save/restore
3965     Register s      = R3_ARG1; // source starting address of Base64 characters
3966     Register sp     = R4_ARG2; // source offset
3967     Register sl     = R5_ARG3; // source length = # of Base64 characters to be processed
3968     Register d      = R6_ARG4; // destination address
3969     Register dp     = R7_ARG5; // destination offset
3970     Register isURL  = R8_ARG6; // boolean, if non-zero indicates use of RFC 4648 base64url encoding
3971     Register isMIME = R9_ARG7; // boolean, if non-zero indicates use of RFC 2045 MIME encoding - not used
3972 
3973     // Local variables
3974     Register const_ptr     = R9;  // used for loading constants
3975     Register tmp_reg       = R10; // used for speeding up load_constant_optimized()
3976 
3977     // Re-use R9 and R10 to avoid using non-volatile registers (requires save/restore)
3978     Register out           = R9;  // moving out (destination) pointer
3979     Register in            = R10; // moving in (source) pointer
3980 
3981     // Volatile VSRS are 0..13, 32..51 (VR0..VR13)
3982     // VR Constants
3983     VectorRegister  vec_0s                  = VR0;
3984     VectorRegister  vec_4s                  = VR1;
3985     VectorRegister  vec_8s                  = VR2;
3986     VectorRegister  vec_special_case_char   = VR3;
3987     VectorRegister  pack_rshift             = VR4;
3988     VectorRegister  pack_lshift             = VR5;
3989 
3990     // VSR Constants
3991     VectorSRegister offsetLUT               = VSR0;
3992     VectorSRegister maskLUT                 = VSR1;
3993     VectorSRegister bitposLUT               = VSR2;
3994     VectorSRegister vec_0xfs                = VSR3;
3995     VectorSRegister vec_special_case_offset = VSR4;
3996     VectorSRegister pack_permute            = VSR5;
3997 
3998     // P10 (or later) VSR lookup constants
3999     VectorSRegister table_32_47             = VSR0;
4000     VectorSRegister table_48_63             = VSR1;
4001     VectorSRegister table_64_79             = VSR2;
4002     VectorSRegister table_80_95             = VSR3;
4003     VectorSRegister table_96_111            = VSR4;
4004     VectorSRegister table_112_127           = VSR6;
4005 
4006     // Data read in and later converted
4007     VectorRegister  input                   = VR6;
4008     // Variable for testing Base64 validity
4009     VectorRegister  non_match               = VR10;
4010 
4011     // P9 VR Variables for lookup
4012     VectorRegister  higher_nibble           = VR7;
4013     VectorRegister  eq_special_case_char    = VR8;
4014     VectorRegister  offsets                 = VR9;
4015 
4016     // P9 VSR lookup variables
4017     VectorSRegister bit                     = VSR6;
4018     VectorSRegister lower_nibble            = VSR7;
4019     VectorSRegister M                       = VSR8;
4020 
4021     // P10 (or later) VSR lookup variables
4022     VectorSRegister  xlate_a                = VSR7;
4023     VectorSRegister  xlate_b                = VSR8;
4024 
4025     // Variables for pack
4026     // VR
4027     VectorRegister  l                       = VR7;  // reuse higher_nibble's register
4028     VectorRegister  r                       = VR8;  // reuse eq_special_case_char's register
4029     VectorRegister  gathered                = VR10; // reuse non_match's register
4030 
4031     Label not_URL, calculate_size, loop_start, loop_exit, return_zero;
4032 
4033     // The upper 32 bits of the non-pointer parameter registers are not
4034     // guaranteed to be zero, so mask off those upper bits.
4035     __ clrldi(sp, sp, 32);
4036     __ clrldi(sl, sl, 32);
4037 
4038     // Don't handle the last 4 characters of the source, because this
4039     // VSX-based algorithm doesn't handle padding characters.  Also the
4040     // vector code will always write 16 bytes of decoded data on each pass,
4041     // but only the first 12 of those 16 bytes are valid data (16 base64
4042     // characters become 12 bytes of binary data), so for this reason we
4043     // need to subtract an additional 8 bytes from the source length, in
4044     // order not to write past the end of the destination buffer.  The
4045     // result of this subtraction implies that a Java function in the
4046     // Base64 class will be used to process the last 12 characters.
4047     __ sub(sl, sl, sp);
4048     __ subi(sl, sl, 12);
4049 
4050     // Load CTR with the number of passes through the loop
4051     // = sl >> block_size_shift.  After the shift, if sl <= 0, there's too
4052     // little data to be processed by this intrinsic.
4053     __ srawi_(sl, sl, block_size_shift);
4054     __ ble(CR0, return_zero);
4055     __ mtctr(sl);
4056 
4057     // Clear the other two parameter registers upper 32 bits.
4058     __ clrldi(isURL, isURL, 32);
4059     __ clrldi(dp, dp, 32);
4060 
4061     // Load constant vec registers that need to be loaded from memory
4062     __ load_const_optimized(const_ptr, (address)&const_block, tmp_reg);
4063     __ lxv(bitposLUT, BLK_OFFSETOF(bitposLUT_val), const_ptr);
4064     __ lxv(pack_rshift->to_vsr(), BLK_OFFSETOF(pack_rshift_val), const_ptr);
4065     __ lxv(pack_lshift->to_vsr(), BLK_OFFSETOF(pack_lshift_val), const_ptr);
4066     __ lxv(pack_permute, BLK_OFFSETOF(pack_permute_val), const_ptr);
4067 
4068     // Splat the constants that can use xxspltib
4069     __ xxspltib(vec_0s->to_vsr(), 0);
4070     __ xxspltib(vec_8s->to_vsr(), 8);
4071     if (PowerArchitecturePPC64 >= 10) {
4072       // Using VALID_B64 for the offsets effectively strips the upper bit
4073       // of each byte that was selected from the table.  Setting the upper
4074       // bit gives us a way to distinguish between the 6-bit value of 0
4075       // from an error code of 0, which will happen if the character is
4076       // outside the range of the lookup, or is an illegal Base64
4077       // character, such as %.
4078       __ xxspltib(offsets->to_vsr(), VALID_B64);
4079 
4080       __ lxv(table_48_63, BLK_OFFSETOF(table_48_63_val), const_ptr);
4081       __ lxv(table_64_79, BLK_OFFSETOF(table_64_79_val), const_ptr);
4082       __ lxv(table_80_95, BLK_OFFSETOF(table_80_95_val), const_ptr);
4083       __ lxv(table_96_111, BLK_OFFSETOF(table_96_111_val), const_ptr);
4084       __ lxv(table_112_127, BLK_OFFSETOF(table_112_127_val), const_ptr);
4085     } else {
4086       __ xxspltib(vec_4s->to_vsr(), 4);
4087       __ xxspltib(vec_0xfs, 0xf);
4088       __ lxv(bitposLUT, BLK_OFFSETOF(bitposLUT_val), const_ptr);
4089     }
4090 
4091     // The rest of the constants use different values depending on the
4092     // setting of isURL
4093     __ cmpwi(CR0, isURL, 0);
4094     __ beq(CR0, not_URL);
4095 
4096     // isURL != 0 (true)
4097     if (PowerArchitecturePPC64 >= 10) {
4098       __ lxv(table_32_47, BLK_OFFSETOF(table_32_47_URL_val), const_ptr);
4099       __ lxv(table_80_95, BLK_OFFSETOF(table_80_95_URL_val), const_ptr);
4100     } else {
4101       __ lxv(offsetLUT, BLK_OFFSETOF(offsetLUT_URL_val), const_ptr);
4102       __ lxv(maskLUT, BLK_OFFSETOF(maskLUT_URL_val), const_ptr);
4103       __ xxspltib(vec_special_case_char->to_vsr(), '_');
4104       __ xxspltib(vec_special_case_offset, (unsigned char)US);
4105     }
4106     __ b(calculate_size);
4107 
4108     // isURL = 0 (false)
4109     __ bind(not_URL);
4110     if (PowerArchitecturePPC64 >= 10) {
4111       __ lxv(table_32_47, BLK_OFFSETOF(table_32_47_val), const_ptr);
4112       __ lxv(table_80_95, BLK_OFFSETOF(table_80_95_val), const_ptr);
4113     } else {
4114       __ lxv(offsetLUT, BLK_OFFSETOF(offsetLUT_val), const_ptr);
4115       __ lxv(maskLUT, BLK_OFFSETOF(maskLUT_val), const_ptr);
4116       __ xxspltib(vec_special_case_char->to_vsr(), '/');
4117       __ xxspltib(vec_special_case_offset, (unsigned char)SLS);
4118     }
4119 
4120     __ bind(calculate_size);
4121 
4122     // out starts at d + dp
4123     __ add(out, d, dp);
4124 
4125     // in starts at s + sp
4126     __ add(in, s, sp);
4127 
4128     __ align(32);
4129     __ bind(loop_start);
4130     __ lxv(input->to_vsr(), 0, in); // offset=0
4131 
4132     //
4133     // Lookup
4134     //
4135     if (PowerArchitecturePPC64 >= 10) {
4136       // Use xxpermx to do a lookup of each Base64 character in the
4137       // input vector and translate it to a 6-bit value + 0x80.
4138       // Characters which are not valid Base64 characters will result
4139       // in a zero in the corresponding byte.
4140       //
4141       // Note that due to align(32) call above, the xxpermx instructions do
4142       // not require align_prefix() calls, since the final xxpermx
4143       // prefix+opcode is at byte 24.
4144       __ xxpermx(xlate_a, table_32_47, table_48_63, input->to_vsr(), 1);    // offset=4
4145       __ xxpermx(xlate_b, table_64_79, table_80_95, input->to_vsr(), 2);    // offset=12
4146       __ xxlor(xlate_b, xlate_a, xlate_b);                                  // offset=20
4147       __ xxpermx(xlate_a, table_96_111, table_112_127, input->to_vsr(), 3); // offset=24
4148       __ xxlor(input->to_vsr(), xlate_a, xlate_b);
4149       // Check for non-Base64 characters by comparing each byte to zero.
4150       __ vcmpequb_(non_match, input, vec_0s);
4151     } else {
4152       // Isolate the upper 4 bits of each character by shifting it right 4 bits
4153       __ vsrb(higher_nibble, input, vec_4s);
4154       // Isolate the lower 4 bits by masking
4155       __ xxland(lower_nibble, input->to_vsr(), vec_0xfs);
4156 
4157       // Get the offset (the value to subtract from the byte) by using
4158       // a lookup table indexed by the upper 4 bits of the character
4159       __ xxperm(offsets->to_vsr(), offsetLUT, higher_nibble->to_vsr());
4160 
4161       // Find out which elements are the special case character (isURL ? '_' : '/')
4162       __ vcmpequb(eq_special_case_char, input, vec_special_case_char);
4163 
4164       // For each character in the input which is a special case
4165       // character, replace its offset with one that is special for that
4166       // character.
4167       __ xxsel(offsets->to_vsr(), offsets->to_vsr(), vec_special_case_offset, eq_special_case_char->to_vsr());
4168 
4169       // Use the lower_nibble to select a mask "M" from the lookup table.
4170       __ xxperm(M, maskLUT, lower_nibble);
4171 
4172       // "bit" is used to isolate which of the bits in M is relevant.
4173       __ xxperm(bit, bitposLUT, higher_nibble->to_vsr());
4174 
4175       // Each element of non_match corresponds to one of the 16 input
4176       // characters.  Those elements that become 0x00 after the xxland
4177       // instruction are invalid Base64 characters.
4178       __ xxland(non_match->to_vsr(), M, bit);
4179 
4180       // Compare each element to zero
4181       //
4182       __ vcmpequb_(non_match, non_match, vec_0s);
4183     }
4184     // vcmpequb_ sets the EQ bit of CR6 if no elements compare equal.
4185     // Any element comparing equal to zero means there is an error in
4186     // that element.  Note that the comparison result register
4187     // non_match is not referenced again.  Only CR6-EQ matters.
4188     __ bne_predict_not_taken(CR6, loop_exit);
4189 
4190     // The Base64 characters had no errors, so add the offsets, which in
4191     // the case of Power10 is a constant vector of all 0x80's (see earlier
4192     // comment where the offsets register is loaded).
4193     __ vaddubm(input, input, offsets);
4194 
4195     // Pack
4196     //
4197     // In the tables below, b0, b1, .. b15 are the bytes of decoded
4198     // binary data, the first line of each of the cells (except for
4199     // the constants) uses the bit-field nomenclature from the
4200     // above-linked paper, whereas the second line is more specific
4201     // about which exact bits are present, and is constructed using the
4202     // Power ISA 3.x document style, where:
4203     //
4204     // * The specifier after the colon depicts which bits are there.
4205     // * The bit numbering is big endian style (bit 0 is the most
4206     //   significant).
4207     // * || is a concatenate operator.
4208     // * Strings of 0's are a field of zeros with the shown length, and
4209     //   likewise for strings of 1's.
4210 
4211     // Note that only e12..e15 are shown here because the shifting
4212     // and OR'ing pattern replicates for e8..e11, e4..7, and
4213     // e0..e3.
4214     //
4215     // +======================+=================+======================+======================+=============+
4216     // |        Vector        |       e12       |         e13          |         e14          |     e15     |
4217     // |       Element        |                 |                      |                      |             |
4218     // +======================+=================+======================+======================+=============+
4219     // |    after vaddubm     |    00dddddd     |       00cccccc       |       00bbbbbb       |  00aaaaaa   |
4220     // |                      |   00||b2:2..7   | 00||b1:4..7||b2:0..1 | 00||b0:6..7||b1:0..3 | 00||b0:0..5 |
4221     // +----------------------+-----------------+----------------------+----------------------+-------------+
4222     // |     pack_lshift      |                 |         << 6         |         << 4         |    << 2     |
4223     // +----------------------+-----------------+----------------------+----------------------+-------------+
4224     // |     l after vslb     |    00dddddd     |       cc000000       |       bbbb0000       |  aaaaaa00   |
4225     // |                      |   00||b2:2..7   |   b2:0..1||000000    |    b1:0..3||0000     | b0:0..5||00 |
4226     // +----------------------+-----------------+----------------------+----------------------+-------------+
4227     // |     l after vslo     |    cc000000     |       bbbb0000       |       aaaaaa00       |  00000000   |
4228     // |                      | b2:0..1||000000 |    b1:0..3||0000     |     b0:0..5||00      |  00000000   |
4229     // +----------------------+-----------------+----------------------+----------------------+-------------+
4230     // |     pack_rshift      |                 |         >> 2         |         >> 4         |             |
4231     // +----------------------+-----------------+----------------------+----------------------+-------------+
4232     // |     r after vsrb     |    00dddddd     |       0000cccc       |       000000bb       |  00aaaaaa   |
4233     // |                      |   00||b2:2..7   |    0000||b1:4..7     |   000000||b0:6..7    | 00||b0:0..5 |
4234     // +----------------------+-----------------+----------------------+----------------------+-------------+
4235     // | gathered after xxlor |    ccdddddd     |       bbbbcccc       |       aaaaaabb       |  00aaaaaa   |
4236     // |                      |     b2:0..7     |       b1:0..7        |       b0:0..7        | 00||b0:0..5 |
4237     // +======================+=================+======================+======================+=============+
4238     //
4239     // Note: there is a typo in the above-linked paper that shows the result of the gathering process is:
4240     // [ddddddcc|bbbbcccc|aaaaaabb]
4241     // but should be:
4242     // [ccdddddd|bbbbcccc|aaaaaabb]
4243     //
4244     __ vslb(l, input, pack_lshift);
4245     // vslo of vec_8s shifts the vector by one octet toward lower
4246     // element numbers, discarding element 0.  This means it actually
4247     // shifts to the right (not left) according to the order of the
4248     // table above.
4249     __ vslo(l, l, vec_8s);
4250     __ vsrb(r, input, pack_rshift);
4251     __ xxlor(gathered->to_vsr(), l->to_vsr(), r->to_vsr());
4252 
4253     // Final rearrangement of bytes into their correct positions.
4254     // +==============+======+======+======+======+=====+=====+====+====+====+====+=====+=====+=====+=====+=====+=====+
4255     // |    Vector    |  e0  |  e1  |  e2  |  e3  | e4  | e5  | e6 | e7 | e8 | e9 | e10 | e11 | e12 | e13 | e14 | e15 |
4256     // |   Elements   |      |      |      |      |     |     |    |    |    |    |     |     |     |     |     |     |
4257     // +==============+======+======+======+======+=====+=====+====+====+====+====+=====+=====+=====+=====+=====+=====+
4258     // | after xxlor  | b11  | b10  |  b9  |  xx  | b8  | b7  | b6 | xx | b5 | b4 | b3  | xx  | b2  | b1  | b0  | xx  |
4259     // +--------------+------+------+------+------+-----+-----+----+----+----+----+-----+-----+-----+-----+-----+-----+
4260     // | pack_permute |  0   |  0   |  0   |  0   |  0  |  1  | 2  | 4  | 5  | 6  |  8  |  9  | 10  | 12  | 13  | 14  |
4261     // +--------------+------+------+------+------+-----+-----+----+----+----+----+-----+-----+-----+-----+-----+-----+
4262     // | after xxperm | b11* | b11* | b11* | b11* | b11 | b10 | b9 | b8 | b7 | b6 | b5  | b4  | b3  | b2  | b1  | b0  |
4263     // +==============+======+======+======+======+=====+=====+====+====+====+====+=====+=====+=====+=====+=====+=====+
4264     // xx bytes are not used to form the final data
4265     // b0..b15 are the decoded and reassembled 8-bit bytes of data
4266     // b11 with asterisk is a "don't care", because these bytes will be
4267     // overwritten on the next iteration.
4268     __ xxperm(gathered->to_vsr(), gathered->to_vsr(), pack_permute);
4269 
4270     // We cannot use a static displacement on the store, since it's a
4271     // multiple of 12, not 16.  Note that this stxv instruction actually
4272     // writes 16 bytes, even though only the first 12 are valid data.
4273     __ stxv(gathered->to_vsr(), 0, out);
4274     __ addi(out, out, 12);
4275     __ addi(in, in, 16);
4276     __ bdnz(loop_start);
4277 
4278     __ bind(loop_exit);
4279 
4280     // Return the number of out bytes produced, which is (out - (d + dp)) == out - d - dp;
4281     __ sub(R3_RET, out, d);
4282     __ sub(R3_RET, R3_RET, dp);
4283 
4284     __ blr();
4285 
4286     __ bind(return_zero);
4287     __ li(R3_RET, 0);
4288     __ blr();
4289 
4290     return start;
4291   }
4292 
// Remove the short helper macro names now that the Base64 decode stub has
// been generated, so they do not leak into the code that follows.  US and
// SLS, for example, are what the decode stub splats into
// vec_special_case_offset.  (The matching #defines are presumably earlier in
// this file; they are not visible from here.)
4293 #undef UC
4294 #undef LC
4295 #undef DIG
4296 #undef PLS
4297 #undef HYP
4298 #undef SLS
4299 #undef US
4300 
4301 // This algorithm is based on the methods described in this paper:
4302 // http://0x80.pl/notesen/2016-01-12-sse-base64-encoding.html
4303 //
4304 // The details of this implementation vary from the paper due to the
4305 // difference in the ISA between SSE and AltiVec, especially in the
4306 // splitting bytes section where there is no need on Power to mask after
4307 // the shift because the shift is byte-wise rather than across an entire
4308 // 128-bit word.
4309 //
4310 // For the lookup part of the algorithm, different logic is used than
4311 // described in the paper because of the availability of vperm, which can
4312 // do a 64-byte table lookup in four instructions, while preserving the
4313 // branchless nature.
4314 //
4315 // Description of the ENCODE_CORE macro
4316 //
4317 // Expand first 12 x 8-bit data bytes into 16 x 6-bit bytes (upper 2
4318 // bits of each byte are zeros)
4319 //
4320 // (Note: e7..e0 are not shown because they follow the same pattern as
4321 // e8..e15)
4322 //
4323 // In the table below, b0, b1, .. b15 are the bytes of unencoded
4324 // binary data, the first line of each of the cells (except for
4325 // the constants) uses the bit-field nomenclature from the
4326 // above-linked paper, whereas the second line is more specific
4327 // about which exact bits are present, and is constructed using the
4328 // Power ISA 3.x document style, where:
4329 //
4330 // * The specifier after the colon depicts which bits are there.
4331 // * The bit numbering is big endian style (bit 0 is the most
4332 //   significant).
4333 // * || is a concatenate operator.
4334 // * Strings of 0's are a field of zeros with the shown length, and
4335 //   likewise for strings of 1's.
4336 //
4337 // +==========================+=============+======================+======================+=============+=============+======================+======================+=============+
4338 // |          Vector          |     e8      |          e9          |         e10          |     e11     |     e12     |         e13          |         e14          |     e15     |
4339 // |         Element          |             |                      |                      |             |             |                      |                      |             |
4340 // +==========================+=============+======================+======================+=============+=============+======================+======================+=============+
4341 // |        after lxv         |  jjjjkkkk   |       iiiiiijj       |       gghhhhhh       |  ffffgggg   |  eeeeeeff   |       ccdddddd       |       bbbbcccc       |  aaaaaabb   |
4342 // |                          |     b7      |          b6          |          b5          |     b4      |     b3      |          b2          |          b1          |     b0      |
4343 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
4344 // |      xxperm indexes      |      0      |          10          |          11          |     12      |      0      |          13          |          14          |     15      |
4345 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
4346 // |     (1) after xxperm     |             |       gghhhhhh       |       ffffgggg       |  eeeeeeff   |             |       ccdddddd       |       bbbbcccc       |  aaaaaabb   |
4347 // |                          |    (b15)    |          b5          |          b4          |     b3      |    (b15)    |          b2          |          b1          |     b0      |
4348 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
4349 // |      rshift_amount       |      0      |          6           |          4           |      2      |      0      |          6           |          4           |      2      |
4350 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
4351 // |        after vsrb        |             |       000000gg       |       0000ffff       |  00eeeeee   |             |       000000cc       |       0000bbbb       |  00aaaaaa   |
4352 // |                          |    (b15)    |   000000||b5:0..1    |    0000||b4:0..3     | 00||b3:0..5 |    (b15)    |   000000||b2:0..1    |    0000||b1:0..3     | 00||b0:0..5 |
4353 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
4354 // |       rshift_mask        |  00000000   |      000000||11      |      0000||1111      | 00||111111  |  00000000   |      000000||11      |      0000||1111      | 00||111111  |
4355 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
4356 // |    rshift after vand     |  00000000   |       000000gg       |       0000ffff       |  00eeeeee   |  00000000   |       000000cc       |       0000bbbb       |  00aaaaaa   |
4357 // |                          |  00000000   |   000000||b5:0..1    |    0000||b4:0..3     | 00||b3:0..5 |  00000000   |   000000||b2:0..1    |    0000||b1:0..3     | 00||b0:0..5 |
4358 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
4359 // |    1 octet lshift (1)    |  gghhhhhh   |       ffffgggg       |       eeeeeeff       |             |  ccdddddd   |       bbbbcccc       |       aaaaaabb       |  00000000   |
4360 // |                          |     b5      |          b4          |          b3          |    (b15)    |     b2      |          b1          |          b0          |  00000000   |
4361 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
4362 // |      lshift_amount       |      0      |          2           |          4           |      0      |      0      |          2           |          4           |      0      |
4363 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
4364 // |        after vslb        |  gghhhhhh   |       ffgggg00       |       eeff0000       |             |  ccdddddd   |       bbcccc00       |       aabb0000       |  00000000   |
4365 // |                          |     b5      |     b4:2..7||00      |    b3:4..7||0000     |    (b15)    |   b2:0..7   |     b1:2..7||00      |    b0:4..7||0000     |  00000000   |
4366 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
4367 // |       lshift_mask        | 00||111111  |     00||1111||00     |     00||11||0000     |  00000000   | 00||111111  |     00||1111||00     |     00||11||0000     |  00000000   |
4368 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
4369 // |    lshift after vand     |  00hhhhhh   |       00gggg00       |       00ff0000       |  00000000   |  00dddddd   |       00cccc00       |       00bb0000       |  00000000   |
4370 // |                          | 00||b5:2..7 |   00||b4:4..7||00    |  00||b3:6..7||0000   |  00000000   | 00||b2:2..7 |   00||b1:4..7||00    |  00||b0:6..7||0000   |  00000000   |
4371 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
4372 // | after vor lshift, rshift |  00hhhhhh   |       00gggggg       |       00ffffff       |  00eeeeee   |  00dddddd   |       00cccccc       |       00bbbbbb       |  00aaaaaa   |
4373 // |                          | 00||b5:2..7 | 00||b4:4..7||b5:0..1 | 00||b3:6..7||b4:0..3 | 00||b3:0..5 | 00||b2:2..7 | 00||b1:4..7||b2:0..1 | 00||b0:6..7||b1:0..3 | 00||b0:0..5 |
4374 // +==========================+=============+======================+======================+=============+=============+======================+======================+=============+
4375 //
4376 // Expand the first 12 bytes into 16 bytes, leaving every 4th byte
4377 // blank for now.
4378 // __ xxperm(input->to_vsr(), input->to_vsr(), expand_permute);
4379 //
4380 // Generate two bit-shifted pieces - rshift and lshift - that will
4381 // later be OR'd together.
4382 //
4383 // First the right-shifted piece
4384 // __ vsrb(rshift, input, expand_rshift);
4385 // __ vand(rshift, rshift, expand_rshift_mask);
4386 //
4387 // Now the left-shifted piece, which is done by octet shifting
4388 // the input one byte to the left, then doing a variable shift,
4389 // followed by a mask operation.
4390 //
4391 // __ vslo(lshift, input, vec_8s);
4392 // __ vslb(lshift, lshift, expand_lshift);
4393 // __ vand(lshift, lshift, expand_lshift_mask);
4394 //
4395 // Combine the two pieces by OR'ing
4396 // __ vor(expanded, rshift, lshift);
4397 //
4398 // At this point, expanded is a vector containing a 6-bit value in each
4399 // byte.  These values are used as indexes into a 64-byte lookup table that
4400 // is contained in four vector registers.  The lookup operation is done
4401 // using vperm instructions with the same indexes for the lower 32 and
4402 // upper 32 bytes.  To figure out which of the two looked-up bytes to use
4403 // at each location, all values in expanded are compared to 31.  Using
4404 // vsel, values higher than 31 use the results from the upper 32 bytes of
4405 // the lookup operation, while values less than or equal to 31 use the
4406 // lower 32 bytes of the lookup operation.
4407 //
4408 // Note: it's tempting to use a xxpermx,xxpermx,vor sequence here on
4409 // Power10 (or later), but experiments doing so on Power10 yielded a slight
4410 // performance drop, perhaps due to the need for xxpermx instruction
4411 // prefixes.
4412 
// ENCODE_CORE emits the register-only part of one Base64 encode step: it
// turns the 12 data bytes held in `input` into 16 Base64 characters left in
// `expanded`.  It performs no loads or stores itself.
//
// This is a textual macro, so everything it names must already be in scope
// where it is expanded: the `__` assembler shorthand plus the registers
//   input, rshift, lshift, expanded, encoded_00_31, encoded_32_63, gt_31,
//   expand_permute, expand_rshift, expand_rshift_mask, expand_lshift,
//   expand_lshift_mask, vec_8s, vec_31s and vec_base64_{00_15,16_31,32_47,48_63}.
//
// Registers written: input (by the xxperm), rshift, lshift, expanded,
// encoded_00_31, encoded_32_63 and gt_31.  In generate_base64_encodeBlock the
// last three are declared on the same vector registers as input, rshift and
// lshift, so those values do not survive the lookup phase.
//
// Sequence:
//   xxperm            - spread the 12 bytes over 16 lanes (expand_permute)
//   vsrb/vand         - right-shifted piece, masked
//   vslo/vslb/vand    - one-octet shift, per-byte left shift, masked
//   vor               - combine into one 6-bit index per byte (`expanded`)
//   vperm x2          - look the index up in both 32-byte table halves
//   vcmpgtub/vsel     - indexes > 31 take the upper-half result, others the
//                       lower-half result
4413 #define ENCODE_CORE                                                        \
4414     __ xxperm(input->to_vsr(), input->to_vsr(), expand_permute);           \
4415     __ vsrb(rshift, input, expand_rshift);                                 \
4416     __ vand(rshift, rshift, expand_rshift_mask);                           \
4417     __ vslo(lshift, input, vec_8s);                                        \
4418     __ vslb(lshift, lshift, expand_lshift);                                \
4419     __ vand(lshift, lshift, expand_lshift_mask);                           \
4420     __ vor(expanded, rshift, lshift);                                      \
4421     __ vperm(encoded_00_31, vec_base64_00_15, vec_base64_16_31, expanded); \
4422     __ vperm(encoded_32_63, vec_base64_32_47, vec_base64_48_63, expanded); \
4423     __ vcmpgtub(gt_31, expanded, vec_31s);                                 \
4424     __ vsel(expanded, encoded_00_31, encoded_32_63, gt_31);
4425 
4426 // Intrinsic function prototype in Base64.java:
4427 // private void encodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL) {
4428 
4429   address generate_base64_encodeBlock() {
4430     __ align(CodeEntryAlignment);
4431     StubId stub_id = StubId::stubgen_base64_encodeBlock_id;
4432     StubCodeMark mark(this, stub_id);
4433     address start   = __ function_entry();
4434 
4435     typedef struct {
4436       unsigned char expand_permute_val[16];
4437       unsigned char expand_rshift_val[16];
4438       unsigned char expand_rshift_mask_val[16];
4439       unsigned char expand_lshift_val[16];
4440       unsigned char expand_lshift_mask_val[16];
4441       unsigned char base64_00_15_val[16];
4442       unsigned char base64_16_31_val[16];
4443       unsigned char base64_32_47_val[16];
4444       unsigned char base64_48_63_val[16];
4445       unsigned char base64_48_63_URL_val[16];
4446     } constant_block;
4447 
4448     alignas(16) static const constant_block const_block = {
4449       .expand_permute_val = {
4450         ARRAY_TO_LXV_ORDER(
4451         0,  4,  5,  6,
4452         0,  7,  8,  9,
4453         0, 10, 11, 12,
4454         0, 13, 14, 15 ) },
4455 
4456       .expand_rshift_val = {
4457         ARRAY_TO_LXV_ORDER(
4458         0, 6, 4, 2,
4459         0, 6, 4, 2,
4460         0, 6, 4, 2,
4461         0, 6, 4, 2 ) },
4462 
4463       .expand_rshift_mask_val = {
4464         ARRAY_TO_LXV_ORDER(
4465         0b00000000, 0b00000011, 0b00001111, 0b00111111,
4466         0b00000000, 0b00000011, 0b00001111, 0b00111111,
4467         0b00000000, 0b00000011, 0b00001111, 0b00111111,
4468         0b00000000, 0b00000011, 0b00001111, 0b00111111 ) },
4469 
4470       .expand_lshift_val = {
4471         ARRAY_TO_LXV_ORDER(
4472         0, 2, 4, 0,
4473         0, 2, 4, 0,
4474         0, 2, 4, 0,
4475         0, 2, 4, 0 ) },
4476 
4477       .expand_lshift_mask_val = {
4478         ARRAY_TO_LXV_ORDER(
4479         0b00111111, 0b00111100, 0b00110000, 0b00000000,
4480         0b00111111, 0b00111100, 0b00110000, 0b00000000,
4481         0b00111111, 0b00111100, 0b00110000, 0b00000000,
4482         0b00111111, 0b00111100, 0b00110000, 0b00000000 ) },
4483 
4484       .base64_00_15_val = {
4485         ARRAY_TO_LXV_ORDER(
4486         'A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P' ) },
4487 
4488       .base64_16_31_val = {
4489         ARRAY_TO_LXV_ORDER(
4490         'Q','R','S','T','U','V','W','X','Y','Z','a','b','c','d','e','f' ) },
4491 
4492       .base64_32_47_val = {
4493         ARRAY_TO_LXV_ORDER(
4494         'g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v' ) },
4495 
4496       .base64_48_63_val = {
4497         ARRAY_TO_LXV_ORDER(
4498         'w','x','y','z','0','1','2','3','4','5','6','7','8','9','+','/' ) },
4499 
4500       .base64_48_63_URL_val = {
4501         ARRAY_TO_LXV_ORDER(
4502         'w','x','y','z','0','1','2','3','4','5','6','7','8','9','-','_' ) }
4503     };
4504 
4505     // Number of bytes to process in each pass through the main loop.
4506     // 12 of the 16 bytes from each lxv are encoded to 16 Base64 bytes.
4507     const unsigned block_size = 12;
4508 
4509     // According to the ELF V2 ABI, registers r3-r12 are volatile and available for use without save/restore
4510     Register src       = R3_ARG1; // source starting address of Base64 characters
4511     Register sp        = R4_ARG2; // source starting position
4512     Register sl        = R5_ARG3; // total source length of the Base64 characters to be processed
4513     Register dst       = R6_ARG4; // destination address
4514     Register dp        = R7_ARG5; // destination starting position
4515     Register isURL     = R8_ARG6; // boolean, if non-zero indicates use of RFC 4648 base64url encoding
4516 
4517     // Local variables
4518     Register const_ptr     = R12; // used for loading constants (reuses isURL's register)
4519     Register tmp_reg       = R9;  // used for speeding up load_constant()
4520 
4521     Register size           = R9;  // number of bytes to process (reuses tmp_reg's register)
4522     Register blocked_size   = R10; // number of bytes to process a block at a time
4523     Register block_modulo   = R12; // == block_size (reuse const_ptr)
4524     Register remaining      = R12; // bytes remaining to process after the blocks are completed (reuse block_modulo's reg)
4525     Register in             = R4;  // current input (source) pointer (reuse sp's register)
4526     Register num_blocks     = R11; // number of blocks to be processed by the loop
4527     Register out            = R8;  // current output (destination) pointer (reuse const_ptr's register)
4528     Register three          = R9;  // constant divisor (reuse size's register)
4529     Register bytes_to_write = R10; // number of bytes to write with the stxvl instr (reused blocked_size's register)
4530     Register tmp1           = R7;  // temp register for lxvl length (reuse dp's register)
4531     Register modulo_chars   = R7;  // number of bytes written during the final write % 4 (reuse tmp1's register)
4532     Register pad_char       = R6;  // literal '=' (reuse dst's register)
4533 
4534     // Volatile VSRS are 0..13, 32..51 (VR0..VR13)
4535     // VR Constants
4536     VectorRegister  vec_8s             = VR0;
4537     VectorRegister  vec_31s            = VR1;
4538     VectorRegister  vec_base64_00_15   = VR2;
4539     VectorRegister  vec_base64_16_31   = VR3;
4540     VectorRegister  vec_base64_32_47   = VR4;
4541     VectorRegister  vec_base64_48_63   = VR5;
4542     VectorRegister  expand_rshift      = VR6;
4543     VectorRegister  expand_rshift_mask = VR7;
4544     VectorRegister  expand_lshift      = VR8;
4545     VectorRegister  expand_lshift_mask = VR9;
4546 
4547     // VR variables for expand
4548     VectorRegister  input              = VR10;
4549     VectorRegister  rshift             = VR11;
4550     VectorRegister  lshift             = VR12;
4551     VectorRegister  expanded           = VR13;
4552 
4553     // VR variables for lookup
4554     VectorRegister  encoded_00_31      = VR10; // (reuse input)
4555     VectorRegister  encoded_32_63      = VR11; // (reuse rshift)
4556     VectorRegister  gt_31              = VR12; // (reuse lshift)
4557 
4558     // VSR Constants
4559     VectorSRegister expand_permute     = VSR0;
4560 
4561     Label not_URL, calculate_size, calculate_blocked_size, skip_loop;
4562     Label loop_start, le_16_to_write, no_pad, one_pad_char;
4563 
4564     // The upper 32 bits of the non-pointer parameter registers are not
4565     // guaranteed to be zero, so mask off those upper bits.
4566     __ clrldi(sp, sp, 32);
4567     __ clrldi(sl, sl, 32);
4568     __ clrldi(dp, dp, 32);
4569     __ clrldi(isURL, isURL, 32);
4570 
4571     // load up the constants
4572     __ load_const_optimized(const_ptr, (address)&const_block, tmp_reg);
4573     __ lxv(expand_permute,               BLK_OFFSETOF(expand_permute_val),     const_ptr);
4574     __ lxv(expand_rshift->to_vsr(),      BLK_OFFSETOF(expand_rshift_val),      const_ptr);
4575     __ lxv(expand_rshift_mask->to_vsr(), BLK_OFFSETOF(expand_rshift_mask_val), const_ptr);
4576     __ lxv(expand_lshift->to_vsr(),      BLK_OFFSETOF(expand_lshift_val),      const_ptr);
4577     __ lxv(expand_lshift_mask->to_vsr(), BLK_OFFSETOF(expand_lshift_mask_val), const_ptr);
4578     __ lxv(vec_base64_00_15->to_vsr(),   BLK_OFFSETOF(base64_00_15_val),       const_ptr);
4579     __ lxv(vec_base64_16_31->to_vsr(),   BLK_OFFSETOF(base64_16_31_val),       const_ptr);
4580     __ lxv(vec_base64_32_47->to_vsr(),   BLK_OFFSETOF(base64_32_47_val),       const_ptr);
4581 
4582     // Splat the constants that can use xxspltib
4583     __ xxspltib(vec_8s->to_vsr(), 8);
4584     __ xxspltib(vec_31s->to_vsr(), 31);
4585 
4586 
4587     // Use a different translation lookup table depending on the
4588     // setting of isURL
4589     __ cmpdi(CR0, isURL, 0);
4590     __ beq(CR0, not_URL);
4591     __ lxv(vec_base64_48_63->to_vsr(), BLK_OFFSETOF(base64_48_63_URL_val), const_ptr);
4592     __ b(calculate_size);
4593 
4594     __ bind(not_URL);
4595     __ lxv(vec_base64_48_63->to_vsr(), BLK_OFFSETOF(base64_48_63_val), const_ptr);
4596 
4597     __ bind(calculate_size);
4598 
4599     // size = sl - sp - 4 (*)
4600     // (*) Don't process the last four bytes in the main loop because
4601     // we don't want the lxv instruction to read past the end of the src
4602     // data, in case those four bytes are on the start of an unmapped or
4603     // otherwise inaccessible page.
4604     //
4605     __ sub(size, sl, sp);
4606     __ subi(size, size, 4);
4607     __ cmpdi(CR7, size, block_size);
4608     __ bgt(CR7, calculate_blocked_size);
4609     __ mr(remaining, size);
4610     // Add the 4 back into remaining again
4611     __ addi(remaining, remaining, 4);
4612     // make "in" point to the beginning of the source data: in = src + sp
4613     __ add(in, src, sp);
4614     // out = dst + dp
4615     __ add(out, dst, dp);
4616     __ b(skip_loop);
4617 
4618     __ bind(calculate_blocked_size);
4619     __ li(block_modulo, block_size);
4620     // num_blocks = size / block_modulo
4621     __ divwu(num_blocks, size, block_modulo);
4622     // blocked_size = num_blocks * block_modulo
4623     __ mullw(blocked_size, num_blocks, block_modulo);
4624     // remaining = size - blocked_size
4625     __ sub(remaining, size, blocked_size);
4626     __ mtctr(num_blocks);
4627 
4628     // Add the 4 back in to remaining again
4629     __ addi(remaining, remaining, 4);
4630 
4631     // make "in" point to the beginning of the source data: in = src + sp
4632     __ add(in, src, sp);
4633 
4634     // out = dst + dp
4635     __ add(out, dst, dp);
4636 
4637     __ align(32);
4638     __ bind(loop_start);
4639 
4640     __ lxv(input->to_vsr(), 0, in);
4641 
4642     ENCODE_CORE
4643 
4644     __ stxv(expanded->to_vsr(), 0, out);
4645     __ addi(in, in, 12);
4646     __ addi(out, out, 16);
4647     __ bdnz(loop_start);
4648 
4649     __ bind(skip_loop);
4650 
4651     // When there are less than 16 bytes left, we need to be careful not to
4652     // read beyond the end of the src buffer, which might be in an unmapped
4653     // page.
4654     // Load the remaining bytes using lxvl.
4655     __ rldicr(tmp1, remaining, 56, 7);
4656     __ lxvl(input->to_vsr(), in, tmp1);
4657 
4658     ENCODE_CORE
4659 
4660     // bytes_to_write = ((remaining * 4) + 2) / 3
4661     __ li(three, 3);
4662     __ rlwinm(bytes_to_write, remaining, 2, 0, 29); // remaining * 4
4663     __ addi(bytes_to_write, bytes_to_write, 2);
4664     __ divwu(bytes_to_write, bytes_to_write, three);
4665 
4666     __ cmpwi(CR7, bytes_to_write, 16);
4667     __ ble_predict_taken(CR7, le_16_to_write);
4668     __ stxv(expanded->to_vsr(), 0, out);
4669 
4670     // We've processed 12 of the 13-15 data bytes, so advance the pointers,
4671     // and do one final pass for the remaining 1-3 bytes.
4672     __ addi(in, in, 12);
4673     __ addi(out, out, 16);
4674     __ subi(remaining, remaining, 12);
4675     __ subi(bytes_to_write, bytes_to_write, 16);
4676     __ rldicr(tmp1, bytes_to_write, 56, 7);
4677     __ lxvl(input->to_vsr(), in, tmp1);
4678 
4679     ENCODE_CORE
4680 
4681     __ bind(le_16_to_write);
4682     // shift bytes_to_write into the upper 8 bits of t1 for use by stxvl
4683     __ rldicr(tmp1, bytes_to_write, 56, 7);
4684     __ stxvl(expanded->to_vsr(), out, tmp1);
4685     __ add(out, out, bytes_to_write);
4686 
4687     __ li(pad_char, '=');
4688     __ rlwinm_(modulo_chars, bytes_to_write, 0, 30, 31); // bytes_to_write % 4, set CR0
4689     // Examples:
4690     //    remaining  bytes_to_write  modulo_chars  num pad chars
4691     //        0            0               0            0
4692     //        1            2               2            2
4693     //        2            3               3            1
4694     //        3            4               0            0
4695     //        4            6               2            2
4696     //        5            7               3            1
4697     //        ...
4698     //       12           16               0            0
4699     //       13           18               2            2
4700     //       14           19               3            1
4701     //       15           20               0            0
4702     __ beq(CR0, no_pad);
4703     __ cmpwi(CR7, modulo_chars, 3);
4704     __ beq(CR7, one_pad_char);
4705 
4706     // two pad chars
4707     __ stb(pad_char, out);
4708     __ addi(out, out, 1);
4709 
4710     __ bind(one_pad_char);
4711     __ stb(pad_char, out);
4712 
4713     __ bind(no_pad);
4714 
4715     __ blr();
4716     return start;
4717   }
4718 
4719 #endif // VM_LITTLE_ENDIAN
4720 
4721 void generate_lookup_secondary_supers_table_stub() {
         // Emits one lookup stub per secondary-supers table slot
         // (Klass::SECONDARY_SUPERS_TABLE_SIZE stubs in total) and records each
         // stub's entry address in StubRoutines::_lookup_secondary_supers_table_stubs.
         // Nothing is returned; the entry points are published through that array.
4722     StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_id;
4723     StubCodeMark mark(this, stub_id);
4724 
         // One fixed register assignment is used for every slot's stub.
         // NOTE(review): callers presumably pass their operands in exactly these
         // registers -- confirm against the call sites.
4725     const Register
4726       r_super_klass  = R4_ARG2,
4727       r_array_base   = R3_ARG1,
4728       r_array_length = R7_ARG5,
4729       r_array_index  = R6_ARG4,
4730       r_sub_klass    = R5_ARG3,
4731       r_bitmap       = R11_scratch1,
4732       result         = R8_ARG6;
4733 
4734     for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
           // The current pc is the entry point of the stub for this slot.
4735       StubRoutines::_lookup_secondary_supers_table_stubs[slot] = __ pc();
           // The slot index is passed as a generation-time constant, so each stub
           // is specialized for its slot.
4736       __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass,
4737                                              r_array_base, r_array_length, r_array_index,
4738                                              r_bitmap, result, slot);
           // Each stub ends with its own return.
4739       __ blr();
4740     }
4741   }
4742 
4743   // Slow path implementation for UseSecondarySupersTable.
       // Emits MacroAssembler::lookup_secondary_supers_table_slow_path() with a fixed
       // register assignment, followed by a return (blr).
       // Returns the entry address of the generated code.
4744   address generate_lookup_secondary_supers_table_slow_path_stub() {
4745     StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_slow_path_id;
4746     StubCodeMark mark(this, stub_id);
4747 
4748     address start = __ pc();
         // Fixed register assignment for the slow path.
         // NOTE(review): presumably chosen to match the registers used by the
         // per-slot lookup stubs -- confirm against the callers.
4749     const Register
4750       r_super_klass  = R4_ARG2,
4751       r_array_base   = R3_ARG1,
4752       temp1          = R7_ARG5,
4753       r_array_index  = R6_ARG4,
4754       r_bitmap       = R11_scratch1,
4755       result         = R8_ARG6;
4756 
4757     __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, result, temp1);
4758     __ blr();
4759 
4760     return start;
4761   }
4762 
4763   address generate_cont_thaw(StubId stub_id) {
         // Generates one of the three continuation thaw stubs, selected by stub_id:
         //   stubgen_cont_thaw_id             -> Continuation::thaw_top
         //   stubgen_cont_returnBarrier_id    -> Continuation::thaw_return_barrier
         //   stubgen_cont_returnBarrierExc_id -> Continuation::thaw_return_barrier_exception
         // Returns the entry address of the generated code, or nullptr when
         // continuations are disabled. Any other stub_id ends in ShouldNotReachHere().
4764     if (!Continuations::enabled()) return nullptr;
4765 
4766     Continuation::thaw_kind kind;
4767     bool return_barrier;
4768     bool return_barrier_exception;
4769 
         // Derive the thaw kind and the two flags that steer code generation below.
4770     switch (stub_id) {
4771     case StubId::stubgen_cont_thaw_id:
4772       kind = Continuation::thaw_top;
4773       return_barrier = false;
4774       return_barrier_exception = false;
4775       break;
4776     case StubId::stubgen_cont_returnBarrier_id:
4777       kind = Continuation::thaw_return_barrier;
4778       return_barrier = true;
4779       return_barrier_exception = false;
4780       break;
4781     case StubId::stubgen_cont_returnBarrierExc_id:
4782       kind = Continuation::thaw_return_barrier_exception;
4783       return_barrier = true;
4784       return_barrier_exception = true;
4785       break;
4786     default:
4787       ShouldNotReachHere();
4788     }
4789     StubCodeMark mark(this, stub_id);
4790 
4791     Register tmp1 = R10_ARG8;
4792     Register tmp2 = R9_ARG7;
         // tmp3 is declared but not referenced anywhere in this function.
4793     Register tmp3 = R8_ARG6;
4794     Register nvtmp = R15_esp;   // nonvolatile tmp register
4795     FloatRegister nvftmp = F20; // nonvolatile fp tmp register
4796 
4797     address start = __ pc();
4798 
4799     if (kind == Continuation::thaw_top) {
4800       __ clobber_nonvolatile_registers(); // Except R16_thread and R29_TOC
4801     }
4802 
4803     if (return_barrier) {
           // Park the integer and floating point return values in nonvolatile
           // registers, then reload R1_SP from JavaThread::cont_entry_offset().
           // With ASSERT, the callers_sp slot is read through the old and the new
           // R1_SP and the two values must be equal.
4804       __ mr(nvtmp, R3_RET); __ fmr(nvftmp, F1_RET); // preserve possible return value from a method returning to the return barrier
4805       DEBUG_ONLY(__ ld_ptr(tmp1, _abi0(callers_sp), R1_SP);)
4806       __ ld_ptr(R1_SP, JavaThread::cont_entry_offset(), R16_thread);
4807 #ifdef ASSERT
4808       __ ld_ptr(tmp2, _abi0(callers_sp), R1_SP);
4809       __ cmpd(CR0, tmp1, tmp2);
4810       __ asm_assert_eq(FILE_AND_LINE ": callers sp is corrupt");
4811 #endif
4812     }
         // With ASSERT: for every variant, R1_SP must equal the value stored at
         // JavaThread::cont_entry_offset() at this point.
4813 #ifdef ASSERT
4814     __ ld_ptr(tmp1, JavaThread::cont_entry_offset(), R16_thread);
4815     __ cmpd(CR0, R1_SP, tmp1);
4816     __ asm_assert_eq(FILE_AND_LINE ": incorrect R1_SP");
4817 #endif
4818 
         // Call Continuation::prepare_thaw(thread, return_barrier); the second
         // argument is 1 for the return barrier variants and 0 otherwise.
4819     __ li(R4_ARG2, return_barrier ? 1 : 0);
4820     __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), R16_thread, R4_ARG2);
4821 
         // With ASSERT: repeat the R1_SP check after the leaf call.
4822 #ifdef ASSERT
4823     DEBUG_ONLY(__ ld_ptr(tmp1, JavaThread::cont_entry_offset(), R16_thread));
4824     DEBUG_ONLY(__ cmpd(CR0, R1_SP, tmp1));
4825     __ asm_assert_eq(FILE_AND_LINE ": incorrect R1_SP");
4826 #endif
4827 
4828     // R3_RET contains the size of the frames to thaw, 0 if overflow or no more frames
4829     Label thaw_success;
4830     __ cmpdi(CR0, R3_RET, 0);
4831     __ bne(CR0, thaw_success);
         // Size 0: jump (via CTR, no return) to the StackOverflowError throw entry.
4832     __ load_const_optimized(tmp1, (SharedRuntime::throw_StackOverflowError_entry()), R0);
4833     __ mtctr(tmp1); __ bctr();
4834     __ bind(thaw_success);
4835 
         // Compute the frame resize offset:
         //   -(size + frame::native_abi_reg_args_size), aligned down to
         //   frame::alignment_in_bytes.
4836     __ addi(R3_RET, R3_RET, frame::native_abi_reg_args_size); // Large abi required for C++ calls.
4837     __ neg(R3_RET, R3_RET);
4838     // align down resulting in a smaller negative offset
4839     __ clrrdi(R3_RET, R3_RET, exact_log2(frame::alignment_in_bytes));
4840     DEBUG_ONLY(__ mr(tmp1, R1_SP);)
4841     __ resize_frame(R3_RET, tmp2);  // make room for the thawed frames
4842 
         // Call the thaw entry with (thread, kind) and switch R1_SP to the SP it returns.
4843     __ li(R4_ARG2, kind);
4844     __ call_VM_leaf(Continuation::thaw_entry(), R16_thread, R4_ARG2);
4845     __ mr(R1_SP, R3_RET); // R3_RET contains the SP of the thawed top frame
4846 
4847     if (return_barrier) {
4848       // we're now in the caller of the frame that returned to the barrier
4849       __ mr(R3_RET, nvtmp); __ fmr(F1_RET, nvftmp); // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
4850     } else {
4851       // we're now on the yield frame (which is in an address above us b/c rsp has been pushed down)
4852       __ li(R3_RET, 0); // return 0 (success) from doYield
4853     }
4854 
4855     if (return_barrier_exception) {
           // Exception variant: look up the handler for the thawed frame's return pc
           // and "return" to that handler with the exception oop in R3_ARG1 and the
           // exception pc in R4_ARG2.
4856       Register ex_pc = R17_tos;   // nonvolatile register
4857       __ ld(ex_pc, _abi0(lr), R1_SP); // LR
4858       __ mr(nvtmp, R3_RET); // save return value containing the exception oop
4859       // The thawed top frame has got a frame::java_abi. This is not sufficient for the runtime call.
4860       __ push_frame_reg_args(0, tmp1);
4861       __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), R16_thread, ex_pc);
4862       __ mtlr(R3_RET); // the exception handler
4863       __ pop_frame();
4864       // See OptoRuntime::generate_exception_blob for register arguments
4865       __ mr(R3_ARG1, nvtmp); // exception oop
4866       __ mr(R4_ARG2, ex_pc); // exception pc
4867     } else {
4868       // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
4869       __ ld(R0, _abi0(lr), R1_SP); // LR
4870       __ mtlr(R0);
4871     }
         // LR now holds either the exception handler or the thawed frame's return pc.
4872     __ blr();
4873 
4874     return start;
4875   }
4876 
4877   address generate_cont_thaw() {
         // Generates the thaw stub for StubId::stubgen_cont_thaw_id by delegating to
         // generate_cont_thaw(StubId). Returns whatever that overload returns
         // (nullptr when continuations are disabled).
4878     return generate_cont_thaw(StubId::stubgen_cont_thaw_id);
4879   }
4880 
4881   // TODO: will probably need multiple return barriers depending on return type
4882 
4883   address generate_cont_returnBarrier() {
         // Generates the return barrier stub (StubId::stubgen_cont_returnBarrier_id)
         // by delegating to generate_cont_thaw(StubId). Returns whatever that
         // overload returns (nullptr when continuations are disabled).
4884     return generate_cont_thaw(StubId::stubgen_cont_returnBarrier_id);
4885   }
4886 
4887   address generate_cont_returnBarrier_exception() {
         // Generates the exception flavour of the return barrier stub
         // (StubId::stubgen_cont_returnBarrierExc_id) by delegating to
         // generate_cont_thaw(StubId). Returns whatever that overload returns
         // (nullptr when continuations are disabled).
4888     return generate_cont_thaw(StubId::stubgen_cont_returnBarrierExc_id);
4889   }
4890 
4891   address generate_cont_preempt_stub() {
         // Generates the continuation preempt stub (StubId::stubgen_cont_preempt_id).
         // The stub resets R1_SP to the enterSpecial frame and then either returns to
         // unmount, or, if the thread's preemption_cancelled flag is set, clears the
         // flag and jumps to the thaw call pc.
         // Returns the entry address, or nullptr when continuations are disabled.
4892     if (!Continuations::enabled()) return nullptr;
4893     StubId stub_id = StubId::stubgen_cont_preempt_id;
4894     StubCodeMark mark(this, stub_id);
4895     address start = __ pc();
4896 
4897     __ clobber_nonvolatile_registers(); // Except R16_thread and R29_TOC
4898 
4899     __ reset_last_Java_frame(false /*check_last_java_sp*/);
4900 
4901     // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap.
4902     __ ld_ptr(R1_SP, JavaThread::cont_entry_offset(), R16_thread);
4903 
         // Test the byte-sized preemption_cancelled flag of the current thread.
4904     Label preemption_cancelled;
4905     __ lbz(R11_scratch1, in_bytes(JavaThread::preemption_cancelled_offset()), R16_thread);
4906     __ cmpwi(CR0, R11_scratch1, 0);
4907     __ bne(CR0, preemption_cancelled);
4908 
4909     // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount.
4910     SharedRuntime::continuation_enter_cleanup(_masm);
4911     __ pop_frame();
4912     __ restore_LR(R11_scratch1);
4913     __ blr();
4914 
4915     // We acquired the monitor after freezing the frames so call thaw to continue execution.
4916     __ bind(preemption_cancelled);
4917     __ li(R11_scratch1, 0); // false
4918     __ stb(R11_scratch1, in_bytes(JavaThread::preemption_cancelled_offset()), R16_thread);
         // Load the pc stored at ContinuationEntry::thaw_call_pc_address() and jump
         // to it via CTR. The value returned by load_const_optimized() is used as
         // the displacement of the following ld.
         // NOTE(review): presumably the trailing 'true' makes load_const_optimized
         // return the low part of the address instead of adding it -- confirm.
4919     int simm16_offs = __ load_const_optimized(R11_scratch1, ContinuationEntry::thaw_call_pc_address(), R0, true);
4920     __ ld(R11_scratch1, simm16_offs, R11_scratch1);
4921     __ mtctr(R11_scratch1);
4922     __ bctr();
4923 
4924     return start;
4925   }
4926 
4927   // exception handler for upcall stubs
       // The generated code verifies the oop in R3_ARG1, calls
       // UpcallLinker::handle_uncaught_exception through call_c and is not expected
       // to continue past that call (should_not_reach_here follows it).
       // Returns the entry address of the generated code.
4928   address generate_upcall_stub_exception_handler() {
4929     StubId stub_id = StubId::stubgen_upcall_stub_exception_handler_id;
4930     StubCodeMark mark(this, stub_id);
4931     address start = __ pc();
4932 
4933     // Native caller has no idea how to handle exceptions,
4934     // so we just crash here. Up to callee to catch exceptions.
4935     __ verify_oop(R3_ARG1);
         // Materialize the C++ handler's address in R12_scratch2 (R0 is the temp)
         // and call it.
4936     __ load_const_optimized(R12_scratch2, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception), R0);
4937     __ call_c(R12_scratch2);
4938     __ should_not_reach_here();
4939 
4940     return start;
4941   }
4942 
4943   // load Method* target of MethodHandle
4944   // R3_ARG1 = jobject receiver
4945   // R19_method = result Method*
       // R22_tmp2 and R23_tmp3 are passed as temp registers to the resolve and
       // heap-oop load helpers. The result is also stored to
       // JavaThread::callee_target_offset() of the current thread.
       // Returns the entry address of the generated code.
4946   address generate_upcall_stub_load_target() {
4947 
4948     StubId stub_id = StubId::stubgen_upcall_stub_load_target_id;
4949     StubCodeMark mark(this, stub_id);
4950     address start = __ pc();
4951 
         // Turn the global jobject handle in R3_ARG1 into the receiver oop.
4952     __ resolve_global_jobject(R3_ARG1, R22_tmp2, R23_tmp3, MacroAssembler::PRESERVATION_FRAME_LR_GP_FP_REGS);
4953     // Load target method from receiver
         // Field chain: MethodHandle.form -> LambdaForm.vmentry -> MemberName.method
         //              -> ResolvedMethodName.vmtarget (a raw Method*, hence plain ld).
4954     __ load_heap_oop(R19_method, java_lang_invoke_MethodHandle::form_offset(), R3_ARG1,
4955                      R22_tmp2, R23_tmp3, MacroAssembler::PRESERVATION_FRAME_LR_GP_FP_REGS, IS_NOT_NULL);
4956     __ load_heap_oop(R19_method, java_lang_invoke_LambdaForm::vmentry_offset(), R19_method,
4957                      R22_tmp2, R23_tmp3, MacroAssembler::PRESERVATION_FRAME_LR_GP_FP_REGS, IS_NOT_NULL);
4958     __ load_heap_oop(R19_method, java_lang_invoke_MemberName::method_offset(), R19_method,
4959                      R22_tmp2, R23_tmp3, MacroAssembler::PRESERVATION_FRAME_LR_GP_FP_REGS, IS_NOT_NULL);
4960     __ ld(R19_method, java_lang_invoke_ResolvedMethodName::vmtarget_offset(), R19_method);
4961     __ std(R19_method, in_bytes(JavaThread::callee_target_offset()), R16_thread); // just in case callee is deoptimized
4962 
4963     __ blr();
4964 
4965     return start;
4966   }
4967 
4968   // Initialization
4969   void generate_preuniverse_stubs() {
         // Intentionally empty; the constructor still calls this for
         // BlobId::stubgen_preuniverse_id.
4970     // preuniverse stubs are not needed for ppc
4971   }
4972 
4973   void generate_initial_stubs() {
4974     // Generates all stubs and initializes the entry points
         // for the initial blob: forward-exception, call stub and catch-exception
         // entries, the UnsafeMemoryAccess table, and the optional CRC32/CRC32C and
         // float16 conversion stubs.
4975 
4976     // Entry points that exist in all platforms.
4977     // Note: This is code that could be shared among different platforms - however the
4978     // benefit seems to be smaller than the disadvantage of having a
4979     // much more complicated generator structure. See also comment in
4980     // stubRoutines.hpp.
4981 
4982     StubRoutines::_forward_exception_entry          = generate_forward_exception();
4983     StubRoutines::_call_stub_entry                  = generate_call_stub(StubRoutines::_call_stub_return_address);
4984     StubRoutines::_catch_exception_entry            = generate_catch_exception();
4985 
         // Create the table only if it does not exist yet.
4986     if (UnsafeMemoryAccess::_table == nullptr) {
4987       UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
4988     }
4989 
4990     // CRC32 Intrinsics.
4991     if (UseCRC32Intrinsics) {
4992       StubRoutines::_updateBytesCRC32 = generate_CRC32_updateBytes(StubId::stubgen_updateBytesCRC32_id);
4993     }
4994 
4995     // CRC32C Intrinsics.
         // Uses the same generator as CRC32, selected by a different stub id.
4996     if (UseCRC32CIntrinsics) {
4997       StubRoutines::_updateBytesCRC32C = generate_CRC32_updateBytes(StubId::stubgen_updateBytesCRC32C_id);
4998     }
4999 
5000     if (VM_Version::supports_float16()) {
5001       // For results consistency both intrinsics should be enabled.
5002       StubRoutines::_hf2f = generate_float16ToFloat();
5003       StubRoutines::_f2hf = generate_floatToFloat16();
5004     }
5005   }
5006 
5007   void generate_continuation_stubs() {
5008     // Continuation stubs:
         // thaw, return barrier, return barrier (exception) and preempt. Each
         // generator returns nullptr when continuations are disabled, so the
         // StubRoutines fields are nullptr in that case.
5009     StubRoutines::_cont_thaw          = generate_cont_thaw();
5010     StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
5011     StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
5012     StubRoutines::_cont_preempt_stub  = generate_cont_preempt_stub();
5013   }
5014 
5015   void generate_final_stubs() {
5016     // Generates all stubs and initializes the entry points
         // for the final blob: verify_oop, the nmethod entry barrier, the arraycopy
         // stubs, the secondary supers table lookup stubs (COMPILER2 only) and the
         // two upcall helper stubs.
5017 
5018     // support for verify_oop (must happen after universe_init)
5019     StubRoutines::_verify_oop_subroutine_entry             = generate_verify_oop();
5020 
5021     // nmethod entry barriers for concurrent class unloading
5022     StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
5023 
5024     // arraycopy stubs used by compilers
5025     generate_arraycopy_stubs();
5026 
5027 #ifdef COMPILER2
         // The slow path stub is generated whenever UseSecondarySupersTable is set;
         // the per-slot stubs only when InlineSecondarySupersTest is off.
5028     if (UseSecondarySupersTable) {
5029       StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
5030       if (!InlineSecondarySupersTest) {
5031         generate_lookup_secondary_supers_table_stub();
5032       }
5033     }
5034 #endif // COMPILER2
5035 
5036     StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
5037     StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();
5038   }
5039 
5040   void generate_compiler_stubs() {
         // Generates the intrinsic stubs of the compiler blob. The whole body is
         // guarded by COMPILER2, so without COMPILER2 this function does nothing.
         // Each stub is generated only if its Use*Intrinsic(s) flag or VM_Version
         // capability check is set.
5041 #ifdef COMPILER2
5042 
5043     if (UseMultiplyToLenIntrinsic) {
5044       StubRoutines::_multiplyToLen = generate_multiplyToLen();
5045     }
5046     if (UseSquareToLenIntrinsic) {
5047       StubRoutines::_squareToLen = generate_squareToLen();
5048     }
5049     if (UseMulAddIntrinsic) {
5050       StubRoutines::_mulAdd = generate_mulAdd();
5051     }
         // The Montgomery entries are not generated code: they point directly at
         // the SharedRuntime C++ functions.
5052     if (UseMontgomeryMultiplyIntrinsic) {
5053       StubRoutines::_montgomeryMultiply
5054         = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_multiply);
5055     }
5056     if (UseMontgomerySquareIntrinsic) {
5057       StubRoutines::_montgomerySquare
5058         = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square);
5059     }
5060 
5061     // data cache line writeback
5062     if (VM_Version::supports_data_cache_line_flush()) {
5063       StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
5064       StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
5065     }
5066 
5067     if (UseGHASHIntrinsics) {
5068       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
5069     }
5070 
5071     if (UseAESIntrinsics) {
5072       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
5073       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
5074     }
5075 
         // For SHA-256 and SHA-512 the same generator produces both variants
         // (implCompress / implCompressMB); the stub id selects which one.
5076     if (UseSHA256Intrinsics) {
5077       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(StubId::stubgen_sha256_implCompress_id);
5078       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(StubId::stubgen_sha256_implCompressMB_id);
5079     }
5080     if (UseSHA512Intrinsics) {
5081       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(StubId::stubgen_sha512_implCompress_id);
5082       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(StubId::stubgen_sha512_implCompressMB_id);
5083     }
5084 
5085 #ifdef VM_LITTLE_ENDIAN
5086     // Currently supported on PPC64LE only
5087     if (UseBASE64Intrinsics) {
5088       StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
5089       StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
5090     }
5091 #endif
5092 #endif // COMPILER2
5093   }
5094 
5095  public:
5096   StubGenerator(CodeBuffer* code, BlobId blob_id, AOTStubData *stub_data) : StubCodeGenerator(code, blob_id, stub_data) {
         // Constructing a StubGenerator generates the stubs of the blob named by
         // blob_id: all three arguments are forwarded to StubCodeGenerator, then
         // the matching generate_*_stubs() method runs.
         // An unknown blob_id is a fatal error.
5097     switch(blob_id) {
5098     case BlobId::stubgen_preuniverse_id:
5099       generate_preuniverse_stubs();
5100       break;
5101     case BlobId::stubgen_initial_id:
5102       generate_initial_stubs();
5103       break;
5104     case BlobId::stubgen_continuation_id:
5105       generate_continuation_stubs();
5106       break;
5107     case BlobId::stubgen_compiler_id:
5108       generate_compiler_stubs();
5109       break;
5110     case BlobId::stubgen_final_id:
5111       generate_final_stubs();
5112       break;
5113     default:
5114       fatal("unexpected blob id: %s", StubInfo::name(blob_id));
5115       break;
5116     };
5117   }
5118 };
5119 
5120 void StubGenerator_generate(CodeBuffer* code, BlobId blob_id, AOTStubData *stub_data) {
       // Entry point for stub generation: all work happens in the StubGenerator
       // constructor, so the local object only needs to be constructed.
5121   StubGenerator g(code, blob_id, stub_data);
5122 }
5123