1 /*
   2  * Copyright (c) 1997, 2025, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2012, 2025 SAP SE. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "asm/macroAssembler.inline.hpp"
  27 #include "compiler/oopMap.hpp"
  28 #include "gc/shared/barrierSet.hpp"
  29 #include "gc/shared/barrierSetAssembler.hpp"
  30 #include "gc/shared/barrierSetNMethod.hpp"
  31 #include "interpreter/interpreter.hpp"
  32 #include "nativeInst_ppc.hpp"
  33 #include "oops/instanceOop.hpp"
  34 #include "oops/method.hpp"
  35 #include "oops/objArrayKlass.hpp"
  36 #include "oops/oop.inline.hpp"
  37 #include "prims/methodHandles.hpp"
  38 #include "prims/upcallLinker.hpp"
  39 #include "runtime/continuation.hpp"
  40 #include "runtime/continuationEntry.inline.hpp"
  41 #include "runtime/frame.inline.hpp"
  42 #include "runtime/handles.inline.hpp"
  43 #include "runtime/javaThread.hpp"
  44 #include "runtime/sharedRuntime.hpp"
  45 #include "runtime/stubCodeGenerator.hpp"
  46 #include "runtime/stubRoutines.hpp"
  47 #include "runtime/vm_version.hpp"
  48 #include "utilities/align.hpp"
  49 #include "utilities/powerOfTwo.hpp"
  50 #if INCLUDE_ZGC
  51 #include "gc/z/zBarrierSetAssembler.hpp"
  52 #endif
  53 
  54 // Declaration and definition of StubGenerator (no .hpp file).
  55 // For a more detailed description of the stub routine structure
  56 // see the comment in stubRoutines.hpp.
  57 
  58 #define __ _masm->
  59 
  60 #ifdef PRODUCT
  61 #define BLOCK_COMMENT(str) // nothing
  62 #else
  63 #define BLOCK_COMMENT(str) __ block_comment(str)
  64 #endif
  65 
  66 #if defined(ABI_ELFv2)
  67 #define STUB_ENTRY(name) StubRoutines::name
  68 #else
  69 #define STUB_ENTRY(name) ((FunctionDescriptor*)StubRoutines::name)->entry()
  70 #endif
  71 
  72 class StubGenerator: public StubCodeGenerator {
  73  private:
  74 
  75   // Call stubs are used to call Java from C
  76   //
  77   // Arguments:
  78   //
  79   //   R3  - call wrapper address     : address
  80   //   R4  - result                   : intptr_t*
  81   //   R5  - result type              : BasicType
  82   //   R6  - method                   : Method
  83   //   R7  - frame mgr entry point    : address
  84   //   R8  - parameter block          : intptr_t*
  85   //   R9  - parameter count in words : int
  86   //   R10 - thread                   : Thread*
  87   //
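  //
  // For reference, the C++ caller (JavaCalls::call_helper) reaches this stub
  // through the CallStub function pointer type declared in stubRoutines.hpp.
  // A rough sketch of that signature, matching the register list above:
  //
  //   typedef void (*CallStub)(address   call_wrapper,    // R3
  //                            intptr_t* result,          // R4
  //                            BasicType result_type,     // R5
  //                            Method*   method,          // R6
  //                            address   entry_point,     // R7
  //                            intptr_t* parameters,      // R8
  //                            int       parameter_words, // R9
  //                            JavaThread* thread);       // R10
  //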
  88   address generate_call_stub(address& return_address) {
    // Set up a new C frame, copy the Java arguments, call the frame manager or
    // native_entry, and process the result.
  91 
  92     StubGenStubId stub_id = StubGenStubId::call_stub_id;
  93     StubCodeMark mark(this, stub_id);
  94 
  95     address start = __ function_entry();
  96 
  97     int save_nonvolatile_registers_size = __ save_nonvolatile_registers_size(true, SuperwordUseVSX);
  98 
  99     // some sanity checks
 100     STATIC_ASSERT(StackAlignmentInBytes == 16);
 101     assert((sizeof(frame::native_abi_minframe) % 16) == 0,    "unaligned");
 102     assert((sizeof(frame::native_abi_reg_args) % 16) == 0,    "unaligned");
 103     assert((save_nonvolatile_registers_size % 16) == 0,       "unaligned");
 104     assert((sizeof(frame::parent_ijava_frame_abi) % 16) == 0, "unaligned");
 105     assert((sizeof(frame::entry_frame_locals) % 16) == 0,     "unaligned");
 106 
 107     Register r_arg_call_wrapper_addr        = R3;
 108     Register r_arg_result_addr              = R4;
 109     Register r_arg_result_type              = R5;
 110     Register r_arg_method                   = R6;
 111     Register r_arg_entry                    = R7;
 112     Register r_arg_argument_addr            = R8;
 113     Register r_arg_argument_count           = R9;
 114     Register r_arg_thread                   = R10;
 115 
 116     Register r_entryframe_fp                = R2; // volatile
 117     Register r_argument_size                = R11_scratch1; // volatile
 118     Register r_top_of_arguments_addr        = R21_tmp1;
 119 
 120     {
 121       // Stack on entry to call_stub:
 122       //
 123       //      F1      [C_FRAME]
 124       //              ...
 125       Register r_frame_size  = R12_scratch2; // volatile
 126       Label arguments_copied;
 127 
 128       // Save LR/CR to caller's C_FRAME.
 129       __ save_LR_CR(R0);
 130 
 131       // Keep copy of our frame pointer (caller's SP).
 132       __ mr(r_entryframe_fp, R1_SP);
 133 
 134       // calculate frame size
 135       STATIC_ASSERT(Interpreter::logStackElementSize == 3);
 136 
 137       // space for arguments aligned up: ((arg_count + 1) * 8) &~ 15
 138       __ addi(r_frame_size, r_arg_argument_count, 1);
 139       __ rldicr(r_frame_size, r_frame_size, 3, 63 - 4);
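      // Example with a hypothetical arg_count of 5: ((5 + 1) * 8) & ~15 = 48,
      // i.e. the 40 bytes of arguments rounded up to the next 16-byte boundary
      // (the rldicr shifts left by 3 and clears the low 4 bits).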
 140 
 141       // this is the pure space for arguments (excluding alignment padding)
 142       __ sldi(r_argument_size, r_arg_argument_count, 3);
 143 
 144       __ addi(r_frame_size, r_frame_size,
 145               save_nonvolatile_registers_size + frame::entry_frame_locals_size + frame::top_ijava_frame_abi_size);
 146 
 147       // push ENTRY_FRAME
 148       __ push_frame(r_frame_size, R0);
 149 
      // Save non-volatile registers to ENTRY_FRAME.
 151       __ save_nonvolatile_registers(r_entryframe_fp, -(frame::entry_frame_locals_size + save_nonvolatile_registers_size),
 152                                     true, SuperwordUseVSX);
 153 
 154       BLOCK_COMMENT("Push ENTRY_FRAME including arguments");
 155       // Push ENTRY_FRAME including arguments:
 156       //
 157       //      F0      [TOP_IJAVA_FRAME_ABI]
 158       //              alignment (optional)
 159       //              [outgoing Java arguments]
 160       //              [non-volatiles]
 161       //              [ENTRY_FRAME_LOCALS]
 162       //      F1      [C_FRAME]
 163       //              ...
 164 
 165       // initialize call_stub locals (step 1)
 166       __ std(r_arg_call_wrapper_addr, _entry_frame_locals_neg(call_wrapper_address), r_entryframe_fp);
 167       __ std(r_arg_result_addr, _entry_frame_locals_neg(result_address), r_entryframe_fp);
 168       __ std(r_arg_result_type, _entry_frame_locals_neg(result_type), r_entryframe_fp);
 169       // we will save arguments_tos_address later
 170 
 171       BLOCK_COMMENT("Copy Java arguments");
 172       // copy Java arguments
 173 
      // Calculate top_of_arguments_addr, which will become the initial tos (R15_esp, not prepushed) later.
 175       __ addi(r_top_of_arguments_addr, r_entryframe_fp,
 176               -(save_nonvolatile_registers_size + frame::entry_frame_locals_size));
 177       __ sub(r_top_of_arguments_addr, r_top_of_arguments_addr, r_argument_size);
 178 
 179       // any arguments to copy?
 180       __ cmpdi(CR0, r_arg_argument_count, 0);
 181       __ beq(CR0, arguments_copied);
 182 
 183       // prepare loop and copy arguments in reverse order
 184       {
 185         Register r_argument_addr     = R22_tmp2;
 186         Register r_argumentcopy_addr = R23_tmp3;
 187         // init CTR with arg_argument_count
 188         __ mtctr(r_arg_argument_count);
 189 
        // let r_argumentcopy_addr point to the last outgoing Java argument
 191         __ mr(r_argumentcopy_addr, r_top_of_arguments_addr);
 192 
        // let r_argument_addr point to the last incoming Java argument
 194         __ add(r_argument_addr, r_arg_argument_addr, r_argument_size);
 195         __ addi(r_argument_addr, r_argument_addr, -BytesPerWord);
 196 
 197         // now loop while CTR > 0 and copy arguments
 198         {
 199           Label next_argument;
 200           __ bind(next_argument);
 201 
 202           __ ld(R0, 0, r_argument_addr);
 203           // argument_addr--;
 204           __ addi(r_argument_addr, r_argument_addr, -BytesPerWord);
 205           __ std(R0, 0, r_argumentcopy_addr);
 206           // argumentcopy_addr++;
 207           __ addi(r_argumentcopy_addr, r_argumentcopy_addr, BytesPerWord);
 208 
 209           __ bdnz(next_argument);
 210         }
 211       }
 212 
 213       // Arguments copied, continue.
 214       __ bind(arguments_copied);
 215     }
 216 
 217     {
 218       BLOCK_COMMENT("Call frame manager or native entry.");
 219       // Call frame manager or native entry.
 220       assert_different_registers(r_arg_entry, r_top_of_arguments_addr, r_arg_method, r_arg_thread);
 221 
 222       // Register state on entry to frame manager / native entry:
 223       //
 224       //   tos         -  intptr_t*    sender tos (prepushed) Lesp = (SP) + copied_arguments_offset - 8
 225       //   R19_method  -  Method
 226       //   R16_thread  -  JavaThread*
 227 
 228       // Tos must point to last argument - element_size.
 229       const Register tos = R15_esp;
 230 
 231       __ addi(tos, r_top_of_arguments_addr, -Interpreter::stackElementSize);
 232 
 233       // initialize call_stub locals (step 2)
 234       // now save tos as arguments_tos_address
 235       __ std(tos, _entry_frame_locals_neg(arguments_tos_address), r_entryframe_fp);
 236 
 237       // load argument registers for call
 238       __ mr(R19_method, r_arg_method);
 239       __ mr(R16_thread, r_arg_thread);
 240       assert(tos != r_arg_method, "trashed r_arg_method");
 241       assert(tos != r_arg_thread && R19_method != r_arg_thread, "trashed r_arg_thread");
 242 
      // Initialize R25_templateTableBase with the interpreter dispatch table base (simplifies dispatch in the callee).
 244       __ load_const_optimized(R25_templateTableBase, (address)Interpreter::dispatch_table((TosState)0), R0);
 245       // Stack on entry to frame manager / native entry:
 246       //
 247       //      F0      [TOP_IJAVA_FRAME_ABI]
 248       //              alignment (optional)
 249       //              [outgoing Java arguments]
 250       //              [non-volatiles]
 251       //              [ENTRY_FRAME_LOCALS]
 252       //      F1      [C_FRAME]
 253       //              ...
 254       //
 255 
 256       // global toc register
 257       __ load_const_optimized(R29_TOC, MacroAssembler::global_toc(), R0);
      // Remember the senderSP so the interpreter can pop c2i arguments off the stack
      // when called via a c2i adapter.
 260 
      // Pass initial_caller_sp to the frame manager.
 262       __ mr(R21_sender_SP, R1_SP);
 263 
 264       // Do a light-weight C-call here, r_arg_entry holds the address
 265       // of the interpreter entry point (frame manager or native entry)
 266       // and save runtime-value of LR in return_address.
 267       assert(r_arg_entry != tos && r_arg_entry != R19_method && r_arg_entry != R16_thread,
 268              "trashed r_arg_entry");
 269       return_address = __ call_stub(r_arg_entry);
 270     }
 271 
 272     {
 273       BLOCK_COMMENT("Returned from frame manager or native entry.");
 274       // Returned from frame manager or native entry.
 275       // Now pop frame, process result, and return to caller.
 276 
 277       // Stack on exit from frame manager / native entry:
 278       //
 279       //      F0      [ABI]
 280       //              ...
 281       //              [non-volatiles]
 282       //              [ENTRY_FRAME_LOCALS]
 283       //      F1      [C_FRAME]
 284       //              ...
 285       //
 286       // Just pop the topmost frame ...
 287       //
 288 
 289       Label ret_is_object;
 290       Label ret_is_long;
 291       Label ret_is_float;
 292       Label ret_is_double;
 293 
 294       Register r_lr = R11_scratch1;
 295       Register r_cr = R12_scratch2;
 296 
 297       // Reload some volatile registers which we've spilled before the call
 298       // to frame manager / native entry.
 299       // Access all locals via frame pointer, because we know nothing about
 300       // the topmost frame's size.
 301       __ ld(r_entryframe_fp, _abi0(callers_sp), R1_SP); // restore after call
 302       assert_different_registers(r_entryframe_fp, R3_RET, r_arg_result_addr, r_arg_result_type, r_cr, r_lr);
 303       __ ld(r_arg_result_addr, _entry_frame_locals_neg(result_address), r_entryframe_fp);
 304       __ ld(r_arg_result_type, _entry_frame_locals_neg(result_type), r_entryframe_fp);
 305       __ ld(r_cr, _abi0(cr), r_entryframe_fp);
 306       __ ld(r_lr, _abi0(lr), r_entryframe_fp);
 307       __ mtcr(r_cr); // restore CR
 308       __ mtlr(r_lr); // restore LR
 309 
 310       // Store result depending on type. Everything that is not
 311       // T_OBJECT, T_LONG, T_FLOAT, or T_DOUBLE is treated as T_INT.
 312       // Using volatile CRs.
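      // The result type is compared into four distinct CR fields up front so
      // that the branches can be taken after the register restore and frame pop
      // below (pop_cont_fastpath clobbers CR0, so CR0 is not used here).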
 313       __ cmpwi(CR1, r_arg_result_type, T_OBJECT);
 314       __ cmpwi(CR5, r_arg_result_type, T_LONG);
 315       __ cmpwi(CR6, r_arg_result_type, T_FLOAT);
 316       __ cmpwi(CR7, r_arg_result_type, T_DOUBLE);
 317 
 318       __ pop_cont_fastpath(); // kills CR0, uses R16_thread
 319 
 320       // restore non-volatile registers
 321       __ restore_nonvolatile_registers(r_entryframe_fp, -(frame::entry_frame_locals_size + save_nonvolatile_registers_size),
 322                                        true, SuperwordUseVSX);
 323 
 324       // pop frame
 325       __ mr(R1_SP, r_entryframe_fp);
 326 
 327       // Stack on exit from call_stub:
 328       //
 329       //      0       [C_FRAME]
 330       //              ...
 331       //
 332       //  no call_stub frames left.
 333 
 334       __ beq(CR1, ret_is_object);
 335       __ beq(CR5, ret_is_long);
 336       __ beq(CR6, ret_is_float);
 337       __ beq(CR7, ret_is_double);
 338 
 339       // default:
 340       __ stw(R3_RET, 0, r_arg_result_addr);
 341       __ blr(); // return to caller
 342 
 343       // case T_OBJECT:
 344       // case T_LONG:
 345       __ bind(ret_is_object);
 346       __ bind(ret_is_long);
 347       __ std(R3_RET, 0, r_arg_result_addr);
 348       __ blr(); // return to caller
 349 
 350       // case T_FLOAT:
 351       __ bind(ret_is_float);
 352       __ stfs(F1_RET, 0, r_arg_result_addr);
 353       __ blr(); // return to caller
 354 
 355       // case T_DOUBLE:
 356       __ bind(ret_is_double);
 357       __ stfd(F1_RET, 0, r_arg_result_addr);
 358       __ blr(); // return to caller
 359     }
 360 
 361     return start;
 362   }
 363 
 364   // Return point for a Java call if there's an exception thrown in
 365   // Java code.  The exception is caught and transformed into a
 366   // pending exception stored in JavaThread that can be tested from
 367   // within the VM.
 368   //
 369   address generate_catch_exception() {
 370     StubGenStubId stub_id = StubGenStubId::catch_exception_id;
 371     StubCodeMark mark(this, stub_id);
 372 
 373     address start = __ pc();
 374 
 375     // Registers alive
 376     //
 377     //  R16_thread
 378     //  R3_ARG1 - address of pending exception
 379     //  R4_ARG2 - return address in call stub
 380 
 381     const Register exception_file = R21_tmp1;
 382     const Register exception_line = R22_tmp2;
 383 
 384     __ load_const(exception_file, (void*)__FILE__);
 385     __ load_const(exception_line, (void*)__LINE__);
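    // Note that __FILE__/__LINE__ are evaluated when this stub is generated,
    // so the recorded file/line point into the stub generator source rather
    // than at the site that raised the exception.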
 386 
 387     __ std(R3_ARG1, in_bytes(JavaThread::pending_exception_offset()), R16_thread);
 388     // store into `char *'
 389     __ std(exception_file, in_bytes(JavaThread::exception_file_offset()), R16_thread);
 390     // store into `int'
 391     __ stw(exception_line, in_bytes(JavaThread::exception_line_offset()), R16_thread);
 392 
 393     // complete return to VM
 394     assert(StubRoutines::_call_stub_return_address != nullptr, "must have been generated before");
 395 
 396     __ mtlr(R4_ARG2);
 397     // continue in call stub
 398     __ blr();
 399 
 400     return start;
 401   }
 402 
 403   // Continuation point for runtime calls returning with a pending
 404   // exception.  The pending exception check happened in the runtime
 405   // or native call stub.  The pending exception in Thread is
 406   // converted into a Java-level exception.
 407   //
 408   // Read:
 409   //
 410   //   LR:     The pc the runtime library callee wants to return to.
 411   //           Since the exception occurred in the callee, the return pc
 412   //           from the point of view of Java is the exception pc.
 413   //   thread: Needed for method handles.
 414   //
 415   // Invalidate:
 416   //
 417   //   volatile registers (except below).
 418   //
 419   // Update:
 420   //
 421   //   R4_ARG2: exception
 422   //
 423   // (LR is unchanged and is live out).
 424   //
 425   address generate_forward_exception() {
 426     StubGenStubId stub_id = StubGenStubId::forward_exception_id;
 427     StubCodeMark mark(this, stub_id);
 428     address start = __ pc();
 429 
 430     if (VerifyOops) {
 431       // Get pending exception oop.
 432       __ ld(R3_ARG1,
 433                 in_bytes(Thread::pending_exception_offset()),
 434                 R16_thread);
 435       // Make sure that this code is only executed if there is a pending exception.
 436       {
 437         Label L;
 438         __ cmpdi(CR0, R3_ARG1, 0);
 439         __ bne(CR0, L);
 440         __ stop("StubRoutines::forward exception: no pending exception (1)");
 441         __ bind(L);
 442       }
 443       __ verify_oop(R3_ARG1, "StubRoutines::forward exception: not an oop");
 444     }
 445 
 446     // Save LR/CR and copy exception pc (LR) into R4_ARG2.
 447     __ save_LR(R4_ARG2);
 448     __ push_frame_reg_args(0, R0);
 449     // Find exception handler.
 450     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 451                      SharedRuntime::exception_handler_for_return_address),
 452                     R16_thread,
 453                     R4_ARG2);
 454     // Copy handler's address.
 455     __ mtctr(R3_RET);
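    // Park the handler address in CTR so that R3/R4 and LR are free to be
    // loaded with the exception oop and exception pc before the bctr below.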
 456     __ pop_frame();
 457     __ restore_LR(R0);
 458 
 459     // Set up the arguments for the exception handler:
 460     //  - R3_ARG1: exception oop
 461     //  - R4_ARG2: exception pc.
 462 
 463     // Load pending exception oop.
 464     __ ld(R3_ARG1,
 465               in_bytes(Thread::pending_exception_offset()),
 466               R16_thread);
 467 
 468     // The exception pc is the return address in the caller.
 469     // Must load it into R4_ARG2.
 470     __ mflr(R4_ARG2);
 471 
 472 #ifdef ASSERT
 473     // Make sure exception is set.
 474     {
 475       Label L;
 476       __ cmpdi(CR0, R3_ARG1, 0);
 477       __ bne(CR0, L);
 478       __ stop("StubRoutines::forward exception: no pending exception (2)");
 479       __ bind(L);
 480     }
 481 #endif
 482 
 483     // Clear the pending exception.
 484     __ li(R0, 0);
 485     __ std(R0,
 486                in_bytes(Thread::pending_exception_offset()),
 487                R16_thread);
 488     // Jump to exception handler.
 489     __ bctr();
 490 
 491     return start;
 492   }
 493 
 494 #undef __
 495 #define __ _masm->
 496 
 497 #if !defined(PRODUCT)
 498   // Wrapper which calls oopDesc::is_oop_or_null()
 499   // Only called by MacroAssembler::verify_oop
 500   static void verify_oop_helper(const char* message, oopDesc* o) {
 501     if (!oopDesc::is_oop_or_null(o)) {
 502       fatal("%s. oop: " PTR_FORMAT, message, p2i(o));
 503     }
 504     ++ StubRoutines::_verify_oop_count;
 505   }
 506 #endif
 507 
 508   // Return address of code to be called from code generated by
 509   // MacroAssembler::verify_oop.
 510   //
  // No stub is generated; the C++ helper above is used instead.
 512   address generate_verify_oop() {
 513     // this is actually a `FunctionDescriptor*'.
 514     address start = nullptr;
 515 
 516 #if !defined(PRODUCT)
 517     start = CAST_FROM_FN_PTR(address, verify_oop_helper);
 518 #endif
 519 
 520     return start;
 521   }
 522 
 523   // Computes the Galois/Counter Mode (GCM) product and reduction.
 524   //
 525   // This function performs polynomial multiplication of the subkey H with
 526   // the current GHASH state using vectorized polynomial multiplication (`vpmsumd`).
 527   // The subkey H is divided into lower, middle, and higher halves.
 528   // The multiplication results are reduced using `vConstC2` to stay within GF(2^128).
 529   // The final computed value is stored back into `vState`.
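  //
  // A note on vConstC2 (an interpretation of the setup in the caller): it holds
  // the doubleword constant 0xC2000000_00000000, the bit-reflected form of the
  // GHASH reduction polynomial g(x) = x^128 + x^7 + x^2 + x + 1; the two
  // vpmsumd reductions below fold the 256-bit product back into GF(2^128)
  // using this constant.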
 530   static void computeGCMProduct(MacroAssembler* _masm,
 531                                 VectorRegister vLowerH, VectorRegister vH, VectorRegister vHigherH,
 532                                 VectorRegister vConstC2, VectorRegister vZero, VectorRegister vState,
 533                                 VectorRegister vLowProduct, VectorRegister vMidProduct, VectorRegister vHighProduct,
 534                                 VectorRegister vReducedLow, VectorRegister vTmp8, VectorRegister vTmp9,
 535                                 VectorRegister vCombinedResult, VectorRegister vSwappedH) {
 536     __ vxor(vH, vH, vState);
 537     __ vpmsumd(vLowProduct, vLowerH, vH);                          // L : Lower Half of subkey H
 538     __ vpmsumd(vMidProduct, vSwappedH, vH);                        // M : Combined halves of subkey H
 539     __ vpmsumd(vHighProduct, vHigherH, vH);                        // H : Higher Half of subkey H
 540     __ vpmsumd(vReducedLow, vLowProduct, vConstC2);                // Reduction
 541     __ vsldoi(vTmp8, vMidProduct, vZero, 8);                       // mL : Extract the lower 64 bits of M
 542     __ vsldoi(vTmp9, vZero, vMidProduct, 8);                       // mH : Extract the higher 64 bits of M
 543     __ vxor(vLowProduct, vLowProduct, vTmp8);                      // LL + mL : Partial result for lower half
 544     __ vxor(vHighProduct, vHighProduct, vTmp9);                    // HH + mH : Partial result for upper half
 545     __ vsldoi(vLowProduct, vLowProduct, vLowProduct, 8);           // Swap
 546     __ vxor(vLowProduct, vLowProduct, vReducedLow);
 547     __ vsldoi(vCombinedResult, vLowProduct, vLowProduct, 8);       // Swap
 548     __ vpmsumd(vLowProduct, vLowProduct, vConstC2);                // Reduction using constant
 549     __ vxor(vCombinedResult, vCombinedResult, vHighProduct);       // Combine reduced Low & High products
 550     __ vxor(vState, vLowProduct, vCombinedResult);
 551   }
 552 
 553   // Generate stub for ghash process blocks.
 554   //
 555   // Arguments for generated stub:
 556   //      state:    R3_ARG1 (long[] state)
 557   //      subkeyH:  R4_ARG2 (long[] subH)
 558   //      data:     R5_ARG3 (byte[] data)
 559   //      blocks:   R6_ARG4 (number of 16-byte blocks to process)
 560   //
 561   // The polynomials are processed in bit-reflected order for efficiency reasons.
 562   // This optimization leverages the structure of the Galois field arithmetic
 563   // to minimize the number of bit manipulations required during multiplication.
  // For an explanation of how this works, refer to:
  // Vinodh Gopal, Erdinc Ozturk, Wajdi Feghali, Jim Guilford, Gil Wolrich,
  // Martin Dixon. "Optimized Galois-Counter-Mode Implementation on Intel®
  // Architecture Processors"
 568   // http://web.archive.org/web/20130609111954/http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/communications-ia-galois-counter-mode-paper.pdf
 569   //
 570   //
 571   address generate_ghash_processBlocks() {
    StubGenStubId stub_id = StubGenStubId::ghash_processBlocks_id;
    StubCodeMark mark(this, stub_id);
 573     address start = __ function_entry();
 574 
 575     // Registers for parameters
 576     Register state = R3_ARG1;                     // long[] state
 577     Register subkeyH = R4_ARG2;                   // long[] subH
 578     Register data = R5_ARG3;                      // byte[] data
 579     Register blocks = R6_ARG4;
 580     Register temp1 = R8;
 581     // Vector Registers
 582     VectorRegister vZero = VR0;
 583     VectorRegister vH = VR1;
 584     VectorRegister vLowerH = VR2;
 585     VectorRegister vHigherH = VR3;
 586     VectorRegister vLowProduct = VR4;
 587     VectorRegister vMidProduct = VR5;
 588     VectorRegister vHighProduct = VR6;
 589     VectorRegister vReducedLow = VR7;
 590     VectorRegister vTmp8 = VR8;
 591     VectorRegister vTmp9 = VR9;
 592     VectorRegister vTmp10 = VR10;
 593     VectorRegister vSwappedH = VR11;
 594     VectorRegister vTmp12 = VR12;
 595     VectorRegister loadOrder = VR13;
 596     VectorRegister vHigh = VR14;
 597     VectorRegister vLow = VR15;
 598     VectorRegister vState = VR16;
 599     VectorRegister vPerm = VR17;
 600     VectorRegister vCombinedResult = VR18;
 601     VectorRegister vConstC2 = VR19;
 602 
 603     __ li(temp1, 0xc2);
 604     __ sldi(temp1, temp1, 56);
 605     __ vspltisb(vZero, 0);
 606     __ mtvrd(vConstC2, temp1);
 607     __ lxvd2x(vH->to_vsr(), subkeyH);
 608     __ lxvd2x(vState->to_vsr(), state);
 609     // Operations to obtain lower and higher bytes of subkey H.
 610     __ vspltisb(vReducedLow, 1);
 611     __ vspltisb(vTmp10, 7);
 612     __ vsldoi(vTmp8, vZero, vReducedLow, 1);            // 0x1
 613     __ vor(vTmp8, vConstC2, vTmp8);                     // 0xC2...1
 614     __ vsplt(vTmp9, 0, vH);                             // MSB of H
 615     __ vsl(vH, vH, vReducedLow);                        // Carry = H<<7
 616     __ vsrab(vTmp9, vTmp9, vTmp10);
 617     __ vand(vTmp9, vTmp9, vTmp8);                       // Carry
 618     __ vxor(vTmp10, vH, vTmp9);
 619     __ vsldoi(vConstC2, vZero, vConstC2, 8);
 620     __ vsldoi(vSwappedH, vTmp10, vTmp10, 8);            // swap Lower and Higher Halves of subkey H
 621     __ vsldoi(vLowerH, vZero, vSwappedH, 8);            // H.L
 622     __ vsldoi(vHigherH, vSwappedH, vZero, 8);           // H.H
 623 #ifdef ASSERT
 624     __ cmpwi(CR0, blocks, 0);                           // Compare 'blocks' (R6_ARG4) with zero
 625     __ asm_assert_ne("blocks should NOT be zero");
 626 #endif
 627     __ clrldi(blocks, blocks, 32);
 628     __ mtctr(blocks);
 629     __ lvsl(loadOrder, temp1);
 630 #ifdef VM_LITTLE_ENDIAN
 631     __ vspltisb(vTmp12, 0xf);
 632     __ vxor(loadOrder, loadOrder, vTmp12);
 633 #define LE_swap_bytes(x) __ vec_perm(x, x, x, loadOrder)
 634 #else
 635 #define LE_swap_bytes(x)
 636 #endif
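    // Note on LE_swap_bytes (an interpretation of the setup above): temp1 still
    // holds 0xC2 << 56, whose low nibble is 0, so lvsl yields the identity
    // permute vector 0x00010203...0F. On little-endian this is XORed with 0x0F
    // in every byte, giving 0x0F0E...00, so the vec_perm in LE_swap_bytes
    // byte-reverses each loaded 16-byte block to match big-endian lane order.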
 637 
 638     // This code performs Karatsuba multiplication in Galois fields to compute the GHASH operation.
 639     //
 640     // The Karatsuba method breaks the multiplication of two 128-bit numbers into smaller parts,
 641     // performing three 128-bit multiplications and combining the results efficiently.
 642     //
 643     // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
 644     // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
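    // Note: in this implementation the middle terms A0*B1 + A1*B0 appear to be
    // obtained with a single vpmsumd on the half-swapped subkey (vSwappedH in
    // computeGCMProduct) instead of forming (A0+A1)(B0+B1) explicitly; the cost
    // is still three carry-less multiply instructions per block.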
 645     //
 646     // Inputs:
 647     // - vH:       The data vector (state), containing both B0 (lower half) and B1 (higher half).
 648     // - vLowerH:  Lower half of the subkey H (A0).
 649     // - vHigherH: Higher half of the subkey H (A1).
 650     // - vConstC2: Constant used for reduction (for final processing).
 651     //
 652     // References:
 653     // Shay Gueron, Michael E. Kounavis.
  // "Intel® Carry-Less Multiplication Instruction and its Usage for Computing the GCM Mode"
 655     // https://web.archive.org/web/20110609115824/https://software.intel.com/file/24918
 656     //
 657     Label L_aligned_loop, L_store, L_unaligned_loop, L_initialize_unaligned_loop;
 658     __ andi(temp1, data, 15);
 659     __ cmpwi(CR0, temp1, 0);
 660     __ bne(CR0, L_initialize_unaligned_loop);
 661 
 662     __ bind(L_aligned_loop);
 663       __ lvx(vH, temp1, data);
 664       LE_swap_bytes(vH);
 665       computeGCMProduct(_masm, vLowerH, vH, vHigherH, vConstC2, vZero, vState,
 666                     vLowProduct, vMidProduct, vHighProduct, vReducedLow, vTmp8, vTmp9, vCombinedResult, vSwappedH);
 667       __ addi(data, data, 16);
 668     __ bdnz(L_aligned_loop);
 669     __ b(L_store);
 670 
 671     __ bind(L_initialize_unaligned_loop);
 672     __ li(temp1, 0);
 673     __ lvsl(vPerm, temp1, data);
 674     __ lvx(vHigh, temp1, data);
 675 #ifdef VM_LITTLE_ENDIAN
 676     __ vspltisb(vTmp12, -1);
 677     __ vxor(vPerm, vPerm, vTmp12);
 678 #endif
 679     __ bind(L_unaligned_loop);
 680       __ addi(data, data, 16);
 681       __ lvx(vLow, temp1, data);
 682       __ vec_perm(vH, vHigh, vLow, vPerm);
 683       computeGCMProduct(_masm, vLowerH, vH, vHigherH, vConstC2, vZero, vState,
 684                     vLowProduct, vMidProduct, vHighProduct, vReducedLow, vTmp8, vTmp9, vCombinedResult, vSwappedH);
 685       __ vmr(vHigh, vLow);
 686     __ bdnz(L_unaligned_loop);
 687 
 688     __ bind(L_store);
 689     __ stxvd2x(vState->to_vsr(), state);
 690     __ blr();
 691 
 692     return start;
 693   }
 694   // -XX:+OptimizeFill : convert fill/copy loops into intrinsic
 695   //
  // The code was ported from SPARC because we believe it benefits JVM98; however,
  // tracing (-XX:+TraceOptimizeFill) shows that the intrinsic replacement doesn't happen at all!
 698   //
 699   // Source code in function is_range_check_if() shows that OptimizeFill relaxed the condition
 700   // for turning on loop predication optimization, and hence the behavior of "array range check"
 701   // and "loop invariant check" could be influenced, which potentially boosted JVM98.
 702   //
  // Generate stub for array fill (byte, short, or int, selected by stub_id).
  // If "aligned" is true, the "to" address is assumed to be heapword aligned.
 705   //
 706   // Arguments for generated stub:
 707   //   to:    R3_ARG1
 708   //   value: R4_ARG2
 709   //   count: R5_ARG3 treated as signed
 710   //
 711   address generate_fill(StubGenStubId stub_id) {
 712     BasicType t;
 713     bool aligned;
 714 
 715     switch (stub_id) {
 716     case jbyte_fill_id:
 717       t = T_BYTE;
 718       aligned = false;
 719       break;
 720     case jshort_fill_id:
 721       t = T_SHORT;
 722       aligned = false;
 723       break;
 724     case jint_fill_id:
 725       t = T_INT;
 726       aligned = false;
 727       break;
 728     case arrayof_jbyte_fill_id:
 729       t = T_BYTE;
 730       aligned = true;
 731       break;
 732     case arrayof_jshort_fill_id:
 733       t = T_SHORT;
 734       aligned = true;
 735       break;
 736     case arrayof_jint_fill_id:
 737       t = T_INT;
 738       aligned = true;
 739       break;
 740     default:
 741       ShouldNotReachHere();
 742     }
 743 
 744     StubCodeMark mark(this, stub_id);
 745     address start = __ function_entry();
 746 
 747     const Register to    = R3_ARG1;   // source array address
 748     const Register value = R4_ARG2;   // fill value
 749     const Register count = R5_ARG3;   // elements count
 750     const Register temp  = R6_ARG4;   // temp register
 751 
 752     //assert_clean_int(count, O3);    // Make sure 'count' is clean int.
 753 
 754     Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte;
 755     Label L_fill_2_bytes, L_fill_4_bytes, L_fill_elements, L_fill_32_bytes;
 756 
 757     int shift = -1;
 758     switch (t) {
 759        case T_BYTE:
 760         shift = 2;
 761         // Clone bytes (zero extend not needed because store instructions below ignore high order bytes).
 762         __ rldimi(value, value, 8, 48);     // 8 bit -> 16 bit
 763         __ cmpdi(CR0, count, 2<<shift);    // Short arrays (< 8 bytes) fill by element.
 764         __ blt(CR0, L_fill_elements);
 765         __ rldimi(value, value, 16, 32);    // 16 bit -> 32 bit
 766         break;
 767        case T_SHORT:
 768         shift = 1;
 769         // Clone bytes (zero extend not needed because store instructions below ignore high order bytes).
 770         __ rldimi(value, value, 16, 32);    // 16 bit -> 32 bit
 771         __ cmpdi(CR0, count, 2<<shift);    // Short arrays (< 8 bytes) fill by element.
 772         __ blt(CR0, L_fill_elements);
 773         break;
 774       case T_INT:
 775         shift = 0;
 776         __ cmpdi(CR0, count, 2<<shift);    // Short arrays (< 8 bytes) fill by element.
 777         __ blt(CR0, L_fill_4_bytes);
 778         break;
 779       default: ShouldNotReachHere();
 780     }
 781 
 782     if (!aligned && (t == T_BYTE || t == T_SHORT)) {
 783       // Align source address at 4 bytes address boundary.
 784       if (t == T_BYTE) {
 785         // One byte misalignment happens only for byte arrays.
 786         __ andi_(temp, to, 1);
 787         __ beq(CR0, L_skip_align1);
 788         __ stb(value, 0, to);
 789         __ addi(to, to, 1);
 790         __ addi(count, count, -1);
 791         __ bind(L_skip_align1);
 792       }
 793       // Two bytes misalignment happens only for byte and short (char) arrays.
 794       __ andi_(temp, to, 2);
 795       __ beq(CR0, L_skip_align2);
 796       __ sth(value, 0, to);
 797       __ addi(to, to, 2);
 798       __ addi(count, count, -(1 << (shift - 1)));
 799       __ bind(L_skip_align2);
 800     }
 801 
 802     if (!aligned) {
 803       // Align to 8 bytes, we know we are 4 byte aligned to start.
 804       __ andi_(temp, to, 7);
 805       __ beq(CR0, L_fill_32_bytes);
 806       __ stw(value, 0, to);
 807       __ addi(to, to, 4);
 808       __ addi(count, count, -(1 << shift));
 809       __ bind(L_fill_32_bytes);
 810     }
 811 
 812     __ li(temp, 8<<shift);                  // Prepare for 32 byte loop.
 813     // Clone bytes int->long as above.
 814     __ rldimi(value, value, 32, 0);         // 32 bit -> 64 bit
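    // Value replication so far, for a hypothetical fill value 0xAB (T_BYTE):
    //   0xAB -> 0xABAB -> 0xABABABAB -> 0xABABABABABABABAB,
    // so every std below stores 8 correctly replicated elements. temp holds the
    // element count per 32-byte chunk (32 byte, 16 short, or 8 int elements).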
 815 
 816     Label L_check_fill_8_bytes;
 817     // Fill 32-byte chunks.
 818     __ subf_(count, temp, count);
 819     __ blt(CR0, L_check_fill_8_bytes);
 820 
 821     Label L_fill_32_bytes_loop;
 822     __ align(32);
 823     __ bind(L_fill_32_bytes_loop);
 824 
 825     __ std(value, 0, to);
 826     __ std(value, 8, to);
 827     __ subf_(count, temp, count);           // Update count.
 828     __ std(value, 16, to);
 829     __ std(value, 24, to);
 830 
 831     __ addi(to, to, 32);
 832     __ bge(CR0, L_fill_32_bytes_loop);
 833 
 834     __ bind(L_check_fill_8_bytes);
 835     __ add_(count, temp, count);
 836     __ beq(CR0, L_exit);
 837     __ addic_(count, count, -(2 << shift));
 838     __ blt(CR0, L_fill_4_bytes);
 839 
 840     //
 841     // Length is too short, just fill 8 bytes at a time.
 842     //
 843     Label L_fill_8_bytes_loop;
 844     __ bind(L_fill_8_bytes_loop);
 845     __ std(value, 0, to);
 846     __ addic_(count, count, -(2 << shift));
 847     __ addi(to, to, 8);
 848     __ bge(CR0, L_fill_8_bytes_loop);
 849 
 850     // Fill trailing 4 bytes.
 851     __ bind(L_fill_4_bytes);
 852     __ andi_(temp, count, 1<<shift);
 853     __ beq(CR0, L_fill_2_bytes);
 854 
 855     __ stw(value, 0, to);
 856     if (t == T_BYTE || t == T_SHORT) {
 857       __ addi(to, to, 4);
 858       // Fill trailing 2 bytes.
 859       __ bind(L_fill_2_bytes);
 860       __ andi_(temp, count, 1<<(shift-1));
 861       __ beq(CR0, L_fill_byte);
 862       __ sth(value, 0, to);
 863       if (t == T_BYTE) {
 864         __ addi(to, to, 2);
 865         // Fill trailing byte.
 866         __ bind(L_fill_byte);
 867         __ andi_(count, count, 1);
 868         __ beq(CR0, L_exit);
 869         __ stb(value, 0, to);
 870       } else {
 871         __ bind(L_fill_byte);
 872       }
 873     } else {
 874       __ bind(L_fill_2_bytes);
 875     }
 876     __ bind(L_exit);
 877     __ blr();
 878 
 879     // Handle copies less than 8 bytes. Int is handled elsewhere.
 880     if (t == T_BYTE) {
 881       __ bind(L_fill_elements);
 882       Label L_fill_2, L_fill_4;
 883       __ andi_(temp, count, 1);
 884       __ beq(CR0, L_fill_2);
 885       __ stb(value, 0, to);
 886       __ addi(to, to, 1);
 887       __ bind(L_fill_2);
 888       __ andi_(temp, count, 2);
 889       __ beq(CR0, L_fill_4);
 890       __ stb(value, 0, to);
      __ stb(value, 1, to);
 892       __ addi(to, to, 2);
 893       __ bind(L_fill_4);
 894       __ andi_(temp, count, 4);
 895       __ beq(CR0, L_exit);
 896       __ stb(value, 0, to);
 897       __ stb(value, 1, to);
 898       __ stb(value, 2, to);
 899       __ stb(value, 3, to);
 900       __ blr();
 901     }
 902 
 903     if (t == T_SHORT) {
 904       Label L_fill_2;
 905       __ bind(L_fill_elements);
 906       __ andi_(temp, count, 1);
 907       __ beq(CR0, L_fill_2);
 908       __ sth(value, 0, to);
 909       __ addi(to, to, 2);
 910       __ bind(L_fill_2);
 911       __ andi_(temp, count, 2);
 912       __ beq(CR0, L_exit);
 913       __ sth(value, 0, to);
 914       __ sth(value, 2, to);
 915       __ blr();
 916     }
 917     return start;
 918   }
 919 
 920   inline void assert_positive_int(Register count) {
 921 #ifdef ASSERT
 922     __ srdi_(R0, count, 31);
 923     __ asm_assert_eq("missing zero extend");
 924 #endif
 925   }
 926 
 927   // Generate overlap test for array copy stubs.
 928   //
 929   // Input:
 930   //   R3_ARG1    -  from
 931   //   R4_ARG2    -  to
 932   //   R5_ARG3    -  element count
 933   //
 934   void array_overlap_test(address no_overlap_target, int log2_elem_size) {
 935     Register tmp1 = R6_ARG4;
 936     Register tmp2 = R7_ARG5;
 937 
 938     assert_positive_int(R5_ARG3);
 939 
 940     __ subf(tmp1, R3_ARG1, R4_ARG2); // distance in bytes
 941     __ sldi(tmp2, R5_ARG3, log2_elem_size); // size in bytes
 942     __ cmpld(CR0, R3_ARG1, R4_ARG2); // Use unsigned comparison!
 943     __ cmpld(CR1, tmp1, tmp2);
 944     __ crnand(CR0, Assembler::less, CR1, Assembler::less);
    // Overlaps if src is before dst and the distance is smaller than the size.
    // Branch to the forward copy routine otherwise (within range of 32kB).
 947     __ bc(Assembler::bcondCRbiIs1, Assembler::bi0(CR0, Assembler::less), no_overlap_target);
 948 
 949     // need to copy backwards
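    // Hypothetical example: from = 0x1000, to = 0x1008, count = 4,
    // log2_elem_size = 2  =>  distance = 8, size = 16. Since from < to and
    // distance < size, the regions overlap and we fall through to the backward
    // (conjoint) copy; otherwise we branched to the forward copy above.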
 950   }
 951 
  // This is the common error exit stub for UnsafeMemoryAccess.
 953   address generate_unsafecopy_common_error_exit() {
 954     address start_pc = __ pc();
 955     Register tmp1 = R6_ARG4;
    // The copy stub may have changed the DSCR value; reset it.
 957     __ load_const_optimized(tmp1, VM_Version::_dscr_val);
 958     __ mtdscr(tmp1);
 959     __ li(R3_RET, 0); // return 0
 960     __ blr();
 961     return start_pc;
 962   }
 963 
 964   // The guideline in the implementations of generate_disjoint_xxx_copy
 965   // (xxx=byte,short,int,long,oop) is to copy as many elements as possible with
 966   // single instructions, but to avoid alignment interrupts (see subsequent
  // comment). Furthermore, we try to minimize misaligned accesses, even
 968   // though they cause no alignment interrupt.
 969   //
 970   // In Big-Endian mode, the PowerPC architecture requires implementations to
 971   // handle automatically misaligned integer halfword and word accesses,
 972   // word-aligned integer doubleword accesses, and word-aligned floating-point
 973   // accesses. Other accesses may or may not generate an Alignment interrupt
 974   // depending on the implementation.
 975   // Alignment interrupt handling may require on the order of hundreds of cycles,
 976   // so every effort should be made to avoid misaligned memory values.
 977   //
 978   //
 979   // Generate stub for disjoint byte copy.  If "aligned" is true, the
 980   // "from" and "to" addresses are assumed to be heapword aligned.
 981   //
 982   // Arguments for generated stub:
 983   //      from:  R3_ARG1
 984   //      to:    R4_ARG2
 985   //      count: R5_ARG3 treated as signed
 986   //
 987   address generate_disjoint_byte_copy(StubGenStubId stub_id) {
 988     bool aligned;
 989     switch (stub_id) {
 990     case jbyte_disjoint_arraycopy_id:
 991       aligned = false;
 992       break;
 993     case arrayof_jbyte_disjoint_arraycopy_id:
 994       aligned = true;
 995       break;
 996     default:
 997       ShouldNotReachHere();
 998     }
 999 
1000     StubCodeMark mark(this, stub_id);
1001     address start = __ function_entry();
1002     assert_positive_int(R5_ARG3);
1003 
1004     Register tmp1 = R6_ARG4;
1005     Register tmp2 = R7_ARG5;
1006     Register tmp3 = R8_ARG6;
1007     Register tmp4 = R9_ARG7;
1008 
1009     VectorSRegister tmp_vsr1  = VSR1;
1010     VectorSRegister tmp_vsr2  = VSR2;
1011 
1012     Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9, l_10;
1013     {
1014       // UnsafeMemoryAccess page error: continue at UnsafeMemoryAccess common_error_exit
1015       UnsafeMemoryAccessMark umam(this, !aligned, false);
1016 
1017       // Don't try anything fancy if arrays don't have many elements.
1018       __ li(tmp3, 0);
1019       __ cmpwi(CR0, R5_ARG3, 17);
1020       __ ble(CR0, l_6); // copy 4 at a time
1021 
1022       if (!aligned) {
1023         __ xorr(tmp1, R3_ARG1, R4_ARG2);
1024         __ andi_(tmp1, tmp1, 3);
1025         __ bne(CR0, l_6); // If arrays don't have the same alignment mod 4, do 4 element copy.
1026 
1027         // Copy elements if necessary to align to 4 bytes.
1028         __ neg(tmp1, R3_ARG1); // Compute distance to alignment boundary.
1029         __ andi_(tmp1, tmp1, 3);
1030         __ beq(CR0, l_2);
1031 
1032         __ subf(R5_ARG3, tmp1, R5_ARG3);
1033         __ bind(l_9);
1034         __ lbz(tmp2, 0, R3_ARG1);
1035         __ addic_(tmp1, tmp1, -1);
1036         __ stb(tmp2, 0, R4_ARG2);
1037         __ addi(R3_ARG1, R3_ARG1, 1);
1038         __ addi(R4_ARG2, R4_ARG2, 1);
1039         __ bne(CR0, l_9);
1040 
1041         __ bind(l_2);
1042       }
1043 
1044       // copy 8 elements at a time
1045       __ xorr(tmp2, R3_ARG1, R4_ARG2); // skip if src & dest have differing alignment mod 8
1046       __ andi_(tmp1, tmp2, 7);
1047       __ bne(CR0, l_7); // not same alignment -> to or from is aligned -> copy 8
1048 
1049       // copy a 2-element word if necessary to align to 8 bytes
1050       __ andi_(R0, R3_ARG1, 7);
1051       __ beq(CR0, l_7);
1052 
1053       __ lwzx(tmp2, R3_ARG1, tmp3);
1054       __ addi(R5_ARG3, R5_ARG3, -4);
1055       __ stwx(tmp2, R4_ARG2, tmp3);
1056       { // FasterArrayCopy
1057         __ addi(R3_ARG1, R3_ARG1, 4);
1058         __ addi(R4_ARG2, R4_ARG2, 4);
1059       }
1060       __ bind(l_7);
1061 
1062       { // FasterArrayCopy
1063         __ cmpwi(CR0, R5_ARG3, 31);
        __ ble(CR0, l_6); // copy 4 at a time if less than 32 elements remain
1065 
1066         __ srdi(tmp1, R5_ARG3, 5);
1067         __ andi_(R5_ARG3, R5_ARG3, 31);
1068         __ mtctr(tmp1);
1069 
1070 
1071         // Prefetch the data into the L2 cache.
1072         __ dcbt(R3_ARG1, 0);
1073 
1074         // If supported set DSCR pre-fetch to deepest.
1075         __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
1076         __ mtdscr(tmp2);
1077 
1078         __ li(tmp1, 16);
1079 
        // Backbranch target aligned to 32 bytes rather than 16: the loop
        // contains fewer than 8 instructions, so with 32-byte alignment it
        // fits inside a single i-cache sector.
1083         __ align(32);
1084 
1085         __ bind(l_10);
1086         // Use loop with VSX load/store instructions to
        // copy 32 elements at a time.
1088         __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load src
1089         __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst
1090         __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src + 16
1091         __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
1092         __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32
        __ addi(R4_ARG2, R4_ARG2, 32);       // Update dst += 32
1094         __ bdnz(l_10);                       // Dec CTR and loop if not zero.
1095 
1096         // Restore DSCR pre-fetch value.
1097         __ load_const_optimized(tmp2, VM_Version::_dscr_val);
1098         __ mtdscr(tmp2);
1099 
1100      } // FasterArrayCopy
1101 
1102       __ bind(l_6);
1103 
1104       // copy 4 elements at a time
1105       __ cmpwi(CR0, R5_ARG3, 4);
1106       __ blt(CR0, l_1);
1107       __ srdi(tmp1, R5_ARG3, 2);
1108       __ mtctr(tmp1); // is > 0
1109       __ andi_(R5_ARG3, R5_ARG3, 3);
1110 
1111       { // FasterArrayCopy
1112         __ addi(R3_ARG1, R3_ARG1, -4);
1113         __ addi(R4_ARG2, R4_ARG2, -4);
1114         __ bind(l_3);
1115         __ lwzu(tmp2, 4, R3_ARG1);
1116         __ stwu(tmp2, 4, R4_ARG2);
1117         __ bdnz(l_3);
1118         __ addi(R3_ARG1, R3_ARG1, 4);
1119         __ addi(R4_ARG2, R4_ARG2, 4);
1120       }
1121 
1122       // do single element copy
1123       __ bind(l_1);
1124       __ cmpwi(CR0, R5_ARG3, 0);
1125       __ beq(CR0, l_4);
1126 
1127       { // FasterArrayCopy
1128         __ mtctr(R5_ARG3);
1129         __ addi(R3_ARG1, R3_ARG1, -1);
1130         __ addi(R4_ARG2, R4_ARG2, -1);
1131 
1132         __ bind(l_5);
1133         __ lbzu(tmp2, 1, R3_ARG1);
1134         __ stbu(tmp2, 1, R4_ARG2);
1135         __ bdnz(l_5);
1136       }
1137     }
1138 
1139     __ bind(l_4);
1140     __ li(R3_RET, 0); // return 0
1141     __ blr();
1142 
1143     return start;
1144   }
1145 
1146   // Generate stub for conjoint byte copy.  If "aligned" is true, the
1147   // "from" and "to" addresses are assumed to be heapword aligned.
1148   //
1149   // Arguments for generated stub:
1150   //      from:  R3_ARG1
1151   //      to:    R4_ARG2
1152   //      count: R5_ARG3 treated as signed
1153   //
1154   address generate_conjoint_byte_copy(StubGenStubId stub_id) {
1155     bool aligned;
1156     switch (stub_id) {
1157     case jbyte_arraycopy_id:
1158       aligned = false;
1159       break;
1160     case arrayof_jbyte_arraycopy_id:
1161       aligned = true;
1162       break;
1163     default:
1164       ShouldNotReachHere();
1165     }
1166 
1167     StubCodeMark mark(this, stub_id);
1168     address start = __ function_entry();
1169     assert_positive_int(R5_ARG3);
1170 
1171     Register tmp1 = R6_ARG4;
1172     Register tmp2 = R7_ARG5;
1173     Register tmp3 = R8_ARG6;
1174 
1175     address nooverlap_target = aligned ?
1176       STUB_ENTRY(arrayof_jbyte_disjoint_arraycopy()) :
1177       STUB_ENTRY(jbyte_disjoint_arraycopy());
1178 
1179     array_overlap_test(nooverlap_target, 0);
1180     // Do reverse copy. We assume the case of actual overlap is rare enough
1181     // that we don't have to optimize it.
1182     Label l_1, l_2;
1183     {
1184       // UnsafeMemoryAccess page error: continue at UnsafeMemoryAccess common_error_exit
1185       UnsafeMemoryAccessMark umam(this, !aligned, false);
1186       __ b(l_2);
1187       __ bind(l_1);
1188       __ stbx(tmp1, R4_ARG2, R5_ARG3);
1189       __ bind(l_2);
1190       __ addic_(R5_ARG3, R5_ARG3, -1);
1191       __ lbzx(tmp1, R3_ARG1, R5_ARG3);
1192       __ bge(CR0, l_1);
1193     }
1194     __ li(R3_RET, 0); // return 0
1195     __ blr();
1196 
1197     return start;
1198   }
1199 
1200   // Generate stub for disjoint short copy.  If "aligned" is true, the
1201   // "from" and "to" addresses are assumed to be heapword aligned.
1202   //
1203   // Arguments for generated stub:
1204   //      from:  R3_ARG1
1205   //      to:    R4_ARG2
1206   //  elm.count: R5_ARG3 treated as signed
1207   //
1208   // Strategy for aligned==true:
1209   //
1210   //  If length <= 9:
1211   //     1. copy 2 elements at a time (l_6)
1212   //     2. copy last element if original element count was odd (l_1)
1213   //
1214   //  If length > 9:
1215   //     1. copy 4 elements at a time until less than 4 elements are left (l_7)
1216   //     2. copy 2 elements at a time until less than 2 elements are left (l_6)
1217   //     3. copy last element if one was left in step 2. (l_1)
1218   //
1219   //
1220   // Strategy for aligned==false:
1221   //
1222   //  If length <= 9: same as aligned==true case, but NOTE: load/stores
1223   //                  can be unaligned (see comment below)
1224   //
1225   //  If length > 9:
1226   //     1. continue with step 6. if the alignment of from and to mod 4
1227   //        is different.
1228   //     2. align from and to to 4 bytes by copying 1 element if necessary
1229   //     3. at l_2 from and to are 4 byte aligned; continue with
1230   //        5. if they cannot be aligned to 8 bytes because they have
1231   //        got different alignment mod 8.
1232   //     4. at this point we know that both, from and to, have the same
1233   //        alignment mod 8, now copy one element if necessary to get
1234   //        8 byte alignment of from and to.
1235   //     5. copy 4 elements at a time until less than 4 elements are
1236   //        left; depending on step 3. all load/stores are aligned or
1237   //        either all loads or all stores are unaligned.
1238   //     6. copy 2 elements at a time until less than 2 elements are
1239   //        left (l_6); arriving here from step 1., there is a chance
1240   //        that all accesses are unaligned.
1241   //     7. copy last element if one was left in step 6. (l_1)
1242   //
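  //  Worked example of the aligned==false strategy (hypothetical addresses):
  //  from = 0x1002, to = 0x2002, 20 elements:
  //    - (from ^ to) & 3 == 0: same alignment mod 4, so no bail-out to step 6.
  //    - from & 3 == 2: copy 1 element to align -> from = 0x1004, 19 left.
  //    - (from ^ to) & 7 == 0, from & 7 == 4: copy one 2-element word
  //      -> from = 0x1008, 17 left; from and to are now 8-byte aligned.
  //    - the bulk is copied with aligned wide accesses, then 2 elements at a
  //      time, then the last element.
  //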
1243   //  There are unaligned data accesses using integer load/store
1244   //  instructions in this stub. POWER allows such accesses.
1245   //
1246   //  According to the manuals (PowerISA_V2.06_PUBLIC, Book II,
1247   //  Chapter 2: Effect of Operand Placement on Performance) unaligned
1248   //  integer load/stores have good performance. Only unaligned
1249   //  floating point load/stores can have poor performance.
1250   //
1251   //  TODO:
1252   //
1253   //  1. check if aligning the backbranch target of loops is beneficial
1254   //
1255   address generate_disjoint_short_copy(StubGenStubId stub_id) {
1256     bool aligned;
1257     switch (stub_id) {
1258     case jshort_disjoint_arraycopy_id:
1259       aligned = false;
1260       break;
1261     case arrayof_jshort_disjoint_arraycopy_id:
1262       aligned = true;
1263       break;
1264     default:
1265       ShouldNotReachHere();
1266     }
1267 
1268     StubCodeMark mark(this, stub_id);
1269 
1270     Register tmp1 = R6_ARG4;
1271     Register tmp2 = R7_ARG5;
1272     Register tmp3 = R8_ARG6;
1273     Register tmp4 = R9_ARG7;
1274 
1275     VectorSRegister tmp_vsr1  = VSR1;
1276     VectorSRegister tmp_vsr2  = VSR2;
1277 
1278     address start = __ function_entry();
1279     assert_positive_int(R5_ARG3);
1280 
1281     Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9;
1282     {
1283       // UnsafeMemoryAccess page error: continue at UnsafeMemoryAccess common_error_exit
1284       UnsafeMemoryAccessMark umam(this, !aligned, false);
1285       // don't try anything fancy if arrays don't have many elements
1286       __ li(tmp3, 0);
1287       __ cmpwi(CR0, R5_ARG3, 9);
1288       __ ble(CR0, l_6); // copy 2 at a time
1289 
1290       if (!aligned) {
1291         __ xorr(tmp1, R3_ARG1, R4_ARG2);
1292         __ andi_(tmp1, tmp1, 3);
1293         __ bne(CR0, l_6); // if arrays don't have the same alignment mod 4, do 2 element copy
1294 
1295         // At this point it is guaranteed that both, from and to have the same alignment mod 4.
1296 
1297         // Copy 1 element if necessary to align to 4 bytes.
1298         __ andi_(tmp1, R3_ARG1, 3);
1299         __ beq(CR0, l_2);
1300 
1301         __ lhz(tmp2, 0, R3_ARG1);
1302         __ addi(R3_ARG1, R3_ARG1, 2);
1303         __ sth(tmp2, 0, R4_ARG2);
1304         __ addi(R4_ARG2, R4_ARG2, 2);
1305         __ addi(R5_ARG3, R5_ARG3, -1);
1306         __ bind(l_2);
1307 
1308         // At this point the positions of both, from and to, are at least 4 byte aligned.
1309 
1310         // Copy 4 elements at a time.
1311         // Align to 8 bytes, but only if both, from and to, have same alignment mod 8.
1312         __ xorr(tmp2, R3_ARG1, R4_ARG2);
1313         __ andi_(tmp1, tmp2, 7);
1314         __ bne(CR0, l_7); // not same alignment mod 8 -> copy 4, either from or to will be unaligned
1315 
1316         // Copy a 2-element word if necessary to align to 8 bytes.
1317         __ andi_(R0, R3_ARG1, 7);
1318         __ beq(CR0, l_7);
1319 
1320         __ lwzx(tmp2, R3_ARG1, tmp3);
1321         __ addi(R5_ARG3, R5_ARG3, -2);
1322         __ stwx(tmp2, R4_ARG2, tmp3);
1323         { // FasterArrayCopy
1324           __ addi(R3_ARG1, R3_ARG1, 4);
1325           __ addi(R4_ARG2, R4_ARG2, 4);
1326         }
1327       }
1328 
1329       __ bind(l_7);
1330 
1331       // Copy 4 elements at a time; either the loads or the stores can
1332       // be unaligned if aligned == false.
1333 
1334       { // FasterArrayCopy
1335         __ cmpwi(CR0, R5_ARG3, 15);
1336         __ ble(CR0, l_6); // copy 2 at a time if less than 16 elements remain
1337 
1338         __ srdi(tmp1, R5_ARG3, 4);
1339         __ andi_(R5_ARG3, R5_ARG3, 15);
1340         __ mtctr(tmp1);
1341 
1342 
1343         // Processor supports VSX, so use it to mass copy.
1344 
1345           // Prefetch src data into L2 cache.
1346           __ dcbt(R3_ARG1, 0);
1347 
1348           // If supported set DSCR pre-fetch to deepest.
1349           __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
1350           __ mtdscr(tmp2);
1351           __ li(tmp1, 16);
1352 
          // Backbranch target aligned to 32 bytes rather than 16: the loop
          // contains fewer than 8 instructions, so with 32-byte alignment it
          // fits inside a single i-cache sector.
1356           __ align(32);
1357 
1358           __ bind(l_9);
1359           // Use loop with VSX load/store instructions to
          // copy 16 elements at a time.
1361           __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load from src.
1362           __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst.
1363           __ lxvd2x(tmp_vsr2, R3_ARG1, tmp1);  // Load from src + 16.
1364           __ stxvd2x(tmp_vsr2, R4_ARG2, tmp1); // Store to dst + 16.
1365           __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32.
          __ addi(R4_ARG2, R4_ARG2, 32);       // Update dst += 32.
1367           __ bdnz(l_9);                        // Dec CTR and loop if not zero.
1368 
1369           // Restore DSCR pre-fetch value.
1370           __ load_const_optimized(tmp2, VM_Version::_dscr_val);
1371           __ mtdscr(tmp2);
1372       } // FasterArrayCopy
1373       __ bind(l_6);
1374 
1375       // copy 2 elements at a time
1376       { // FasterArrayCopy
1377         __ cmpwi(CR0, R5_ARG3, 2);
1378         __ blt(CR0, l_1);
1379         __ srdi(tmp1, R5_ARG3, 1);
1380         __ andi_(R5_ARG3, R5_ARG3, 1);
1381 
1382         __ addi(R3_ARG1, R3_ARG1, -4);
1383         __ addi(R4_ARG2, R4_ARG2, -4);
1384         __ mtctr(tmp1);
1385 
1386         __ bind(l_3);
1387         __ lwzu(tmp2, 4, R3_ARG1);
1388         __ stwu(tmp2, 4, R4_ARG2);
1389         __ bdnz(l_3);
1390 
1391         __ addi(R3_ARG1, R3_ARG1, 4);
1392         __ addi(R4_ARG2, R4_ARG2, 4);
1393       }
1394 
1395       // do single element copy
1396       __ bind(l_1);
1397       __ cmpwi(CR0, R5_ARG3, 0);
1398       __ beq(CR0, l_4);
1399 
1400       { // FasterArrayCopy
1401         __ mtctr(R5_ARG3);
1402         __ addi(R3_ARG1, R3_ARG1, -2);
1403         __ addi(R4_ARG2, R4_ARG2, -2);
1404 
1405         __ bind(l_5);
1406         __ lhzu(tmp2, 2, R3_ARG1);
1407         __ sthu(tmp2, 2, R4_ARG2);
1408         __ bdnz(l_5);
1409       }
1410     }
1411 
1412     __ bind(l_4);
1413     __ li(R3_RET, 0); // return 0
1414     __ blr();
1415 
1416     return start;
1417   }
1418 
1419   // Generate stub for conjoint short copy.  If "aligned" is true, the
1420   // "from" and "to" addresses are assumed to be heapword aligned.
1421   //
1422   // Arguments for generated stub:
1423   //      from:  R3_ARG1
1424   //      to:    R4_ARG2
1425   //      count: R5_ARG3 treated as signed
1426   //
1427   address generate_conjoint_short_copy(StubGenStubId stub_id) {
1428     bool aligned;
1429     switch (stub_id) {
1430     case jshort_arraycopy_id:
1431       aligned = false;
1432       break;
1433     case arrayof_jshort_arraycopy_id:
1434       aligned = true;
1435       break;
1436     default:
1437       ShouldNotReachHere();
1438     }
1439 
1440     StubCodeMark mark(this, stub_id);
1441     address start = __ function_entry();
1442     assert_positive_int(R5_ARG3);
1443 
1444     Register tmp1 = R6_ARG4;
1445     Register tmp2 = R7_ARG5;
1446     Register tmp3 = R8_ARG6;
1447 
1448     address nooverlap_target = aligned ?
1449       STUB_ENTRY(arrayof_jshort_disjoint_arraycopy()) :
1450       STUB_ENTRY(jshort_disjoint_arraycopy());
1451 
1452     array_overlap_test(nooverlap_target, 1);
1453 
1454     Label l_1, l_2;
1455     {
1456       // UnsafeMemoryAccess page error: continue at UnsafeMemoryAccess common_error_exit
1457       UnsafeMemoryAccessMark umam(this, !aligned, false);
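      // Copy backwards, one halfword per iteration, to be safe when the
      // regions overlap: each pass decrements the byte offset in tmp1,
      // loads src[tmp1] and, while the offset is still non-negative,
      // branches back to store the loaded element to dst[tmp1].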
1458       __ sldi(tmp1, R5_ARG3, 1);
1459       __ b(l_2);
1460       __ bind(l_1);
1461       __ sthx(tmp2, R4_ARG2, tmp1);
1462       __ bind(l_2);
1463       __ addic_(tmp1, tmp1, -2);
1464       __ lhzx(tmp2, R3_ARG1, tmp1);
1465       __ bge(CR0, l_1);
1466     }
1467     __ li(R3_RET, 0); // return 0
1468     __ blr();
1469 
1470     return start;
1471   }
1472 
1473   // Generate core code for disjoint int copy (and oop copy on 32-bit).  If "aligned"
1474   // is true, the "from" and "to" addresses are assumed to be heapword aligned.
1475   //
1476   // Arguments:
1477   //      from:  R3_ARG1
1478   //      to:    R4_ARG2
1479   //      count: R5_ARG3 treated as signed
1480   //
1481   void generate_disjoint_int_copy_core(bool aligned) {
1482     Register tmp1 = R6_ARG4;
1483     Register tmp2 = R7_ARG5;
1484     Register tmp3 = R8_ARG6;
1485     Register tmp4 = R0;
1486 
1487     VectorSRegister tmp_vsr1  = VSR1;
1488     VectorSRegister tmp_vsr2  = VSR2;
1489 
1490     Label l_1, l_2, l_3, l_4, l_5, l_6, l_7;
1491 
1492     // for short arrays, just do single element copy
1493     __ li(tmp3, 0);
1494     __ cmpwi(CR0, R5_ARG3, 5);
1495     __ ble(CR0, l_2);
1496 
    if (!aligned) {
      // Check if arrays have the same alignment mod 8.
      __ xorr(tmp1, R3_ARG1, R4_ARG2);
      __ andi_(R0, tmp1, 7);
      // Different alignment mod 8; the copy loops below tolerate this,
      // so skip the alignment step.
      __ bne(CR0, l_4);

      // Copy 1 element to align 'to' and 'from' on an 8 byte boundary.
      __ andi_(R0, R3_ARG1, 7);
      __ beq(CR0, l_4);

      __ lwzx(tmp2, R3_ARG1, tmp3);
      __ addi(R5_ARG3, R5_ARG3, -1);
      __ stwx(tmp2, R4_ARG2, tmp3);
      { // FasterArrayCopy
        __ addi(R3_ARG1, R3_ARG1, 4);
        __ addi(R4_ARG2, R4_ARG2, 4);
      }
      __ bind(l_4);
    }
1517 
1518     { // FasterArrayCopy
1519       __ cmpwi(CR0, R5_ARG3, 7);
1520       __ ble(CR0, l_2); // copy 1 at a time if less than 8 elements remain
1521 
1522       __ srdi(tmp1, R5_ARG3, 3);
1523       __ andi_(R5_ARG3, R5_ARG3, 7);
1524       __ mtctr(tmp1);
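      // CTR now holds count / 8, the number of iterations of the 32-byte VSX
      // loop below; the low 3 bits of the count remain in R5_ARG3 for the
      // single-element tail loop at l_2.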
1525 
      // Processor supports VSX, so use it to mass copy.

      // Prefetch the data into the L2 cache.
      __ dcbt(R3_ARG1, 0);

      // Set DSCR pre-fetch to deepest.
      __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
      __ mtdscr(tmp2);

      __ li(tmp1, 16);

      // Align the backbranch target to 32 bytes rather than 16: the loop
      // has fewer than 8 instructions, so it fits in a single i-cache sector.
      __ align(32);

      __ bind(l_7);
      // Use a loop with VSX load/store instructions to
      // copy 8 elements at a time.
      __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load src
      __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst
      __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src + 16
      __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
      __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32
      __ addi(R4_ARG2, R4_ARG2, 32);       // Update dst+=32
      __ bdnz(l_7);                        // Dec CTR and loop if not zero.

      // Restore DSCR pre-fetch value.
      __ load_const_optimized(tmp2, VM_Version::_dscr_val);
      __ mtdscr(tmp2);
    } // FasterArrayCopy
1559 
1560     // copy 1 element at a time
1561     __ bind(l_2);
1562     __ cmpwi(CR0, R5_ARG3, 0);
1563     __ beq(CR0, l_1);
1564 
1565     { // FasterArrayCopy
1566       __ mtctr(R5_ARG3);
1567       __ addi(R3_ARG1, R3_ARG1, -4);
1568       __ addi(R4_ARG2, R4_ARG2, -4);
1569 
1570       __ bind(l_3);
1571       __ lwzu(tmp2, 4, R3_ARG1);
1572       __ stwu(tmp2, 4, R4_ARG2);
1573       __ bdnz(l_3);
1574     }
1575 
1576     __ bind(l_1);
1577     return;
1578   }
1579 
1580   // Generate stub for disjoint int copy.  If "aligned" is true, the
1581   // "from" and "to" addresses are assumed to be heapword aligned.
1582   //
1583   // Arguments for generated stub:
1584   //      from:  R3_ARG1
1585   //      to:    R4_ARG2
1586   //      count: R5_ARG3 treated as signed
1587   //
1588   address generate_disjoint_int_copy(StubGenStubId stub_id) {
1589     bool aligned;
1590     switch (stub_id) {
1591     case jint_disjoint_arraycopy_id:
1592       aligned = false;
1593       break;
1594     case arrayof_jint_disjoint_arraycopy_id:
1595       aligned = true;
1596       break;
1597     default:
1598       ShouldNotReachHere();
1599     }
1600 
1601     StubCodeMark mark(this, stub_id);
1602     address start = __ function_entry();
1603     assert_positive_int(R5_ARG3);
1604     {
1605       // UnsafeMemoryAccess page error: continue at UnsafeMemoryAccess common_error_exit
1606       UnsafeMemoryAccessMark umam(this, !aligned, false);
1607       generate_disjoint_int_copy_core(aligned);
1608     }
1609     __ li(R3_RET, 0); // return 0
1610     __ blr();
1611     return start;
1612   }
1613 
1614   // Generate core code for conjoint int copy (and oop copy on
1615   // 32-bit).  If "aligned" is true, the "from" and "to" addresses
1616   // are assumed to be heapword aligned.
1617   //
1618   // Arguments:
1619   //      from:  R3_ARG1
1620   //      to:    R4_ARG2
1621   //      count: R5_ARG3 treated as signed
1622   //
1623   void generate_conjoint_int_copy_core(bool aligned) {
1624     // Do reverse copy.  We assume the case of actual overlap is rare enough
1625     // that we don't have to optimize it.
1626 
1627     Label l_1, l_2, l_3, l_4, l_5, l_6, l_7;
1628 
1629     Register tmp1 = R6_ARG4;
1630     Register tmp2 = R7_ARG5;
1631     Register tmp3 = R8_ARG6;
1632     Register tmp4 = R0;
1633 
1634     VectorSRegister tmp_vsr1  = VSR1;
1635     VectorSRegister tmp_vsr2  = VSR2;
1636 
1637     { // FasterArrayCopy
1638       __ cmpwi(CR0, R5_ARG3, 0);
1639       __ beq(CR0, l_6);
1640 
1641       __ sldi(R5_ARG3, R5_ARG3, 2);
1642       __ add(R3_ARG1, R3_ARG1, R5_ARG3);
1643       __ add(R4_ARG2, R4_ARG2, R5_ARG3);
1644       __ srdi(R5_ARG3, R5_ARG3, 2);
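      // src and dst now point just past the last element so the copy can run
      // backwards; R5_ARG3 holds the element count again.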
1645 
1646       if (!aligned) {
1647         // check if arrays have same alignment mod 8.
1648         __ xorr(tmp1, R3_ARG1, R4_ARG2);
1649         __ andi_(R0, tmp1, 7);
        // Different alignment mod 8; the copy loops below tolerate this,
        // so skip the alignment step.
        __ bne(CR0, l_7);
1652 
1653         // copy 1 element to align to and from on an 8 byte boundary
1654         __ andi_(R0, R3_ARG1, 7);
1655         __ beq(CR0, l_7);
1656 
1657         __ addi(R3_ARG1, R3_ARG1, -4);
1658         __ addi(R4_ARG2, R4_ARG2, -4);
1659         __ addi(R5_ARG3, R5_ARG3, -1);
1660         __ lwzx(tmp2, R3_ARG1);
1661         __ stwx(tmp2, R4_ARG2);
1662         __ bind(l_7);
1663       }
1664 
1665       __ cmpwi(CR0, R5_ARG3, 7);
1666       __ ble(CR0, l_5); // copy 1 at a time if less than 8 elements remain
1667 
1668       __ srdi(tmp1, R5_ARG3, 3);
1669       __ andi(R5_ARG3, R5_ARG3, 7);
1670       __ mtctr(tmp1);
1671 
1672       // Processor supports VSX, so use it to mass copy.
1673       // Prefetch the data into the L2 cache.
1674       __ dcbt(R3_ARG1, 0);
1675 
1676       // Set DSCR pre-fetch to deepest.
1677       __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
1678       __ mtdscr(tmp2);
1679 
1680       __ li(tmp1, 16);
1681 
      // Align the backbranch target to 32 bytes rather than 16: the loop
      // has fewer than 8 instructions, so it fits in a single i-cache sector.
1685       __ align(32);
1686 
1687       __ bind(l_4);
      // Use a loop with VSX load/store instructions to
      // copy 8 elements at a time.
      __ addi(R3_ARG1, R3_ARG1, -32);      // Update src-=32
      __ addi(R4_ARG2, R4_ARG2, -32);      // Update dst-=32
1692       __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src+16
1693       __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load src
1694       __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst+16
1695       __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst
1696       __ bdnz(l_4);
1697 
1698       // Restore DSCR pre-fetch value.
1699       __ load_const_optimized(tmp2, VM_Version::_dscr_val);
1700       __ mtdscr(tmp2);
1701 
1702       __ cmpwi(CR0, R5_ARG3, 0);
1703       __ beq(CR0, l_6);
1704 
1705       __ bind(l_5);
1706       __ mtctr(R5_ARG3);
1707       __ bind(l_3);
1708       __ lwz(R0, -4, R3_ARG1);
1709       __ stw(R0, -4, R4_ARG2);
1710       __ addi(R3_ARG1, R3_ARG1, -4);
1711       __ addi(R4_ARG2, R4_ARG2, -4);
1712       __ bdnz(l_3);
1713 
1714       __ bind(l_6);
1715     }
1716   }
1717 
1718   // Generate stub for conjoint int copy.  If "aligned" is true, the
1719   // "from" and "to" addresses are assumed to be heapword aligned.
1720   //
1721   // Arguments for generated stub:
1722   //      from:  R3_ARG1
1723   //      to:    R4_ARG2
1724   //      count: R5_ARG3 treated as signed
1725   //
1726   address generate_conjoint_int_copy(StubGenStubId stub_id) {
1727     bool aligned;
1728     switch (stub_id) {
1729     case jint_arraycopy_id:
1730       aligned = false;
1731       break;
1732     case arrayof_jint_arraycopy_id:
1733       aligned = true;
1734       break;
1735     default:
1736       ShouldNotReachHere();
1737     }
1738 
1739     StubCodeMark mark(this, stub_id);
1740     address start = __ function_entry();
1741     assert_positive_int(R5_ARG3);
1742     address nooverlap_target = aligned ?
1743       STUB_ENTRY(arrayof_jint_disjoint_arraycopy()) :
1744       STUB_ENTRY(jint_disjoint_arraycopy());
1745 
1746     array_overlap_test(nooverlap_target, 2);
1747     {
1748       // UnsafeMemoryAccess page error: continue at UnsafeMemoryAccess common_error_exit
1749       UnsafeMemoryAccessMark umam(this, !aligned, false);
1750       generate_conjoint_int_copy_core(aligned);
1751     }
1752 
1753     __ li(R3_RET, 0); // return 0
1754     __ blr();
1755 
1756     return start;
1757   }
1758 
1759   // Generate core code for disjoint long copy (and oop copy on
1760   // 64-bit).  If "aligned" is true, the "from" and "to" addresses
1761   // are assumed to be heapword aligned.
1762   //
1763   // Arguments:
1764   //      from:  R3_ARG1
1765   //      to:    R4_ARG2
1766   //      count: R5_ARG3 treated as signed
1767   //
1768   void generate_disjoint_long_copy_core(bool aligned) {
1769     Register tmp1 = R6_ARG4;
1770     Register tmp2 = R7_ARG5;
1771     Register tmp3 = R8_ARG6;
1772     Register tmp4 = R0;
1773 
1774     Label l_1, l_2, l_3, l_4, l_5;
1775 
1776     VectorSRegister tmp_vsr1  = VSR1;
1777     VectorSRegister tmp_vsr2  = VSR2;
1778 
1779     { // FasterArrayCopy
1780       __ cmpwi(CR0, R5_ARG3, 3);
1781       __ ble(CR0, l_3); // copy 1 at a time if less than 4 elements remain
1782 
1783       __ srdi(tmp1, R5_ARG3, 2);
1784       __ andi_(R5_ARG3, R5_ARG3, 3);
1785       __ mtctr(tmp1);
1786 
1787       // Processor supports VSX, so use it to mass copy.
1788 
1789       // Prefetch the data into the L2 cache.
1790       __ dcbt(R3_ARG1, 0);
1791 
1792       // Set DSCR pre-fetch to deepest.
1793       __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
1794       __ mtdscr(tmp2);
1795 
1796       __ li(tmp1, 16);
1797 
      // Align the backbranch target to 32 bytes rather than 16: the loop
      // has fewer than 8 instructions, so it fits in a single i-cache sector.
1801       __ align(32);
1802 
1803       __ bind(l_5);
      // Use a loop with VSX load/store instructions to
      // copy 4 elements at a time.
1806       __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load src
1807       __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst
1808       __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src + 16
1809       __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
1810       __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32
      __ addi(R4_ARG2, R4_ARG2, 32);       // Update dst+=32
1812       __ bdnz(l_5);                        // Dec CTR and loop if not zero.
1813 
1814       // Restore DSCR pre-fetch value.
1815       __ load_const_optimized(tmp2, VM_Version::_dscr_val);
1816       __ mtdscr(tmp2);
1817 
    } // FasterArrayCopy
1819 
1820     // copy 1 element at a time
1821     __ bind(l_3);
1822     __ cmpwi(CR0, R5_ARG3, 0);
1823     __ beq(CR0, l_1);
1824 
1825     { // FasterArrayCopy
1826       __ mtctr(R5_ARG3);
1827       __ addi(R3_ARG1, R3_ARG1, -8);
1828       __ addi(R4_ARG2, R4_ARG2, -8);
1829 
1830       __ bind(l_2);
1831       __ ldu(R0, 8, R3_ARG1);
1832       __ stdu(R0, 8, R4_ARG2);
1833       __ bdnz(l_2);
1834 
1835     }
1836     __ bind(l_1);
1837   }
1838 
1839   // Generate stub for disjoint long copy.  If "aligned" is true, the
1840   // "from" and "to" addresses are assumed to be heapword aligned.
1841   //
1842   // Arguments for generated stub:
1843   //      from:  R3_ARG1
1844   //      to:    R4_ARG2
1845   //      count: R5_ARG3 treated as signed
1846   //
1847   address generate_disjoint_long_copy(StubGenStubId stub_id) {
1848     bool aligned;
1849     switch (stub_id) {
1850     case jlong_disjoint_arraycopy_id:
1851       aligned = false;
1852       break;
1853     case arrayof_jlong_disjoint_arraycopy_id:
1854       aligned = true;
1855       break;
1856     default:
1857       ShouldNotReachHere();
1858     }
1859 
1860     StubCodeMark mark(this, stub_id);
1861     address start = __ function_entry();
1862     assert_positive_int(R5_ARG3);
1863     {
1864       // UnsafeMemoryAccess page error: continue at UnsafeMemoryAccess common_error_exit
1865       UnsafeMemoryAccessMark umam(this, !aligned, false);
1866       generate_disjoint_long_copy_core(aligned);
1867     }
1868     __ li(R3_RET, 0); // return 0
1869     __ blr();
1870 
    return start;
1872   }
1873 
1874   // Generate core code for conjoint long copy (and oop copy on
1875   // 64-bit).  If "aligned" is true, the "from" and "to" addresses
1876   // are assumed to be heapword aligned.
1877   //
1878   // Arguments:
1879   //      from:  R3_ARG1
1880   //      to:    R4_ARG2
1881   //      count: R5_ARG3 treated as signed
1882   //
1883   void generate_conjoint_long_copy_core(bool aligned) {
1884     Register tmp1 = R6_ARG4;
1885     Register tmp2 = R7_ARG5;
1886     Register tmp3 = R8_ARG6;
1887     Register tmp4 = R0;
1888 
1889     VectorSRegister tmp_vsr1  = VSR1;
1890     VectorSRegister tmp_vsr2  = VSR2;
1891 
1892     Label l_1, l_2, l_3, l_4, l_5;
1893 
1894     __ cmpwi(CR0, R5_ARG3, 0);
1895     __ beq(CR0, l_1);
1896 
1897     { // FasterArrayCopy
1898       __ sldi(R5_ARG3, R5_ARG3, 3);
1899       __ add(R3_ARG1, R3_ARG1, R5_ARG3);
1900       __ add(R4_ARG2, R4_ARG2, R5_ARG3);
1901       __ srdi(R5_ARG3, R5_ARG3, 3);
1902 
1903       __ cmpwi(CR0, R5_ARG3, 3);
1904       __ ble(CR0, l_5); // copy 1 at a time if less than 4 elements remain
1905 
1906       __ srdi(tmp1, R5_ARG3, 2);
1907       __ andi(R5_ARG3, R5_ARG3, 3);
1908       __ mtctr(tmp1);
1909 
1910       // Processor supports VSX, so use it to mass copy.
1911       // Prefetch the data into the L2 cache.
1912       __ dcbt(R3_ARG1, 0);
1913 
1914       // Set DSCR pre-fetch to deepest.
1915       __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
1916       __ mtdscr(tmp2);
1917 
1918       __ li(tmp1, 16);
1919 
      // Align the backbranch target to 32 bytes rather than 16: the loop
      // has fewer than 8 instructions, so it fits in a single i-cache sector.
1923       __ align(32);
1924 
1925       __ bind(l_4);
      // Use a loop with VSX load/store instructions to
      // copy 4 elements at a time.
      __ addi(R3_ARG1, R3_ARG1, -32);      // Update src-=32
      __ addi(R4_ARG2, R4_ARG2, -32);      // Update dst-=32
1930       __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src+16
1931       __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load src
1932       __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst+16
1933       __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst
1934       __ bdnz(l_4);
1935 
1936       // Restore DSCR pre-fetch value.
1937       __ load_const_optimized(tmp2, VM_Version::_dscr_val);
1938       __ mtdscr(tmp2);
1939 
1940       __ cmpwi(CR0, R5_ARG3, 0);
1941       __ beq(CR0, l_1);
1942 
1943       __ bind(l_5);
1944       __ mtctr(R5_ARG3);
1945       __ bind(l_3);
1946       __ ld(R0, -8, R3_ARG1);
1947       __ std(R0, -8, R4_ARG2);
1948       __ addi(R3_ARG1, R3_ARG1, -8);
1949       __ addi(R4_ARG2, R4_ARG2, -8);
1950       __ bdnz(l_3);
1951 
1952     }
1953     __ bind(l_1);
1954   }
1955 
1956   // Generate stub for conjoint long copy.  If "aligned" is true, the
1957   // "from" and "to" addresses are assumed to be heapword aligned.
1958   //
1959   // Arguments for generated stub:
1960   //      from:  R3_ARG1
1961   //      to:    R4_ARG2
1962   //      count: R5_ARG3 treated as signed
1963   //
1964   address generate_conjoint_long_copy(StubGenStubId stub_id) {
1965     bool aligned;
1966     switch (stub_id) {
1967     case jlong_arraycopy_id:
1968       aligned = false;
1969       break;
1970     case arrayof_jlong_arraycopy_id:
1971       aligned = true;
1972       break;
1973     default:
1974       ShouldNotReachHere();
1975     }
1976 
1977     StubCodeMark mark(this, stub_id);
1978     address start = __ function_entry();
1979     assert_positive_int(R5_ARG3);
1980     address nooverlap_target = aligned ?
1981       STUB_ENTRY(arrayof_jlong_disjoint_arraycopy()) :
1982       STUB_ENTRY(jlong_disjoint_arraycopy());
1983 
1984     array_overlap_test(nooverlap_target, 3);
1985     {
1986       // UnsafeMemoryAccess page error: continue at UnsafeMemoryAccess common_error_exit
1987       UnsafeMemoryAccessMark umam(this, !aligned, false);
1988       generate_conjoint_long_copy_core(aligned);
1989     }
1990     __ li(R3_RET, 0); // return 0
1991     __ blr();
1992 
1993     return start;
1994   }
1995 
1996   // Generate stub for conjoint oop copy.  If "aligned" is true, the
1997   // "from" and "to" addresses are assumed to be heapword aligned.
1998   //
1999   // Arguments for generated stub:
2000   //      from:  R3_ARG1
2001   //      to:    R4_ARG2
2002   //      count: R5_ARG3 treated as signed
2003   //      dest_uninitialized: G1 support
2004   //
2005   address generate_conjoint_oop_copy(StubGenStubId stub_id) {
2006     bool aligned;
2007     bool dest_uninitialized;
2008     switch (stub_id) {
2009     case oop_arraycopy_id:
2010       aligned = false;
2011       dest_uninitialized = false;
2012       break;
2013     case arrayof_oop_arraycopy_id:
2014       aligned = true;
2015       dest_uninitialized = false;
2016       break;
2017     case oop_arraycopy_uninit_id:
2018       aligned = false;
2019       dest_uninitialized = true;
2020       break;
2021     case arrayof_oop_arraycopy_uninit_id:
2022       aligned = true;
2023       dest_uninitialized = true;
2024       break;
2025     default:
2026       ShouldNotReachHere();
2027     }
2028 
2029     StubCodeMark mark(this, stub_id);
2030     address start = __ function_entry();
2031     assert_positive_int(R5_ARG3);
2032     address nooverlap_target = aligned ?
2033       STUB_ENTRY(arrayof_oop_disjoint_arraycopy(dest_uninitialized)) :
2034       STUB_ENTRY(oop_disjoint_arraycopy(dest_uninitialized));
2035 
2036     array_overlap_test(nooverlap_target, UseCompressedOops ? 2 : 3);
2037 
2038     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
2039     if (dest_uninitialized) {
2040       decorators |= IS_DEST_UNINITIALIZED;
2041     }
2042     if (aligned) {
2043       decorators |= ARRAYCOPY_ALIGNED;
2044     }
2045 
2046     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2047     bs->arraycopy_prologue(_masm, decorators, T_OBJECT, R3_ARG1, R4_ARG2, R5_ARG3, noreg, noreg);
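
    // With compressed oops each element is a 32-bit narrow oop, so the int
    // copy core is reused; otherwise elements are 64 bits wide and the long
    // copy core is used, except under ZGC, which provides its own copy loop
    // so the required GC barriers can be applied per element.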
2048 
2049     if (UseCompressedOops) {
2050       generate_conjoint_int_copy_core(aligned);
2051     } else {
2052 #if INCLUDE_ZGC
2053       if (UseZGC) {
2054         ZBarrierSetAssembler *zbs = (ZBarrierSetAssembler*)bs;
2055         zbs->generate_conjoint_oop_copy(_masm, dest_uninitialized);
2056       } else
2057 #endif
2058       generate_conjoint_long_copy_core(aligned);
2059     }
2060 
2061     bs->arraycopy_epilogue(_masm, decorators, T_OBJECT, R4_ARG2, R5_ARG3, noreg);
2062     __ li(R3_RET, 0); // return 0
2063     __ blr();
2064     return start;
2065   }
2066 
2067   // Generate stub for disjoint oop copy.  If "aligned" is true, the
2068   // "from" and "to" addresses are assumed to be heapword aligned.
2069   //
2070   // Arguments for generated stub:
2071   //      from:  R3_ARG1
2072   //      to:    R4_ARG2
2073   //      count: R5_ARG3 treated as signed
2074   //      dest_uninitialized: G1 support
2075   //
2076   address generate_disjoint_oop_copy(StubGenStubId stub_id) {
2077     bool aligned;
2078     bool dest_uninitialized;
2079     switch (stub_id) {
2080     case oop_disjoint_arraycopy_id:
2081       aligned = false;
2082       dest_uninitialized = false;
2083       break;
2084     case arrayof_oop_disjoint_arraycopy_id:
2085       aligned = true;
2086       dest_uninitialized = false;
2087       break;
2088     case oop_disjoint_arraycopy_uninit_id:
2089       aligned = false;
2090       dest_uninitialized = true;
2091       break;
2092     case arrayof_oop_disjoint_arraycopy_uninit_id:
2093       aligned = true;
2094       dest_uninitialized = true;
2095       break;
2096     default:
2097       ShouldNotReachHere();
2098     }
2099 
2100     StubCodeMark mark(this, stub_id);
2101     address start = __ function_entry();
2102     assert_positive_int(R5_ARG3);
2103 
2104     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
2105     if (dest_uninitialized) {
2106       decorators |= IS_DEST_UNINITIALIZED;
2107     }
2108     if (aligned) {
2109       decorators |= ARRAYCOPY_ALIGNED;
2110     }
2111 
2112     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2113     bs->arraycopy_prologue(_masm, decorators, T_OBJECT, R3_ARG1, R4_ARG2, R5_ARG3, noreg, noreg);
2114 
2115     if (UseCompressedOops) {
2116       generate_disjoint_int_copy_core(aligned);
2117     } else {
2118 #if INCLUDE_ZGC
2119       if (UseZGC) {
2120         ZBarrierSetAssembler *zbs = (ZBarrierSetAssembler*)bs;
2121         zbs->generate_disjoint_oop_copy(_masm, dest_uninitialized);
2122       } else
2123 #endif
2124       generate_disjoint_long_copy_core(aligned);
2125     }
2126 
2127     bs->arraycopy_epilogue(_masm, decorators, T_OBJECT, R4_ARG2, R5_ARG3, noreg);
2128     __ li(R3_RET, 0); // return 0
2129     __ blr();
2130 
2131     return start;
2132   }
2133 
2134 
2135   // Helper for generating a dynamic type check.
2136   // Smashes only the given temp registers.
2137   void generate_type_check(Register sub_klass,
2138                            Register super_check_offset,
2139                            Register super_klass,
2140                            Register temp1,
2141                            Register temp2,
2142                            Label& L_success) {
2143     assert_different_registers(sub_klass, super_check_offset, super_klass);
2144 
2145     BLOCK_COMMENT("type_check:");
2146 
2147     Label L_miss;
2148 
2149     __ check_klass_subtype_fast_path(sub_klass, super_klass, temp1, temp2, &L_success, &L_miss, nullptr,
2150                                      super_check_offset);
2151     __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success);
2152 
2153     // Fall through on failure!
2154     __ bind(L_miss);
2155   }
2156 
2157 
2158   //  Generate stub for checked oop copy.
2159   //
2160   // Arguments for generated stub:
2161   //      from:  R3
2162   //      to:    R4
2163   //      count: R5 treated as signed
2164   //      ckoff: R6 (super_check_offset)
2165   //      ckval: R7 (super_klass)
2166   //      ret:   R3 zero for success; (-1^K) where K is partial transfer count
2167   //
2168   address generate_checkcast_copy(StubGenStubId stub_id) {
2169     const Register R3_from   = R3_ARG1;      // source array address
2170     const Register R4_to     = R4_ARG2;      // destination array address
2171     const Register R5_count  = R5_ARG3;      // elements count
2172     const Register R6_ckoff  = R6_ARG4;      // super_check_offset
2173     const Register R7_ckval  = R7_ARG5;      // super_klass
2174 
2175     const Register R8_offset = R8_ARG6;      // loop var, with stride wordSize
2176     const Register R9_remain = R9_ARG7;      // loop var, with stride -1
2177     const Register R10_oop   = R10_ARG8;     // actual oop copied
2178     const Register R11_klass = R11_scratch1; // oop._klass
2179     const Register R12_tmp   = R12_scratch2;
2180     const Register R2_tmp    = R2;
2181 
2182     bool dest_uninitialized;
2183     switch (stub_id) {
2184     case checkcast_arraycopy_id:
2185       dest_uninitialized = false;
2186       break;
2187     case checkcast_arraycopy_uninit_id:
2188       dest_uninitialized = true;
2189       break;
2190     default:
2191       ShouldNotReachHere();
2192     }
2193     //__ align(CodeEntryAlignment);
2194     StubCodeMark mark(this, stub_id);
2195     address start = __ function_entry();
2196 
2197     // Assert that int is 64 bit sign extended and arrays are not conjoint.
2198 #ifdef ASSERT
2199     {
2200     assert_positive_int(R5_ARG3);
2201     const Register tmp1 = R11_scratch1, tmp2 = R12_scratch2;
2202     Label no_overlap;
2203     __ subf(tmp1, R3_ARG1, R4_ARG2); // distance in bytes
2204     __ sldi(tmp2, R5_ARG3, LogBytesPerHeapOop); // size in bytes
2205     __ cmpld(CR0, R3_ARG1, R4_ARG2); // Use unsigned comparison!
2206     __ cmpld(CR1, tmp1, tmp2);
2207     __ crnand(CR0, Assembler::less, CR1, Assembler::less);
2208     // Overlaps if Src before dst and distance smaller than size.
2209     // Branch to forward copy routine otherwise.
2210     __ blt(CR0, no_overlap);
2211     __ stop("overlap in checkcast_copy");
2212     __ bind(no_overlap);
2213     }
2214 #endif
2215 
2216     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST;
2217     if (dest_uninitialized) {
2218       decorators |= IS_DEST_UNINITIALIZED;
2219     }
2220 
2221     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2222     bs->arraycopy_prologue(_masm, decorators, T_OBJECT, R3_from, R4_to, R5_count, /* preserve: */ R6_ckoff, R7_ckval);
2223 
2224     //inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr, R12_tmp, R3_RET);
2225 
2226     Label load_element, store_element, store_null, success, do_epilogue;
2227     __ or_(R9_remain, R5_count, R5_count); // Initialize loop index, and test it.
2228     __ li(R8_offset, 0);                   // Offset from start of arrays.
2229     __ bne(CR0, load_element);
2230 
2231     // Empty array: Nothing to do.
2232     __ li(R3_RET, 0);           // Return 0 on (trivial) success.
2233     __ blr();
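
    // In outline, the copy loop below performs (illustrative sketch only):
    //   for (i = 0; i < count; i++) {
    //     oop o = from[i];
    //     if (o != nullptr && o's klass is not a subtype of ckval) return ~i;
    //     to[i] = o;
    //   }
    //   return 0;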
2234 
2235     // ======== begin loop ========
2236     // (Entry is load_element.)
2237     __ align(OptoLoopAlignment);
2238     __ bind(store_element);
2239     if (UseCompressedOops) {
2240       __ encode_heap_oop_not_null(R10_oop);
2241       __ bind(store_null);
2242       __ stw(R10_oop, R8_offset, R4_to);
2243     } else {
2244       __ bind(store_null);
2245 #if INCLUDE_ZGC
2246       if (UseZGC) {
2247         __ store_heap_oop(R10_oop, R8_offset, R4_to, R11_scratch1, R12_tmp, noreg,
2248                           MacroAssembler::PRESERVATION_FRAME_LR_GP_REGS,
2249                           dest_uninitialized ? IS_DEST_UNINITIALIZED : 0);
2250       } else
2251 #endif
2252       __ std(R10_oop, R8_offset, R4_to);
2253     }
2254 
2255     __ addi(R8_offset, R8_offset, heapOopSize);   // Step to next offset.
2256     __ addic_(R9_remain, R9_remain, -1);          // Decrement the count.
2257     __ beq(CR0, success);
2258 
2259     // ======== loop entry is here ========
2260     __ bind(load_element);
2261 #if INCLUDE_ZGC
2262     if (UseZGC) {
2263       __ load_heap_oop(R10_oop, R8_offset, R3_from,
2264                        R11_scratch1, R12_tmp,
2265                        MacroAssembler::PRESERVATION_FRAME_LR_GP_REGS,
2266                        0, &store_null);
2267     } else
2268 #endif
2269     __ load_heap_oop(R10_oop, R8_offset, R3_from,
2270                      R11_scratch1, R12_tmp,
2271                      MacroAssembler::PRESERVATION_FRAME_LR_GP_REGS,
2272                      AS_RAW, &store_null);
2273 
2274     __ load_klass(R11_klass, R10_oop); // Query the object klass.
2275 
2276     generate_type_check(R11_klass, R6_ckoff, R7_ckval, R12_tmp, R2_tmp,
2277                         // Branch to this on success:
2278                         store_element);
2279     // ======== end loop ========
2280 
2281     // It was a real error; we must depend on the caller to finish the job.
2282     // Register R9_remain has number of *remaining* oops, R5_count number of *total* oops.
2283     // Emit GC store barriers for the oops we have copied (R5_count minus R9_remain),
2284     // and report their number to the caller.
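    // nand(x, x) computes ~x, so for K copied elements the return value
    // below is ~K == (-1 ^ K).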
2285     __ subf_(R5_count, R9_remain, R5_count);
2286     __ nand(R3_RET, R5_count, R5_count);   // report (-1^K) to caller
2287     __ bne(CR0, do_epilogue);
2288     __ blr();
2289 
2290     __ bind(success);
2291     __ li(R3_RET, 0);
2292 
2293     __ bind(do_epilogue);
2294     bs->arraycopy_epilogue(_masm, decorators, T_OBJECT, R4_to, R5_count, /* preserve */ R3_RET);
2295 
2296     __ blr();
2297     return start;
2298   }
2299 
2300 
2301   //  Generate 'unsafe' array copy stub.
2302   //  Though just as safe as the other stubs, it takes an unscaled
2303   //  size_t argument instead of an element count.
2304   //
2305   // Arguments for generated stub:
2306   //      from:  R3
2307   //      to:    R4
2308   //      count: R5 byte count, treated as ssize_t, can be zero
2309   //
2310   // Examines the alignment of the operands and dispatches
2311   // to a long, int, short, or byte copy loop.
2312   //
2313   address generate_unsafe_copy(address byte_copy_entry,
2314                                address short_copy_entry,
2315                                address int_copy_entry,
2316                                address long_copy_entry) {
2317 
2318     const Register R3_from   = R3_ARG1;      // source array address
2319     const Register R4_to     = R4_ARG2;      // destination array address
    const Register R5_count  = R5_ARG3;      // byte count (size_t), scaled below to an element count
2321 
2322     const Register R6_bits   = R6_ARG4;      // test copy of low bits
2323     const Register R7_tmp    = R7_ARG5;
2324 
2325     //__ align(CodeEntryAlignment);
2326     StubGenStubId stub_id = StubGenStubId::unsafe_arraycopy_id;
2327     StubCodeMark mark(this, stub_id);
2328     address start = __ function_entry();
2329 
2330     // Bump this on entry, not on exit:
2331     //inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr, R6_bits, R7_tmp);
2332 
2333     Label short_copy, int_copy, long_copy;
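
    // OR together from, to and count: the low bits of the result give the
    // largest element size for which all three are aligned, so control is
    // dispatched to the widest copy loop that is safe to use.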
2334 
2335     __ orr(R6_bits, R3_from, R4_to);
2336     __ orr(R6_bits, R6_bits, R5_count);
2337     __ andi_(R0, R6_bits, (BytesPerLong-1));
2338     __ beq(CR0, long_copy);
2339 
2340     __ andi_(R0, R6_bits, (BytesPerInt-1));
2341     __ beq(CR0, int_copy);
2342 
2343     __ andi_(R0, R6_bits, (BytesPerShort-1));
2344     __ beq(CR0, short_copy);
2345 
2346     // byte_copy:
2347     __ b(byte_copy_entry);
2348 
2349     __ bind(short_copy);
2350     __ srwi(R5_count, R5_count, LogBytesPerShort);
2351     __ b(short_copy_entry);
2352 
2353     __ bind(int_copy);
2354     __ srwi(R5_count, R5_count, LogBytesPerInt);
2355     __ b(int_copy_entry);
2356 
2357     __ bind(long_copy);
2358     __ srwi(R5_count, R5_count, LogBytesPerLong);
2359     __ b(long_copy_entry);
2360 
2361     return start;
2362   }
2363 
2364 
2365   // Perform range checks on the proposed arraycopy.
2366   // Kills the two temps, but nothing else.
2367   // Also, clean the sign bits of src_pos and dst_pos.
2368   void arraycopy_range_checks(Register src,     // source array oop
2369                               Register src_pos, // source position
2370                               Register dst,     // destination array oop
2371                               Register dst_pos, // destination position
2372                               Register length,  // length of copy
2373                               Register temp1, Register temp2,
2374                               Label& L_failed) {
2375     BLOCK_COMMENT("arraycopy_range_checks:");
2376 
2377     const Register array_length = temp1;  // scratch
2378     const Register end_pos      = temp2;  // scratch
2379 
2380     //  if (src_pos + length > arrayOop(src)->length() ) FAIL;
2381     __ lwa(array_length, arrayOopDesc::length_offset_in_bytes(), src);
2382     __ add(end_pos, src_pos, length);  // src_pos + length
2383     __ cmpd(CR0, end_pos, array_length);
2384     __ bgt(CR0, L_failed);
2385 
2386     //  if (dst_pos + length > arrayOop(dst)->length() ) FAIL;
2387     __ lwa(array_length, arrayOopDesc::length_offset_in_bytes(), dst);
    __ add(end_pos, dst_pos, length);  // dst_pos + length
2389     __ cmpd(CR0, end_pos, array_length);
2390     __ bgt(CR0, L_failed);
2391 
2392     BLOCK_COMMENT("arraycopy_range_checks done");
2393   }
2394 
2395 
2396   // Helper for generate_unsafe_setmemory
2397   //
  // Atomically fill a memory region using 1-, 2-, 4-, or 8-byte stores and return.
2399   static void do_setmemory_atomic_loop(int elem_size, Register dest, Register size, Register byteVal,
2400                                        MacroAssembler *_masm) {
2401 
2402     Label L_Loop, L_Tail; // 2x unrolled loop
2403 
2404     // Propagate byte to required width
2405     if (elem_size > 1) __ rldimi(byteVal, byteVal,  8, 64 - 2 *  8);
2406     if (elem_size > 2) __ rldimi(byteVal, byteVal, 16, 64 - 2 * 16);
2407     if (elem_size > 4) __ rldimi(byteVal, byteVal, 32, 64 - 2 * 32);
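    // Illustration: with elem_size == 8 and byteVal == 0xAB the value grows
    // 0xAB -> 0xABAB -> 0xABABABAB -> 0xABABABABABABABAB.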
2408 
2409     __ srwi_(R0, size, exact_log2(2 * elem_size)); // size is a 32 bit value
2410     __ beq(CR0, L_Tail);
2411     __ mtctr(R0);
2412 
2413     __ align(32); // loop alignment
2414     __ bind(L_Loop);
2415     __ store_sized_value(byteVal, 0, dest, elem_size);
2416     __ store_sized_value(byteVal, elem_size, dest, elem_size);
2417     __ addi(dest, dest, 2 * elem_size);
2418     __ bdnz(L_Loop);
2419 
2420     __ bind(L_Tail);
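    // One trailing element remains iff the elem_size bit of 'size' is set
    // (the loop above consumed the size in 2 * elem_size chunks).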
2421     __ andi_(R0, size, elem_size);
2422     __ bclr(Assembler::bcondCRbiIs1, Assembler::bi0(CR0, Assembler::equal), Assembler::bhintbhBCLRisReturn);
2423     __ store_sized_value(byteVal, 0, dest, elem_size);
2424     __ blr();
2425   }
2426 
2427   //
2428   //  Generate 'unsafe' set memory stub
2429   //  Though just as safe as the other stubs, it takes an unscaled
2430   //  size_t (# bytes) argument instead of an element count.
2431   //
2432   //  Input:
2433   //    R3_ARG1   - destination array address
2434   //    R4_ARG2   - byte count (size_t)
2435   //    R5_ARG3   - byte value
2436   //
2437   address generate_unsafe_setmemory(address unsafe_byte_fill) {
2438     __ align(CodeEntryAlignment);
2439     StubCodeMark mark(this, StubGenStubId::unsafe_setmemory_id);
2440     address start = __ function_entry();
2441 
2442     // bump this on entry, not on exit:
2443     // inc_counter_np(SharedRuntime::_unsafe_set_memory_ctr);
2444 
2445     {
2446       Label L_fill8Bytes, L_fill4Bytes, L_fillBytes;
2447 
2448       const Register dest = R3_ARG1;
2449       const Register size = R4_ARG2;
2450       const Register byteVal = R5_ARG3;
2451       const Register rScratch1 = R6;
2452 
2453       // fill_to_memory_atomic(unsigned char*, unsigned long, unsigned char)
2454 
2455       // Check for pointer & size alignment
2456       __ orr(rScratch1, dest, size);
2457 
2458       __ andi_(R0, rScratch1, 7);
2459       __ beq(CR0, L_fill8Bytes);
2460 
2461       __ andi_(R0, rScratch1, 3);
2462       __ beq(CR0, L_fill4Bytes);
2463 
2464       __ andi_(R0, rScratch1, 1);
2465       __ bne(CR0, L_fillBytes);
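
      // From here on, dest and size are at least 2-byte aligned. Each
      // do_setmemory_atomic_loop expansion below ends with a blr, so the
      // 2-, 8-, 4- and 1-byte cases never fall through into one another.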
2466 
2467       // Mark remaining code as such which performs Unsafe accesses.
2468       UnsafeMemoryAccessMark umam(this, true, false);
2469 
      // At this point, we know dest and size are both 2-byte aligned
      // (their low bit is zero).
2472       do_setmemory_atomic_loop(2, dest, size, byteVal, _masm);
2473 
2474       __ align(32);
2475       __ bind(L_fill8Bytes);
      // At this point, we know dest and size are both 8-byte aligned
      // (their low 3 bits are zero).
2478       do_setmemory_atomic_loop(8, dest, size, byteVal, _masm);
2479 
2480       __ align(32);
2481       __ bind(L_fill4Bytes);
      // At this point, we know dest and size are both 4-byte aligned
      // (their low 2 bits are zero).
2484       do_setmemory_atomic_loop(4, dest, size, byteVal, _masm);
2485 
2486       __ align(32);
2487       __ bind(L_fillBytes);
2488       do_setmemory_atomic_loop(1, dest, size, byteVal, _masm);
2489     }
2490 
2491     return start;
2492   }
2493 
2494 
2495   //
2496   //  Generate generic array copy stubs
2497   //
2498   //  Input:
2499   //    R3    -  src oop
2500   //    R4    -  src_pos
2501   //    R5    -  dst oop
2502   //    R6    -  dst_pos
2503   //    R7    -  element count
2504   //
2505   //  Output:
2506   //    R3 ==  0  -  success
2507   //    R3 == -1  -  need to call System.arraycopy
2508   //
2509   address generate_generic_copy(address entry_jbyte_arraycopy,
2510                                 address entry_jshort_arraycopy,
2511                                 address entry_jint_arraycopy,
2512                                 address entry_oop_arraycopy,
2513                                 address entry_disjoint_oop_arraycopy,
2514                                 address entry_jlong_arraycopy,
2515                                 address entry_checkcast_arraycopy) {
2516     Label L_failed, L_objArray;
2517 
2518     // Input registers
2519     const Register src       = R3_ARG1;  // source array oop
2520     const Register src_pos   = R4_ARG2;  // source position
2521     const Register dst       = R5_ARG3;  // destination array oop
2522     const Register dst_pos   = R6_ARG4;  // destination position
2523     const Register length    = R7_ARG5;  // elements count
2524 
2525     // registers used as temp
2526     const Register src_klass = R8_ARG6;  // source array klass
2527     const Register dst_klass = R9_ARG7;  // destination array klass
    const Register lh        = R10_ARG8; // layout helper
2529     const Register temp      = R2;
2530 
2531     //__ align(CodeEntryAlignment);
2532     StubGenStubId stub_id = StubGenStubId::generic_arraycopy_id;
2533     StubCodeMark mark(this, stub_id);
2534     address start = __ function_entry();
2535 
2536     // Bump this on entry, not on exit:
2537     //inc_counter_np(SharedRuntime::_generic_array_copy_ctr, lh, temp);
2538 
2539     // In principle, the int arguments could be dirty.
2540 
2541     //-----------------------------------------------------------------------
2542     // Assembler stubs will be used for this call to arraycopy
2543     // if the following conditions are met:
2544     //
2545     // (1) src and dst must not be null.
2546     // (2) src_pos must not be negative.
2547     // (3) dst_pos must not be negative.
2548     // (4) length  must not be negative.
2549     // (5) src klass and dst klass should be the same and not null.
2550     // (6) src and dst should be arrays.
2551     // (7) src_pos + length must not exceed length of src.
2552     // (8) dst_pos + length must not exceed length of dst.
2553     BLOCK_COMMENT("arraycopy initial argument checks");
2554 
2555     __ cmpdi(CR1, src, 0);      // if (src == nullptr) return -1;
2556     __ extsw_(src_pos, src_pos); // if (src_pos < 0) return -1;
2557     __ cmpdi(CR5, dst, 0);      // if (dst == nullptr) return -1;
2558     __ cror(CR1, Assembler::equal, CR0, Assembler::less);
    __ extsw_(dst_pos, dst_pos); // if (dst_pos < 0) return -1;
2560     __ cror(CR5, Assembler::equal, CR0, Assembler::less);
2561     __ extsw_(length, length);   // if (length < 0) return -1;
2562     __ cror(CR1, Assembler::equal, CR5, Assembler::equal);
2563     __ cror(CR1, Assembler::equal, CR0, Assembler::less);
2564     __ beq(CR1, L_failed);
2565 
2566     BLOCK_COMMENT("arraycopy argument klass checks");
2567     __ load_klass(src_klass, src);
2568     __ load_klass(dst_klass, dst);
2569 
2570     // Load layout helper
2571     //
2572     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2573     // 32        30    24            16              8     2                 0
2574     //
2575     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2576     //
2577 
2578     int lh_offset = in_bytes(Klass::layout_helper_offset());
2579 
    // Load the 32-bit signed layout helper value.
2581     __ lwz(lh, lh_offset, src_klass);
2582 
2583     // Handle objArrays completely differently...
2584     jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2585     __ load_const_optimized(temp, objArray_lh, R0);
2586     __ cmpw(CR0, lh, temp);
2587     __ beq(CR0, L_objArray);
2588 
2589     __ cmpd(CR5, src_klass, dst_klass);          // if (src->klass() != dst->klass()) return -1;
2590     __ cmpwi(CR6, lh, Klass::_lh_neutral_value); // if (!src->is_Array()) return -1;
2591 
2592     __ crnand(CR5, Assembler::equal, CR6, Assembler::less);
2593     __ beq(CR5, L_failed);
2594 
2595     // At this point, it is known to be a typeArray (array_tag 0x3).
2596 #ifdef ASSERT
2597     { Label L;
2598       jint lh_prim_tag_in_place = (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2599       __ load_const_optimized(temp, lh_prim_tag_in_place, R0);
2600       __ cmpw(CR0, lh, temp);
2601       __ bge(CR0, L);
2602       __ stop("must be a primitive array");
2603       __ bind(L);
2604     }
2605 #endif
2606 
2607     arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
2608                            temp, dst_klass, L_failed);
2609 
2610     // TypeArrayKlass
2611     //
2612     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2613     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2614     //
2615 
2616     const Register offset = dst_klass;    // array offset
2617     const Register elsize = src_klass;    // log2 element size
2618 
2619     __ rldicl(offset, lh, 64 - Klass::_lh_header_size_shift, 64 - exact_log2(Klass::_lh_header_size_mask + 1));
2620     __ andi(elsize, lh, Klass::_lh_log2_element_size_mask);
2621     __ add(src, offset, src);       // src array offset
2622     __ add(dst, offset, dst);       // dst array offset
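    // 'offset' now holds the array header size in bytes (extracted from the
    // layout helper) and 'elsize' the log2 of the element size; src and dst
    // now point at element 0 of their arrays.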
2623 
2624     // Next registers should be set before the jump to corresponding stub.
2625     const Register from     = R3_ARG1;  // source array address
2626     const Register to       = R4_ARG2;  // destination array address
2627     const Register count    = R5_ARG3;  // elements count
2628 
2629     // 'from', 'to', 'count' registers should be set in this order
2630     // since they are the same as 'src', 'src_pos', 'dst'.
2631 
2632     BLOCK_COMMENT("scale indexes to element size");
2633     __ sld(src_pos, src_pos, elsize);
2634     __ sld(dst_pos, dst_pos, elsize);
2635     __ add(from, src_pos, src);  // src_addr
2636     __ add(to, dst_pos, dst);    // dst_addr
2637     __ mr(count, length);        // length
2638 
2639     BLOCK_COMMENT("choose copy loop based on element size");
2640     // Using conditional branches with range 32kB.
2641     const int bo = Assembler::bcondCRbiIs1, bi = Assembler::bi0(CR0, Assembler::equal);
2642     __ cmpwi(CR0, elsize, 0);
2643     __ bc(bo, bi, entry_jbyte_arraycopy);
2644     __ cmpwi(CR0, elsize, LogBytesPerShort);
2645     __ bc(bo, bi, entry_jshort_arraycopy);
2646     __ cmpwi(CR0, elsize, LogBytesPerInt);
2647     __ bc(bo, bi, entry_jint_arraycopy);
2648 #ifdef ASSERT
2649     { Label L;
2650       __ cmpwi(CR0, elsize, LogBytesPerLong);
2651       __ beq(CR0, L);
2652       __ stop("must be long copy, but elsize is wrong");
2653       __ bind(L);
2654     }
2655 #endif
2656     __ b(entry_jlong_arraycopy);
2657 
2658     // ObjArrayKlass
2659   __ bind(L_objArray);
2660     // live at this point:  src_klass, dst_klass, src[_pos], dst[_pos], length
2661 
2662     Label L_disjoint_plain_copy, L_checkcast_copy;
2663     //  test array classes for subtyping
2664     __ cmpd(CR0, src_klass, dst_klass);         // usual case is exact equality
2665     __ bne(CR0, L_checkcast_copy);
2666 
2667     // Identically typed arrays can be copied without element-wise checks.
2668     arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
2669                            temp, lh, L_failed);
2670 
2671     __ addi(src, src, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //src offset
2672     __ addi(dst, dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //dst offset
2673     __ sldi(src_pos, src_pos, LogBytesPerHeapOop);
2674     __ sldi(dst_pos, dst_pos, LogBytesPerHeapOop);
2675     __ add(from, src_pos, src);  // src_addr
2676     __ add(to, dst_pos, dst);    // dst_addr
2677     __ mr(count, length);        // length
2678     __ b(entry_oop_arraycopy);
2679 
2680   __ bind(L_checkcast_copy);
2681     // live at this point:  src_klass, dst_klass
2682     {
2683       // Before looking at dst.length, make sure dst is also an objArray.
2684       __ lwz(temp, lh_offset, dst_klass);
2685       __ cmpw(CR0, lh, temp);
2686       __ bne(CR0, L_failed);
2687 
2688       // It is safe to examine both src.length and dst.length.
2689       arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
2690                              temp, lh, L_failed);
2691 
2692       // Marshal the base address arguments now, freeing registers.
2693       __ addi(src, src, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //src offset
2694       __ addi(dst, dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //dst offset
2695       __ sldi(src_pos, src_pos, LogBytesPerHeapOop);
2696       __ sldi(dst_pos, dst_pos, LogBytesPerHeapOop);
2697       __ add(from, src_pos, src);  // src_addr
2698       __ add(to, dst_pos, dst);    // dst_addr
2699       __ mr(count, length);        // length
2700 
2701       Register sco_temp = R6_ARG4;             // This register is free now.
2702       assert_different_registers(from, to, count, sco_temp,
2703                                  dst_klass, src_klass);
2704 
2705       // Generate the type check.
2706       int sco_offset = in_bytes(Klass::super_check_offset_offset());
2707       __ lwz(sco_temp, sco_offset, dst_klass);
2708       generate_type_check(src_klass, sco_temp, dst_klass,
2709                           temp, /* temp */ R10_ARG8, L_disjoint_plain_copy);
2710 
2711       // Fetch destination element klass from the ObjArrayKlass header.
2712       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2713 
2714       // The checkcast_copy loop needs two extra arguments:
2715       __ ld(R7_ARG5, ek_offset, dst_klass);   // dest elem klass
2716       __ lwz(R6_ARG4, sco_offset, R7_ARG5);   // sco of elem klass
2717       __ b(entry_checkcast_arraycopy);
2718     }
2719 
2720     __ bind(L_disjoint_plain_copy);
2721     __ b(entry_disjoint_oop_arraycopy);
2722 
2723   __ bind(L_failed);
2724     __ li(R3_RET, -1); // return -1
2725     __ blr();
2726     return start;
2727   }
2728 
2729   // Arguments for generated stub:
2730   //   R3_ARG1   - source byte array address
2731   //   R4_ARG2   - destination byte array address
2732   //   R5_ARG3   - round key array
2733   address generate_aescrypt_encryptBlock() {
    assert(UseAES, "need AES instruction support");
2735     StubGenStubId stub_id = StubGenStubId::aescrypt_encryptBlock_id;
2736     StubCodeMark mark(this, stub_id);
2737 
2738     address start = __ function_entry();
2739 
2740     Label L_doLast, L_error;
2741 
2742     Register from           = R3_ARG1;  // source array address
2743     Register to             = R4_ARG2;  // destination array address
2744     Register key            = R5_ARG3;  // round key array
2745 
2746     Register keylen         = R8;
2747     Register temp           = R9;
2748     Register keypos         = R10;
2749     Register fifteen        = R12;
2750 
2751     VectorRegister vRet     = VR0;
2752 
2753     VectorRegister vKey1    = VR1;
2754     VectorRegister vKey2    = VR2;
2755     VectorRegister vKey3    = VR3;
2756     VectorRegister vKey4    = VR4;
2757 
2758     VectorRegister fromPerm = VR5;
2759     VectorRegister keyPerm  = VR6;
2760     VectorRegister toPerm   = VR7;
2761     VectorRegister fSplt    = VR8;
2762 
2763     VectorRegister vTmp1    = VR9;
2764     VectorRegister vTmp2    = VR10;
2765     VectorRegister vTmp3    = VR11;
2766     VectorRegister vTmp4    = VR12;
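
    // Overview: the 16 input bytes are loaded unaligned (lvx/lvsl/vperm),
    // the expanded key length in ints (44, 52 or 60 for AES-128/192/256)
    // selects the number of rounds, round key 0 is applied with vxor, the
    // middle rounds with vcipher and the final round with vcipherlast,
    // then the result is stored back unaligned.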
2767 
2768     __ li              (fifteen, 15);
2769 
2770     // load unaligned from[0-15] to vRet
2771     __ lvx             (vRet, from);
2772     __ lvx             (vTmp1, fifteen, from);
2773     __ lvsl            (fromPerm, from);
2774 #ifdef VM_LITTLE_ENDIAN
2775     __ vspltisb        (fSplt, 0x0f);
2776     __ vxor            (fromPerm, fromPerm, fSplt);
2777 #endif
2778     __ vperm           (vRet, vRet, vTmp1, fromPerm);
2779 
2780     // load keylen (44 or 52 or 60)
2781     __ lwz             (keylen, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT), key);
2782 
2783     // to load keys
2784     __ load_perm       (keyPerm, key);
2785 #ifdef VM_LITTLE_ENDIAN
2786     __ vspltisb        (vTmp2, -16);
2787     __ vrld            (keyPerm, keyPerm, vTmp2);
2788     __ vrld            (keyPerm, keyPerm, vTmp2);
2789     __ vsldoi          (keyPerm, keyPerm, keyPerm, 8);
2790 #endif
2791 
2792     // load the 1st round key to vTmp1
2793     __ lvx             (vTmp1, key);
2794     __ li              (keypos, 16);
2795     __ lvx             (vKey1, keypos, key);
2796     __ vec_perm        (vTmp1, vKey1, keyPerm);
2797 
2798     // 1st round
2799     __ vxor            (vRet, vRet, vTmp1);
2800 
2801     // load the 2nd round key to vKey1
2802     __ li              (keypos, 32);
2803     __ lvx             (vKey2, keypos, key);
2804     __ vec_perm        (vKey1, vKey2, keyPerm);
2805 
2806     // load the 3rd round key to vKey2
2807     __ li              (keypos, 48);
2808     __ lvx             (vKey3, keypos, key);
2809     __ vec_perm        (vKey2, vKey3, keyPerm);
2810 
2811     // load the 4th round key to vKey3
2812     __ li              (keypos, 64);
2813     __ lvx             (vKey4, keypos, key);
2814     __ vec_perm        (vKey3, vKey4, keyPerm);
2815 
2816     // load the 5th round key to vKey4
2817     __ li              (keypos, 80);
2818     __ lvx             (vTmp1, keypos, key);
2819     __ vec_perm        (vKey4, vTmp1, keyPerm);
2820 
2821     // 2nd - 5th rounds
2822     __ vcipher         (vRet, vRet, vKey1);
2823     __ vcipher         (vRet, vRet, vKey2);
2824     __ vcipher         (vRet, vRet, vKey3);
2825     __ vcipher         (vRet, vRet, vKey4);
2826 
2827     // load the 6th round key to vKey1
2828     __ li              (keypos, 96);
2829     __ lvx             (vKey2, keypos, key);
2830     __ vec_perm        (vKey1, vTmp1, vKey2, keyPerm);
2831 
2832     // load the 7th round key to vKey2
2833     __ li              (keypos, 112);
2834     __ lvx             (vKey3, keypos, key);
2835     __ vec_perm        (vKey2, vKey3, keyPerm);
2836 
2837     // load the 8th round key to vKey3
2838     __ li              (keypos, 128);
2839     __ lvx             (vKey4, keypos, key);
2840     __ vec_perm        (vKey3, vKey4, keyPerm);
2841 
2842     // load the 9th round key to vKey4
2843     __ li              (keypos, 144);
2844     __ lvx             (vTmp1, keypos, key);
2845     __ vec_perm        (vKey4, vTmp1, keyPerm);
2846 
2847     // 6th - 9th rounds
2848     __ vcipher         (vRet, vRet, vKey1);
2849     __ vcipher         (vRet, vRet, vKey2);
2850     __ vcipher         (vRet, vRet, vKey3);
2851     __ vcipher         (vRet, vRet, vKey4);
2852 
2853     // load the 10th round key to vKey1
2854     __ li              (keypos, 160);
2855     __ lvx             (vKey2, keypos, key);
2856     __ vec_perm        (vKey1, vTmp1, vKey2, keyPerm);
2857 
2858     // load the 11th round key to vKey2
2859     __ li              (keypos, 176);
2860     __ lvx             (vTmp1, keypos, key);
2861     __ vec_perm        (vKey2, vTmp1, keyPerm);
2862 
2863     // if all round keys are loaded, skip next 4 rounds
2864     __ cmpwi           (CR0, keylen, 44);
2865     __ beq             (CR0, L_doLast);
2866 
2867     // 10th - 11th rounds
2868     __ vcipher         (vRet, vRet, vKey1);
2869     __ vcipher         (vRet, vRet, vKey2);
2870 
2871     // load the 12th round key to vKey1
2872     __ li              (keypos, 192);
2873     __ lvx             (vKey2, keypos, key);
2874     __ vec_perm        (vKey1, vTmp1, vKey2, keyPerm);
2875 
2876     // load the 13th round key to vKey2
2877     __ li              (keypos, 208);
2878     __ lvx             (vTmp1, keypos, key);
2879     __ vec_perm        (vKey2, vTmp1, keyPerm);
2880 
2881     // if all round keys are loaded, skip next 2 rounds
2882     __ cmpwi           (CR0, keylen, 52);
2883     __ beq             (CR0, L_doLast);
2884 
2885 #ifdef ASSERT
2886     __ cmpwi           (CR0, keylen, 60);
2887     __ bne             (CR0, L_error);
2888 #endif
2889 
2890     // 12th - 13th rounds
2891     __ vcipher         (vRet, vRet, vKey1);
2892     __ vcipher         (vRet, vRet, vKey2);
2893 
2894     // load the 14th round key to vKey1
2895     __ li              (keypos, 224);
2896     __ lvx             (vKey2, keypos, key);
2897     __ vec_perm        (vKey1, vTmp1, vKey2, keyPerm);
2898 
2899     // load the 15th round key to vKey2
2900     __ li              (keypos, 240);
2901     __ lvx             (vTmp1, keypos, key);
2902     __ vec_perm        (vKey2, vTmp1, keyPerm);
2903 
2904     __ bind(L_doLast);
2905 
2906     // last two rounds
2907     __ vcipher         (vRet, vRet, vKey1);
2908     __ vcipherlast     (vRet, vRet, vKey2);
2909 
2910 #ifdef VM_LITTLE_ENDIAN
2911     // toPerm = 0x0F0E0D0C0B0A09080706050403020100
2912     __ lvsl            (toPerm, keypos); // keypos is a multiple of 16
2913     __ vxor            (toPerm, toPerm, fSplt);
2914 
2915     // Swap Bytes
2916     __ vperm           (vRet, vRet, vRet, toPerm);
2917 #endif
2918 
2919     // store result (unaligned)
2920     // Note: We can't use a read-modify-write sequence which touches additional Bytes.
2921     Register lo = temp, hi = fifteen; // Reuse
2922     __ vsldoi          (vTmp1, vRet, vRet, 8);
2923     __ mfvrd           (hi, vRet);
2924     __ mfvrd           (lo, vTmp1);
2925     __ std             (hi, 0 LITTLE_ENDIAN_ONLY(+ 8), to);
2926     __ std             (lo, 0 BIG_ENDIAN_ONLY(+ 8), to);
2927 
2928     __ blr();
2929 
2930 #ifdef ASSERT
2931     __ bind(L_error);
2932     __ stop("aescrypt_encryptBlock: invalid key length");
2933 #endif
2934      return start;
2935   }
2936 
2937   // Arguments for generated stub:
2938   //   R3_ARG1   - source byte array address
2939   //   R4_ARG2   - destination byte array address
2940   //   R5_ARG3   - K (key) in little endian int array
2941   address generate_aescrypt_decryptBlock() {
    assert(UseAES, "need AES instruction support");
2943     StubGenStubId stub_id = StubGenStubId::aescrypt_decryptBlock_id;
2944     StubCodeMark mark(this, stub_id);
2945 
2946     address start = __ function_entry();
2947 
2948     Label L_doLast, L_do44, L_do52, L_error;
2949 
2950     Register from           = R3_ARG1;  // source array address
2951     Register to             = R4_ARG2;  // destination array address
2952     Register key            = R5_ARG3;  // round key array
2953 
2954     Register keylen         = R8;
2955     Register temp           = R9;
2956     Register keypos         = R10;
2957     Register fifteen        = R12;
2958 
2959     VectorRegister vRet     = VR0;
2960 
2961     VectorRegister vKey1    = VR1;
2962     VectorRegister vKey2    = VR2;
2963     VectorRegister vKey3    = VR3;
2964     VectorRegister vKey4    = VR4;
2965     VectorRegister vKey5    = VR5;
2966 
2967     VectorRegister fromPerm = VR6;
2968     VectorRegister keyPerm  = VR7;
2969     VectorRegister toPerm   = VR8;
2970     VectorRegister fSplt    = VR9;
2971 
2972     VectorRegister vTmp1    = VR10;
2973     VectorRegister vTmp2    = VR11;
2974     VectorRegister vTmp3    = VR12;
2975     VectorRegister vTmp4    = VR13;
2976 
2977     __ li              (fifteen, 15);
2978 
2979     // load unaligned from[0-15] to vRet
2980     __ lvx             (vRet, from);
2981     __ lvx             (vTmp1, fifteen, from);
2982     __ lvsl            (fromPerm, from);
2983 #ifdef VM_LITTLE_ENDIAN
2984     __ vspltisb        (fSplt, 0x0f);
2985     __ vxor            (fromPerm, fromPerm, fSplt);
2986 #endif
2987     __ vperm           (vRet, vRet, vTmp1, fromPerm); // align [and byte swap in LE]
2988 
2989     // load keylen (44 or 52 or 60)
2990     __ lwz             (keylen, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT), key);
2991 
2992     // permute vector for loading the round keys
2993     __ load_perm       (keyPerm, key);
2994 #ifdef VM_LITTLE_ENDIAN
2995     __ vxor            (vTmp2, vTmp2, vTmp2);
2996     __ vspltisb        (vTmp2, -16);
2997     __ vrld            (keyPerm, keyPerm, vTmp2);
2998     __ vrld            (keyPerm, keyPerm, vTmp2);
2999     __ vsldoi          (keyPerm, keyPerm, keyPerm, 8);
3000 #endif
3001 
3002     __ cmpwi           (CR0, keylen, 44);
3003     __ beq             (CR0, L_do44);
3004 
3005     __ cmpwi           (CR0, keylen, 52);
3006     __ beq             (CR0, L_do52);
3007 
3008 #ifdef ASSERT
3009     __ cmpwi           (CR0, keylen, 60);
3010     __ bne             (CR0, L_error);
3011 #endif
3012 
3013     // load the 15th round key to vKey1
3014     __ li              (keypos, 240);
3015     __ lvx             (vKey1, keypos, key);
3016     __ li              (keypos, 224);
3017     __ lvx             (vKey2, keypos, key);
3018     __ vec_perm        (vKey1, vKey2, vKey1, keyPerm);
3019 
3020     // load the 14th round key to vKey2
3021     __ li              (keypos, 208);
3022     __ lvx             (vKey3, keypos, key);
3023     __ vec_perm        (vKey2, vKey3, vKey2, keyPerm);
3024 
3025     // load the 13th round key to vKey3
3026     __ li              (keypos, 192);
3027     __ lvx             (vKey4, keypos, key);
3028     __ vec_perm        (vKey3, vKey4, vKey3, keyPerm);
3029 
3030     // load the 12th round key to vKey4
3031     __ li              (keypos, 176);
3032     __ lvx             (vKey5, keypos, key);
3033     __ vec_perm        (vKey4, vKey5, vKey4, keyPerm);
3034 
3035     // load the 11th round key to vKey5
3036     __ li              (keypos, 160);
3037     __ lvx             (vTmp1, keypos, key);
3038     __ vec_perm        (vKey5, vTmp1, vKey5, keyPerm);
3039 
3040     // 1st - 5th rounds
3041     __ vxor            (vRet, vRet, vKey1);
3042     __ vncipher        (vRet, vRet, vKey2);
3043     __ vncipher        (vRet, vRet, vKey3);
3044     __ vncipher        (vRet, vRet, vKey4);
3045     __ vncipher        (vRet, vRet, vKey5);
3046 
3047     __ b               (L_doLast);
3048 
3049     __ align(32);
3050     __ bind            (L_do52);
3051 
3052     // load the 13th round key to vKey1
3053     __ li              (keypos, 208);
3054     __ lvx             (vKey1, keypos, key);
3055     __ li              (keypos, 192);
3056     __ lvx             (vKey2, keypos, key);
3057     __ vec_perm        (vKey1, vKey2, vKey1, keyPerm);
3058 
3059     // load the 12th round key to vKey2
3060     __ li              (keypos, 176);
3061     __ lvx             (vKey3, keypos, key);
3062     __ vec_perm        (vKey2, vKey3, vKey2, keyPerm);
3063 
3064     // load the 11th round key to vKey3
3065     __ li              (keypos, 160);
3066     __ lvx             (vTmp1, keypos, key);
3067     __ vec_perm        (vKey3, vTmp1, vKey3, keyPerm);
3068 
3069     // 1st - 3rd rounds
3070     __ vxor            (vRet, vRet, vKey1);
3071     __ vncipher        (vRet, vRet, vKey2);
3072     __ vncipher        (vRet, vRet, vKey3);
3073 
3074     __ b               (L_doLast);
3075 
3076     __ align(32);
3077     __ bind            (L_do44);
3078 
3079     // load the 11th round key to vKey1
3080     __ li              (keypos, 176);
3081     __ lvx             (vKey1, keypos, key);
3082     __ li              (keypos, 160);
3083     __ lvx             (vTmp1, keypos, key);
3084     __ vec_perm        (vKey1, vTmp1, vKey1, keyPerm);
3085 
3086     // 1st round
3087     __ vxor            (vRet, vRet, vKey1);
3088 
3089     __ bind            (L_doLast);
3090 
3091     // load the 10th round key to vKey1
3092     __ li              (keypos, 144);
3093     __ lvx             (vKey2, keypos, key);
3094     __ vec_perm        (vKey1, vKey2, vTmp1, keyPerm);
3095 
3096     // load the 9th round key to vKey2
3097     __ li              (keypos, 128);
3098     __ lvx             (vKey3, keypos, key);
3099     __ vec_perm        (vKey2, vKey3, vKey2, keyPerm);
3100 
3101     // load the 8th round key to vKey3
3102     __ li              (keypos, 112);
3103     __ lvx             (vKey4, keypos, key);
3104     __ vec_perm        (vKey3, vKey4, vKey3, keyPerm);
3105 
3106     // load the 7th round key to vKey4
3107     __ li              (keypos, 96);
3108     __ lvx             (vKey5, keypos, key);
3109     __ vec_perm        (vKey4, vKey5, vKey4, keyPerm);
3110 
3111     // load the 6th round key to vKey5
3112     __ li              (keypos, 80);
3113     __ lvx             (vTmp1, keypos, key);
3114     __ vec_perm        (vKey5, vTmp1, vKey5, keyPerm);
3115 
3116     // last 10th - 6th rounds
3117     __ vncipher        (vRet, vRet, vKey1);
3118     __ vncipher        (vRet, vRet, vKey2);
3119     __ vncipher        (vRet, vRet, vKey3);
3120     __ vncipher        (vRet, vRet, vKey4);
3121     __ vncipher        (vRet, vRet, vKey5);
3122 
3123     // load the 5th round key to vKey1
3124     __ li              (keypos, 64);
3125     __ lvx             (vKey2, keypos, key);
3126     __ vec_perm        (vKey1, vKey2, vTmp1, keyPerm);
3127 
3128     // load the 4th round key to vKey2
3129     __ li              (keypos, 48);
3130     __ lvx             (vKey3, keypos, key);
3131     __ vec_perm        (vKey2, vKey3, vKey2, keyPerm);
3132 
3133     // load the 3rd round key to vKey3
3134     __ li              (keypos, 32);
3135     __ lvx             (vKey4, keypos, key);
3136     __ vec_perm        (vKey3, vKey4, vKey3, keyPerm);
3137 
3138     // load the 2nd round key to vKey4
3139     __ li              (keypos, 16);
3140     __ lvx             (vKey5, keypos, key);
3141     __ vec_perm        (vKey4, vKey5, vKey4, keyPerm);
3142 
3143     // load the 1st round key to vKey5
3144     __ lvx             (vTmp1, key);
3145     __ vec_perm        (vKey5, vTmp1, vKey5, keyPerm);
3146 
3147     // last 5th - 1st rounds
3148     __ vncipher        (vRet, vRet, vKey1);
3149     __ vncipher        (vRet, vRet, vKey2);
3150     __ vncipher        (vRet, vRet, vKey3);
3151     __ vncipher        (vRet, vRet, vKey4);
3152     __ vncipherlast    (vRet, vRet, vKey5);
3153 
3154 #ifdef VM_LITTLE_ENDIAN
3155     // toPerm = 0x0F0E0D0C0B0A09080706050403020100
3156     __ lvsl            (toPerm, keypos); // keypos is a multiple of 16
3157     __ vxor            (toPerm, toPerm, fSplt);
3158 
3159     // Swap Bytes
3160     __ vperm           (vRet, vRet, vRet, toPerm);
3161 #endif
3162 
3163     // store result (unaligned)
3164     // Note: We can't use a read-modify-write sequence that touches additional bytes.
3165     Register lo = temp, hi = fifteen; // Reuse
3166     __ vsldoi          (vTmp1, vRet, vRet, 8);
3167     __ mfvrd           (hi, vRet);
3168     __ mfvrd           (lo, vTmp1);
3169     __ std             (hi, 0 LITTLE_ENDIAN_ONLY(+ 8), to);
3170     __ std             (lo, 0 BIG_ENDIAN_ONLY(+ 8), to);
3171 
3172     __ blr();
3173 
3174 #ifdef ASSERT
3175     __ bind(L_error);
3176     __ stop("aescrypt_decryptBlock: invalid key length");
3177 #endif
3178     return start;
3179   }
3180 
3181   address generate_sha256_implCompress(StubGenStubId stub_id) {
3182     assert(UseSHA, "need SHA instructions");
3183     bool multi_block;
3184     switch (stub_id) {
3185     case sha256_implCompress_id:
3186       multi_block = false;
3187       break;
3188     case sha256_implCompressMB_id:
3189       multi_block = true;
3190       break;
3191     default:
3192       ShouldNotReachHere();
3193     }
3194     StubCodeMark mark(this, stub_id);
3195     address start = __ function_entry();
3196 
3197     __ sha256 (multi_block);
3198     __ blr();
3199 
3200     return start;
3201   }
3202 
3203   address generate_sha512_implCompress(StubGenStubId stub_id) {
3204     assert(UseSHA, "need SHA instructions");
3205     bool multi_block;
3206     switch (stub_id) {
3207     case sha512_implCompress_id:
3208       multi_block = false;
3209       break;
3210     case sha512_implCompressMB_id:
3211       multi_block = true;
3212       break;
3213     default:
3214       ShouldNotReachHere();
3215     }
3216     StubCodeMark mark(this, stub_id);
3217     address start = __ function_entry();
3218 
3219     __ sha512 (multi_block);
3220     __ blr();
3221 
3222     return start;
3223   }
3224 
3225   address generate_data_cache_writeback() {
3226     const Register cacheline = R3_ARG1;
3227     StubGenStubId stub_id = StubGenStubId::data_cache_writeback_id;
3228     StubCodeMark mark(this, stub_id);
3229     address start = __ pc();
3230 
3231     __ cache_wb(Address(cacheline));
3232     __ blr();
3233 
3234     return start;
3235   }
3236 
3237   address generate_data_cache_writeback_sync() {
3238     const Register is_presync = R3_ARG1;
3239     Register temp = R4;
3240     Label SKIP;
3241     StubGenStubId stub_id = StubGenStubId::data_cache_writeback_sync_id;
3242     StubCodeMark mark(this, stub_id);
3243     address start = __ pc();
3244 
3245     __ andi_(temp, is_presync, 1);
3246     __ bne(CR0, SKIP);
3247     __ cache_wbsync(false); // post sync => emit 'sync'
3248     __ bind(SKIP);          // pre sync => emit nothing
3249     __ blr();
3250 
3251     return start;
3252   }
3253 
3254   void generate_arraycopy_stubs() {
3255     // Note: the disjoint stubs must be generated first, some of
3256     // the conjoint stubs use them.
3257 
3258     address ucm_common_error_exit       =  generate_unsafecopy_common_error_exit();
3259     UnsafeMemoryAccess::set_common_exit_stub_pc(ucm_common_error_exit);
3260 
3261     // non-aligned disjoint versions
3262     StubRoutines::_jbyte_disjoint_arraycopy       = generate_disjoint_byte_copy(StubGenStubId::jbyte_disjoint_arraycopy_id);
3263     StubRoutines::_jshort_disjoint_arraycopy      = generate_disjoint_short_copy(StubGenStubId::jshort_disjoint_arraycopy_id);
3264     StubRoutines::_jint_disjoint_arraycopy        = generate_disjoint_int_copy(StubGenStubId::jint_disjoint_arraycopy_id);
3265     StubRoutines::_jlong_disjoint_arraycopy       = generate_disjoint_long_copy(StubGenStubId::jlong_disjoint_arraycopy_id);
3266     StubRoutines::_oop_disjoint_arraycopy         = generate_disjoint_oop_copy(StubGenStubId::oop_disjoint_arraycopy_id);
3267     StubRoutines::_oop_disjoint_arraycopy_uninit  = generate_disjoint_oop_copy(StubGenStubId::oop_disjoint_arraycopy_uninit_id);
3268 
3269     // aligned disjoint versions
3270     StubRoutines::_arrayof_jbyte_disjoint_arraycopy      = generate_disjoint_byte_copy(StubGenStubId::arrayof_jbyte_disjoint_arraycopy_id);
3271     StubRoutines::_arrayof_jshort_disjoint_arraycopy     = generate_disjoint_short_copy(StubGenStubId::arrayof_jshort_disjoint_arraycopy_id);
3272     StubRoutines::_arrayof_jint_disjoint_arraycopy       = generate_disjoint_int_copy(StubGenStubId::arrayof_jint_disjoint_arraycopy_id);
3273     StubRoutines::_arrayof_jlong_disjoint_arraycopy      = generate_disjoint_long_copy(StubGenStubId::arrayof_jlong_disjoint_arraycopy_id);
3274     StubRoutines::_arrayof_oop_disjoint_arraycopy        = generate_disjoint_oop_copy(StubGenStubId::arrayof_oop_disjoint_arraycopy_id);
3275     StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy(StubGenStubId::arrayof_oop_disjoint_arraycopy_uninit_id);
3276 
3277     // non-aligned conjoint versions
3278     StubRoutines::_jbyte_arraycopy      = generate_conjoint_byte_copy(StubGenStubId::jbyte_arraycopy_id);
3279     StubRoutines::_jshort_arraycopy     = generate_conjoint_short_copy(StubGenStubId::jshort_arraycopy_id);
3280     StubRoutines::_jint_arraycopy       = generate_conjoint_int_copy(StubGenStubId::jint_arraycopy_id);
3281     StubRoutines::_jlong_arraycopy      = generate_conjoint_long_copy(StubGenStubId::jlong_arraycopy_id);
3282     StubRoutines::_oop_arraycopy        = generate_conjoint_oop_copy(StubGenStubId::oop_arraycopy_id);
3283     StubRoutines::_oop_arraycopy_uninit = generate_conjoint_oop_copy(StubGenStubId::oop_arraycopy_uninit_id);
3284 
3285     // aligned conjoint versions
3286     StubRoutines::_arrayof_jbyte_arraycopy      = generate_conjoint_byte_copy(StubGenStubId::arrayof_jbyte_arraycopy_id);
3287     StubRoutines::_arrayof_jshort_arraycopy     = generate_conjoint_short_copy(StubGenStubId::arrayof_jshort_arraycopy_id);
3288     StubRoutines::_arrayof_jint_arraycopy       = generate_conjoint_int_copy(StubGenStubId::arrayof_jint_arraycopy_id);
3289     StubRoutines::_arrayof_jlong_arraycopy      = generate_conjoint_long_copy(StubGenStubId::arrayof_jlong_arraycopy_id);
3290     StubRoutines::_arrayof_oop_arraycopy        = generate_conjoint_oop_copy(StubGenStubId::arrayof_oop_arraycopy_id);
3291     StubRoutines::_arrayof_oop_arraycopy_uninit = generate_conjoint_oop_copy(StubGenStubId::arrayof_oop_arraycopy_uninit_id);
3292 
3293     // special/generic versions
3294     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy(StubGenStubId::checkcast_arraycopy_id);
3295     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubGenStubId::checkcast_arraycopy_uninit_id);
3296 
3297     StubRoutines::_unsafe_arraycopy  = generate_unsafe_copy(STUB_ENTRY(jbyte_arraycopy()),
3298                                                             STUB_ENTRY(jshort_arraycopy()),
3299                                                             STUB_ENTRY(jint_arraycopy()),
3300                                                             STUB_ENTRY(jlong_arraycopy()));
3301     StubRoutines::_generic_arraycopy = generate_generic_copy(STUB_ENTRY(jbyte_arraycopy()),
3302                                                              STUB_ENTRY(jshort_arraycopy()),
3303                                                              STUB_ENTRY(jint_arraycopy()),
3304                                                              STUB_ENTRY(oop_arraycopy()),
3305                                                              STUB_ENTRY(oop_disjoint_arraycopy()),
3306                                                              STUB_ENTRY(jlong_arraycopy()),
3307                                                              STUB_ENTRY(checkcast_arraycopy()));
3308 
3309     // fill routines
3310 #ifdef COMPILER2
3311     if (OptimizeFill) {
3312       StubRoutines::_jbyte_fill          = generate_fill(StubGenStubId::jbyte_fill_id);
3313       StubRoutines::_jshort_fill         = generate_fill(StubGenStubId::jshort_fill_id);
3314       StubRoutines::_jint_fill           = generate_fill(StubGenStubId::jint_fill_id);
3315       StubRoutines::_arrayof_jbyte_fill  = generate_fill(StubGenStubId::arrayof_jbyte_fill_id);
3316       StubRoutines::_arrayof_jshort_fill = generate_fill(StubGenStubId::arrayof_jshort_fill_id);
3317       StubRoutines::_arrayof_jint_fill   = generate_fill(StubGenStubId::arrayof_jint_fill_id);
3318     }
3319     StubRoutines::_unsafe_setmemory = generate_unsafe_setmemory(StubRoutines::_jbyte_fill);
3320 #endif
3321   }
3322 
3323   // Stub for BigInteger::multiplyToLen()
3324   //
3325   //  Arguments:
3326   //
3327   //  Input:
3328   //    R3 - x address
3329   //    R4 - x length
3330   //    R5 - y address
3331   //    R6 - y length
3332   //    R7 - z address
3333   //
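       //  For reference, the Java intrinsic contract (BigInteger.multiplyToLen):
       //  z, of length xlen + ylen ints, receives the product of the xlen-int
       //  magnitude at x and the ylen-int magnitude at y; the 32-bit limbs are
       //  stored most-significant first.
       //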
3334   address generate_multiplyToLen() {
3335 
3336     StubGenStubId stub_id = StubGenStubId::multiplyToLen_id;
3337     StubCodeMark mark(this, stub_id);
3338 
3339     address start = __ function_entry();
3340 
3341     const Register x     = R3;
3342     const Register xlen  = R4;
3343     const Register y     = R5;
3344     const Register ylen  = R6;
3345     const Register z     = R7;
3346 
3347     const Register tmp1  = R2; // TOC not used.
3348     const Register tmp2  = R9;
3349     const Register tmp3  = R10;
3350     const Register tmp4  = R11;
3351     const Register tmp5  = R12;
3352 
3353     // non-volatile regs
3354     const Register tmp6  = R31;
3355     const Register tmp7  = R30;
3356     const Register tmp8  = R29;
3357     const Register tmp9  = R28;
3358     const Register tmp10 = R27;
3359     const Register tmp11 = R26;
3360     const Register tmp12 = R25;
3361     const Register tmp13 = R24;
3362 
3363     BLOCK_COMMENT("Entry:");
3364 
3365     // C2 does not respect int-to-long conversion for stub calls; clear the upper 32 bits explicitly.
3366     __ clrldi(xlen, xlen, 32);
3367     __ clrldi(ylen, ylen, 32);
3368 
3369     // Save non-volatile regs (frameless).
3370     int current_offs = 8;
3371     __ std(R24, -current_offs, R1_SP); current_offs += 8;
3372     __ std(R25, -current_offs, R1_SP); current_offs += 8;
3373     __ std(R26, -current_offs, R1_SP); current_offs += 8;
3374     __ std(R27, -current_offs, R1_SP); current_offs += 8;
3375     __ std(R28, -current_offs, R1_SP); current_offs += 8;
3376     __ std(R29, -current_offs, R1_SP); current_offs += 8;
3377     __ std(R30, -current_offs, R1_SP); current_offs += 8;
3378     __ std(R31, -current_offs, R1_SP);
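         // (The spill above is frameless: the registers are stored below R1_SP
         // without pushing a frame, relying on the ABI's protected zone below the
         // stack pointer; no calls are made before they are restored.)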
3379 
3380     __ multiply_to_len(x, xlen, y, ylen, z, tmp1, tmp2, tmp3, tmp4, tmp5,
3381                        tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13);
3382 
3383     // Restore non-volatile regs.
3384     current_offs = 8;
3385     __ ld(R24, -current_offs, R1_SP); current_offs += 8;
3386     __ ld(R25, -current_offs, R1_SP); current_offs += 8;
3387     __ ld(R26, -current_offs, R1_SP); current_offs += 8;
3388     __ ld(R27, -current_offs, R1_SP); current_offs += 8;
3389     __ ld(R28, -current_offs, R1_SP); current_offs += 8;
3390     __ ld(R29, -current_offs, R1_SP); current_offs += 8;
3391     __ ld(R30, -current_offs, R1_SP); current_offs += 8;
3392     __ ld(R31, -current_offs, R1_SP);
3393 
3394     __ blr();  // Return to caller.
3395 
3396     return start;
3397   }
3398 
3399   /**
3400   *  Arguments:
3401   *
3402   *  Input:
3403   *   R3_ARG1    - out address
3404   *   R4_ARG2    - in address
3405   *   R5_ARG3    - offset
3406   *   R6_ARG4    - len
3407   *   R7_ARG5    - k
3408   *  Output:
3409   *   R3_RET     - carry
3410   */
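       // A rough sketch of the operation (mirroring BigInteger.implMulAdd; the
       // exact offset handling is in the Java source), with out/in viewed as
       // arrays of uint32_t limbs:
       //
       //   uint64_t kl = (uint32_t)k, carry = 0;
       //   for (int i = len - 1; i >= 0; i--, offset--) {
       //     uint64_t p = (uint64_t)in[i] * kl + out[offset] + carry;
       //     out[offset] = (uint32_t)p;
       //     carry = p >> 32;
       //   }
       //   return (uint32_t)carry;   // R3_RET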
3411   address generate_mulAdd() {
3412     __ align(CodeEntryAlignment);
3413     StubGenStubId stub_id = StubGenStubId::mulAdd_id;
3414     StubCodeMark mark(this, stub_id);
3415 
3416     address start = __ function_entry();
3417 
3418     // C2 does not sign-extend signed parameters to full 64-bit registers:
3419     __ rldic (R5_ARG3, R5_ARG3, 2, 32);  // always positive
3420     __ clrldi(R6_ARG4, R6_ARG4, 32);     // force zero bits on higher word
3421     __ clrldi(R7_ARG5, R7_ARG5, 32);     // force zero bits on higher word
3422 
3423     __ muladd(R3_ARG1, R4_ARG2, R5_ARG3, R6_ARG4, R7_ARG5, R8, R9, R10);
3424 
3425     // Moves output carry to return register
3426     __ mr    (R3_RET,  R10);
3427 
3428     __ blr();
3429 
3430     return start;
3431   }
3432 
3433   /**
3434   *  Arguments:
3435   *
3436   *  Input:
3437   *   R3_ARG1    - in address
3438   *   R4_ARG2    - in length
3439   *   R5_ARG3    - out address
3440   *   R6_ARG4    - out length
3441   */
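       // Overview (following the approach of BigInteger.squareToLen): the square
       // of an n-limb number equals the sum of the diagonal terms x_i^2 plus
       // twice the off-diagonal products x_i*x_j (i < j).  The code below first
       // stores all x_i^2 shifted right by one bit, then adds the off-diagonal
       // products via mulAdd (LOOP_DIAGONAL_SUM), shifts the whole result left
       // by one bit to restore the factor of two, and finally ORs back the low
       // bit that was lost by the initial right shift.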
3442   address generate_squareToLen() {
3443     __ align(CodeEntryAlignment);
3444     StubGenStubId stub_id = StubGenStubId::squareToLen_id;
3445     StubCodeMark mark(this, stub_id);
3446 
3447     address start = __ function_entry();
3448 
3449     // args - the upper 32 bits are cleared (zero-extended) due to int-to-long conversion
3450     const Register in        = R3_ARG1;
3451     const Register in_len    = R4_ARG2;
3452     __ clrldi(in_len, in_len, 32);
3453     const Register out       = R5_ARG3;
3454     const Register out_len   = R6_ARG4;
3455     __ clrldi(out_len, out_len, 32);
3456 
3457     // output
3458     const Register ret       = R3_RET;
3459 
3460     // temporaries
3461     const Register lplw_s    = R7;
3462     const Register in_aux    = R8;
3463     const Register out_aux   = R9;
3464     const Register piece     = R10;
3465     const Register product   = R14;
3466     const Register lplw      = R15;
3467     const Register i_minus1  = R16;
3468     const Register carry     = R17;
3469     const Register offset    = R18;
3470     const Register off_aux   = R19;
3471     const Register t         = R20;
3472     const Register mlen      = R21;
3473     const Register len       = R22;
3474     const Register a         = R23;
3475     const Register b         = R24;
3476     const Register i         = R25;
3477     const Register c         = R26;
3478     const Register cs        = R27;
3479 
3480     // Labels
3481     Label SKIP_LSHIFT, SKIP_DIAGONAL_SUM, SKIP_ADDONE, SKIP_LOOP_SQUARE;
3482     Label LOOP_LSHIFT, LOOP_DIAGONAL_SUM, LOOP_ADDONE, LOOP_SQUARE;
3483 
3484     // Save non-volatile regs (frameless).
3485     int current_offs = -8;
3486     __ std(R28, current_offs, R1_SP); current_offs -= 8;
3487     __ std(R27, current_offs, R1_SP); current_offs -= 8;
3488     __ std(R26, current_offs, R1_SP); current_offs -= 8;
3489     __ std(R25, current_offs, R1_SP); current_offs -= 8;
3490     __ std(R24, current_offs, R1_SP); current_offs -= 8;
3491     __ std(R23, current_offs, R1_SP); current_offs -= 8;
3492     __ std(R22, current_offs, R1_SP); current_offs -= 8;
3493     __ std(R21, current_offs, R1_SP); current_offs -= 8;
3494     __ std(R20, current_offs, R1_SP); current_offs -= 8;
3495     __ std(R19, current_offs, R1_SP); current_offs -= 8;
3496     __ std(R18, current_offs, R1_SP); current_offs -= 8;
3497     __ std(R17, current_offs, R1_SP); current_offs -= 8;
3498     __ std(R16, current_offs, R1_SP); current_offs -= 8;
3499     __ std(R15, current_offs, R1_SP); current_offs -= 8;
3500     __ std(R14, current_offs, R1_SP);
3501 
3502     // Store the squares, right shifted one bit (i.e., divided by 2)
3503     __ subi   (out_aux,   out,       8);
3504     __ subi   (in_aux,    in,        4);
3505     __ cmpwi  (CR0,      in_len,    0);
3506     // Initialize lplw outside of the loop
3507     __ xorr   (lplw,      lplw,      lplw);
3508     __ ble    (CR0,      SKIP_LOOP_SQUARE);    // in_len <= 0
3509     __ mtctr  (in_len);
3510 
3511     __ bind(LOOP_SQUARE);
3512     __ lwzu   (piece,     4,         in_aux);
3513     __ mulld  (product,   piece,     piece);
3514     // shift left 63 bits and only keep the MSB
3515     __ rldic  (lplw_s,    lplw,      63, 0);
3516     __ mr     (lplw,      product);
3517     // shift right 1 bit without sign extension
3518     __ srdi   (product,   product,   1);
3519     // join them to the same register and store it
3520     __ orr    (product,   lplw_s,    product);
3521 #ifdef VM_LITTLE_ENDIAN
3522     // Swap low and high words for little endian
3523     __ rldicl (product,   product,   32, 0);
3524 #endif
3525     __ stdu   (product,   8,         out_aux);
3526     __ bdnz   (LOOP_SQUARE);
3527 
3528     __ bind(SKIP_LOOP_SQUARE);
3529 
3530     // Add in off-diagonal sums
3531     __ cmpwi  (CR0,      in_len,    0);
3532     __ ble    (CR0,      SKIP_DIAGONAL_SUM);
3533     // Avoid CTR usage here in order to use it in mulAdd
3534     __ subi   (i_minus1,  in_len,    1);
3535     __ li     (offset,    4);
3536 
3537     __ bind(LOOP_DIAGONAL_SUM);
3538 
3539     __ sldi   (off_aux,   out_len,   2);
3540     __ sub    (off_aux,   off_aux,   offset);
3541 
3542     __ mr     (len,       i_minus1);
3543     __ sldi   (mlen,      i_minus1,  2);
3544     __ lwzx   (t,         in,        mlen);
3545 
3546     __ muladd (out, in, off_aux, len, t, a, b, carry);
3547 
3548     // begin<addOne>
3549     // off_aux = out_len*4 - 4 - mlen - offset*4 - 4;
3550     __ addi   (mlen,      mlen,      4);
3551     __ sldi   (a,         out_len,   2);
3552     __ subi   (a,         a,         4);
3553     __ sub    (a,         a,         mlen);
3554     __ subi   (off_aux,   offset,    4);
3555     __ sub    (off_aux,   a,         off_aux);
3556 
3557     __ lwzx   (b,         off_aux,   out);
3558     __ add    (b,         b,         carry);
3559     __ stwx   (b,         off_aux,   out);
3560 
3561     // if (((uint64_t)s >> 32) != 0) {
3562     __ srdi_  (a,         b,         32);
3563     __ beq    (CR0,      SKIP_ADDONE);
3564 
3565     // while (--mlen >= 0) {
3566     __ bind(LOOP_ADDONE);
3567     __ subi   (mlen,      mlen,      4);
3568     __ cmpwi  (CR0,      mlen,      0);
3569     __ beq    (CR0,      SKIP_ADDONE);
3570 
3571     // if (--offset_aux < 0) { // Carry out of number
3572     __ subi   (off_aux,   off_aux,   4);
3573     __ cmpwi  (CR0,      off_aux,   0);
3574     __ blt    (CR0,      SKIP_ADDONE);
3575 
3576     // } else {
3577     __ lwzx   (b,         off_aux,   out);
3578     __ addi   (b,         b,         1);
3579     __ stwx   (b,         off_aux,   out);
3580     __ cmpwi  (CR0,      b,         0);
3581     __ bne    (CR0,      SKIP_ADDONE);
3582     __ b      (LOOP_ADDONE);
3583 
3584     __ bind(SKIP_ADDONE);
3585     // } } } end<addOne>
3586 
3587     __ addi   (offset,    offset,    8);
3588     __ subi   (i_minus1,  i_minus1,  1);
3589     __ cmpwi  (CR0,      i_minus1,  0);
3590     __ bge    (CR0,      LOOP_DIAGONAL_SUM);
3591 
3592     __ bind(SKIP_DIAGONAL_SUM);
3593 
3594     // Shift back up and set low bit
3595     // Shifts 1 bit left up to len positions. Assumes no leading zeros
3596     // begin<primitiveLeftShift>
3597     __ cmpwi  (CR0,      out_len,   0);
3598     __ ble    (CR0,      SKIP_LSHIFT);
3599     __ li     (i,         0);
3600     __ lwz    (c,         0,         out);
3601     __ subi   (b,         out_len,   1);
3602     __ mtctr  (b);
3603 
3604     __ bind(LOOP_LSHIFT);
3605     __ mr     (b,         c);
3606     __ addi   (cs,        i,         4);
3607     __ lwzx   (c,         out,       cs);
3608 
3609     __ sldi   (b,         b,         1);
3610     __ srwi   (cs,        c,         31);
3611     __ orr    (b,         b,         cs);
3612     __ stwx   (b,         i,         out);
3613 
3614     __ addi   (i,         i,         4);
3615     __ bdnz   (LOOP_LSHIFT);
3616 
3617     __ sldi   (c,         out_len,   2);
3618     __ subi   (c,         c,         4);
3619     __ lwzx   (b,         out,       c);
3620     __ sldi   (b,         b,         1);
3621     __ stwx   (b,         out,       c);
3622 
3623     __ bind(SKIP_LSHIFT);
3624     // end<primitiveLeftShift>
3625 
3626     // Set low bit
3627     __ sldi   (i,         in_len,    2);
3628     __ subi   (i,         i,         4);
3629     __ lwzx   (i,         in,        i);
3630     __ sldi   (c,         out_len,   2);
3631     __ subi   (c,         c,         4);
3632     __ lwzx   (b,         out,       c);
3633 
3634     __ andi   (i,         i,         1);
3635     __ orr    (i,         b,         i);
3636 
3637     __ stwx   (i,         out,       c);
3638 
3639     // Restore non-volatile regs.
3640     current_offs = -8;
3641     __ ld(R28, current_offs, R1_SP); current_offs -= 8;
3642     __ ld(R27, current_offs, R1_SP); current_offs -= 8;
3643     __ ld(R26, current_offs, R1_SP); current_offs -= 8;
3644     __ ld(R25, current_offs, R1_SP); current_offs -= 8;
3645     __ ld(R24, current_offs, R1_SP); current_offs -= 8;
3646     __ ld(R23, current_offs, R1_SP); current_offs -= 8;
3647     __ ld(R22, current_offs, R1_SP); current_offs -= 8;
3648     __ ld(R21, current_offs, R1_SP); current_offs -= 8;
3649     __ ld(R20, current_offs, R1_SP); current_offs -= 8;
3650     __ ld(R19, current_offs, R1_SP); current_offs -= 8;
3651     __ ld(R18, current_offs, R1_SP); current_offs -= 8;
3652     __ ld(R17, current_offs, R1_SP); current_offs -= 8;
3653     __ ld(R16, current_offs, R1_SP); current_offs -= 8;
3654     __ ld(R15, current_offs, R1_SP); current_offs -= 8;
3655     __ ld(R14, current_offs, R1_SP);
3656 
3657     __ mr(ret, out);
3658     __ blr();
3659 
3660     return start;
3661   }
3662 
3663   /**
3664    * Arguments:
3665    *
3666    * Inputs:
3667    *   R3_ARG1    - int   crc
3668    *   R4_ARG2    - byte* buf
3669    *   R5_ARG3    - int   length (of buffer)
3670    *
3671    * scratch:
3672    *   R2, R6-R12
3673    *
3674    * Output:
3675    *   R3_RET     - int   crc result
3676    */
3677   // Compute CRC32 function.
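       // The same routine serves both CRC32 (polynomial 0x04C11DB7, as used by
       // java.util.zip.CRC32) and CRC32C (Castagnoli polynomial 0x1EDC6F41, as
       // used by java.util.zip.CRC32C); the is_crc32c flag derived from the stub
       // id is passed down to MacroAssembler::crc32 to select between them.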
3678   address generate_CRC32_updateBytes(StubGenStubId stub_id) {
3679     bool is_crc32c;
3680     switch (stub_id) {
3681     case updateBytesCRC32_id:
3682       is_crc32c = false;
3683       break;
3684     case updateBytesCRC32C_id:
3685       is_crc32c = true;
3686       break;
3687     default:
3688       ShouldNotReachHere();
3689     }
3690     __ align(CodeEntryAlignment);
3691     StubCodeMark mark(this, stub_id);
3692     address start = __ function_entry();  // Remember stub start address (is rtn value).
3693     __ crc32(R3_ARG1, R4_ARG2, R5_ARG3, R2, R6, R7, R8, R9, R10, R11, R12, is_crc32c);
3694     __ blr();
3695     return start;
3696   }
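
       // The next two stubs convert between float and float16 (IEEE 754 binary16:
       // 1 sign bit, 5 exponent bits, 10 fraction bits), backing
       // Float.floatToFloat16 and Float.float16ToFloat.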
3697 
3698   address generate_floatToFloat16() {
3699     __ align(CodeEntryAlignment);
3700     StubCodeMark mark(this, "StubRoutines", "floatToFloat16");
3701     address start = __ function_entry();
3702     __ f2hf(R3_RET, F1_ARG1, F0);
3703     __ blr();
3704     return start;
3705   }
3706 
3707   address generate_float16ToFloat() {
3708     __ align(CodeEntryAlignment);
3709     StubCodeMark mark(this, "StubRoutines", "float16ToFloat");
3710     address start = __ function_entry();
3711     __ hf2f(F1_RET, R3_ARG1);
3712     __ blr();
3713     return start;
3714   }
3715 
3716   address generate_method_entry_barrier() {
3717     __ align(CodeEntryAlignment);
3718     StubGenStubId stub_id = StubGenStubId::method_entry_barrier_id;
3719     StubCodeMark mark(this, stub_id);
3720 
3721     address stub_address = __ pc();
3722 
3723     int nbytes_save = MacroAssembler::num_volatile_regs * BytesPerWord;
3724     __ save_volatile_gprs(R1_SP, -nbytes_save, true);
3725 
3726     // Link register points to instruction in prologue of the guarded nmethod.
3727     // As the stub requires one layer of indirection (argument is of type address* and not address),
3728     // passing the link register's value directly doesn't work.
3729     // Since we have to save the link register on the stack anyway, we calculate the corresponding stack address
3730     // and pass that one instead.
3731     __ addi(R3_ARG1, R1_SP, _abi0(lr));
3732 
3733     __ save_LR(R0);
3734     __ push_frame_reg_args(nbytes_save, R0);
3735 
3736     __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSetNMethod::nmethod_stub_entry_barrier));
3737     __ mr(R0, R3_RET);
3738 
3739     __ pop_frame();
3740     __ restore_LR(R3_RET /* used as tmp register */);
3741     __ restore_volatile_gprs(R1_SP, -nbytes_save, true);
3742 
3743     __ cmpdi(CR0, R0, 0);
3744 
3745     // Return to prologue if no deoptimization is required (beqlr)
3746     __ bclr(Assembler::bcondCRbiIs1, Assembler::bi0(CR0, Assembler::equal), Assembler::bhintIsTaken);
3747 
3748     // Deoptimization required.
3749     // For actually handling the deoptimization, the 'wrong method stub' is invoked.
3750     __ load_const_optimized(R0, SharedRuntime::get_handle_wrong_method_stub());
3751     __ mtctr(R0);
3752 
3753     // Pop the frame built in the prologue.
3754     __ pop_frame();
3755 
3756     // Restore link register.  Required as the 'wrong method stub' needs the caller's frame
3757     // to properly deoptimize this method (e.g. by re-resolving the call site for compiled methods).
3758     // This method's prologue is aborted.
3759     __ restore_LR(R0);
3760 
3761     __ bctr();
3762     return stub_address;
3763   }
3764 
3765 #ifdef VM_LITTLE_ENDIAN
3766 // The following Base64 decode intrinsic is based on an algorithm outlined
3767 // in here:
3768 // http://0x80.pl/notesen/2016-01-17-sse-base64-decoding.html
3769 // in the section titled "Vector lookup (pshufb with bitmask)"
3770 //
3771 // This implementation differs in the following ways:
3772 //  * Power AltiVec VMX and VSX instructions are used instead of Intel SSE
3773 //    instructions.  It turns out that some of the vector operations
3774 //    needed in the algorithm require fewer AltiVec instructions.
3775 //  * The algorithm in the above mentioned paper doesn't handle the
3776 //    Base64-URL variant in RFC 4648.  Adjustments to both the code and to two
3777 //    lookup tables are needed for this.
3778 //  * The "Pack" section of the code is a complete rewrite for Power because we
3779 //    can utilize better instructions for this step.
3780 //
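     // As a reminder of the underlying mapping (RFC 4648): every 4 Base64
     // characters encode 24 bits, i.e. 3 bytes of binary data, so the vector loop
     // below turns 16 characters into 12 bytes per pass.  For example, the four
     // characters "TWFu" decode to the three bytes 0x4d 0x61 0x6e ("Man").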
3781 
3782 // Offsets per group of Base64 characters
3783 // Uppercase
3784 #define UC  (signed char)((-'A' + 0) & 0xff)
3785 // Lowercase
3786 #define LC  (signed char)((-'a' + 26) & 0xff)
3787 // Digits
3788 #define DIG (signed char)((-'0' + 52) & 0xff)
3789 // Plus sign (URL = 0)
3790 #define PLS (signed char)((-'+' + 62) & 0xff)
3791 // Hyphen (URL = 1)
3792 #define HYP (signed char)((-'-' + 62) & 0xff)
3793 // Slash (URL = 0)
3794 #define SLS (signed char)((-'/' + 63) & 0xff)
3795 // Underscore (URL = 1)
3796 #define US  (signed char)((-'_' + 63) & 0xff)
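
     // In the pre-Power10 path these offsets are added (vaddubm, i.e. modulo 256)
     // to the input characters to obtain their 6-bit values, e.g. 'A' + UC == 0,
     // 'z' + LC == 51, '9' + DIG == 61, '+' + PLS == 62 and '/' + SLS == 63
     // (with '-' and '_' taking the roles of '+' and '/' for base64url).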
3797 
3798 // For P10 (or later) only
3799 #define VALID_B64 0x80
3800 #define VB64(x) (VALID_B64 | x)
3801 
3802 #define BLK_OFFSETOF(x) (offsetof(constant_block, x))
3803 
3804 // In little-endian mode, the lxv instruction loads the element at EA into
3805 // element 15 of the vector register, EA+1 goes into element 14, and so
3806 // on.
3807 //
3808 // To make a look-up table easier to read, ARRAY_TO_LXV_ORDER reverses the
3809 // order of the elements in a vector initialization.
3810 #define ARRAY_TO_LXV_ORDER(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15) e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0
3811 
3812   //
3813   // Base64 decodeBlock intrinsic
3814   address generate_base64_decodeBlock() {
3815     __ align(CodeEntryAlignment);
3816     StubGenStubId stub_id = StubGenStubId::base64_decodeBlock_id;
3817     StubCodeMark mark(this, stub_id);
3818     address start   = __ function_entry();
3819 
3820     typedef struct {
3821       signed char offsetLUT_val[16];
3822       signed char offsetLUT_URL_val[16];
3823       unsigned char maskLUT_val[16];
3824       unsigned char maskLUT_URL_val[16];
3825       unsigned char bitposLUT_val[16];
3826       unsigned char table_32_47_val[16];
3827       unsigned char table_32_47_URL_val[16];
3828       unsigned char table_48_63_val[16];
3829       unsigned char table_64_79_val[16];
3830       unsigned char table_80_95_val[16];
3831       unsigned char table_80_95_URL_val[16];
3832       unsigned char table_96_111_val[16];
3833       unsigned char table_112_127_val[16];
3834       unsigned char pack_lshift_val[16];
3835       unsigned char pack_rshift_val[16];
3836       unsigned char pack_permute_val[16];
3837     } constant_block;
3838 
3839     alignas(16) static const constant_block const_block = {
3840 
3841       .offsetLUT_val = {
3842         ARRAY_TO_LXV_ORDER(
3843         0,   0, PLS, DIG,  UC,  UC,  LC,  LC,
3844         0,   0,   0,   0,   0,   0,   0,   0 ) },
3845 
3846       .offsetLUT_URL_val = {
3847         ARRAY_TO_LXV_ORDER(
3848         0,   0, HYP, DIG,  UC,  UC,  LC,  LC,
3849         0,   0,   0,   0,   0,   0,   0,   0 ) },
3850 
3851       .maskLUT_val = {
3852         ARRAY_TO_LXV_ORDER(
3853         /* 0        */ (unsigned char)0b10101000,
3854         /* 1 .. 9   */ (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000,
3855                        (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000,
3856                        (unsigned char)0b11111000,
3857         /* 10       */ (unsigned char)0b11110000,
3858         /* 11       */ (unsigned char)0b01010100,
3859         /* 12 .. 14 */ (unsigned char)0b01010000, (unsigned char)0b01010000, (unsigned char)0b01010000,
3860         /* 15       */ (unsigned char)0b01010100 ) },
3861 
3862       .maskLUT_URL_val = {
3863         ARRAY_TO_LXV_ORDER(
3864         /* 0        */ (unsigned char)0b10101000,
3865         /* 1 .. 9   */ (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000,
3866                        (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000,
3867                        (unsigned char)0b11111000,
3868         /* 10       */ (unsigned char)0b11110000,
3869         /* 11 .. 12 */ (unsigned char)0b01010000, (unsigned char)0b01010000,
3870         /* 13       */ (unsigned char)0b01010100,
3871         /* 14       */ (unsigned char)0b01010000,
3872         /* 15       */ (unsigned char)0b01110000 ) },
3873 
3874       .bitposLUT_val = {
3875         ARRAY_TO_LXV_ORDER(
3876         0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, (unsigned char)0x80,
3877         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 ) },
3878 
3879       // In the following table_*_val constants, a 0 value means the
3880       // character is not in the Base64 character set
3881       .table_32_47_val = {
3882         ARRAY_TO_LXV_ORDER (
3883          /* space .. '*' = 0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* '+' = 62 */ VB64(62), /* ',' .. '.' = 0 */ 0, 0, 0, /* '/' = 63 */ VB64(63) ) },
3884 
3885       .table_32_47_URL_val = {
3886         ARRAY_TO_LXV_ORDER(
3887          /* space .. ',' = 0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* '-' = 62 */ VB64(62), /* '.' .. '/' */ 0, 0 ) },
3888 
3889       .table_48_63_val = {
3890         ARRAY_TO_LXV_ORDER(
3891          /* '0' .. '9' = 52 .. 61 */ VB64(52), VB64(53), VB64(54), VB64(55), VB64(56), VB64(57), VB64(58), VB64(59), VB64(60), VB64(61),
3892          /* ':' .. '?' = 0 */ 0, 0, 0, 0, 0, 0 ) },
3893 
3894       .table_64_79_val = {
3895         ARRAY_TO_LXV_ORDER(
3896          /* '@' = 0 */ 0, /* 'A' .. 'O' = 0 .. 14 */ VB64(0), VB64(1), VB64(2), VB64(3), VB64(4), VB64(5), VB64(6), VB64(7), VB64(8),
3897          VB64(9), VB64(10), VB64(11), VB64(12), VB64(13), VB64(14) ) },
3898 
3899       .table_80_95_val = {
3900         ARRAY_TO_LXV_ORDER(/* 'P' .. 'Z' = 15 .. 25 */ VB64(15), VB64(16), VB64(17), VB64(18), VB64(19), VB64(20), VB64(21), VB64(22),
3901         VB64(23), VB64(24), VB64(25), /* '[' .. '_' = 0 */ 0, 0, 0, 0, 0 ) },
3902 
3903       .table_80_95_URL_val = {
3904         ARRAY_TO_LXV_ORDER(/* 'P' .. 'Z' = 15 .. 25 */ VB64(15), VB64(16), VB64(17), VB64(18), VB64(19), VB64(20), VB64(21), VB64(22),
3905         VB64(23), VB64(24), VB64(25), /* '[' .. '^' = 0 */ 0, 0, 0, 0, /* '_' = 63 */ VB64(63) ) },
3906 
3907       .table_96_111_val = {
3908         ARRAY_TO_LXV_ORDER(/* '`' = 0 */ 0, /* 'a' .. 'o' = 26 .. 40 */ VB64(26), VB64(27), VB64(28), VB64(29), VB64(30), VB64(31),
3909         VB64(32), VB64(33), VB64(34), VB64(35), VB64(36), VB64(37), VB64(38), VB64(39), VB64(40) ) },
3910 
3911       .table_112_127_val = {
3912         ARRAY_TO_LXV_ORDER(/* 'p' .. 'z' = 41 .. 51 */ VB64(41), VB64(42), VB64(43), VB64(44), VB64(45), VB64(46), VB64(47), VB64(48),
3913         VB64(49), VB64(50), VB64(51), /* '{' .. DEL = 0 */ 0, 0, 0, 0, 0 ) },
3914 
3915       .pack_lshift_val = {
3916         ARRAY_TO_LXV_ORDER(
3917         0, 6, 4, 2, 0, 6, 4, 2, 0, 6, 4, 2, 0, 6, 4, 2 ) },
3918 
3919       .pack_rshift_val = {
3920         ARRAY_TO_LXV_ORDER(
3921         0, 2, 4, 0, 0, 2, 4, 0, 0, 2, 4, 0, 0, 2, 4, 0 ) },
3922 
3923       // The first 4 index values are "don't care" because
3924       // we only use the first 12 bytes of the vector,
3925       // which are decoded from 16 bytes of Base64 characters.
3926       .pack_permute_val = {
3927         ARRAY_TO_LXV_ORDER(
3928          0, 0, 0, 0,
3929          0,  1,  2,
3930          4,  5,  6,
3931          8,  9, 10,
3932         12, 13, 14 ) }
3933     };
3934 
3935     const unsigned block_size = 16;  // number of bytes to process in each pass through the loop
3936     const unsigned block_size_shift = 4;
3937 
3938     // According to the ELF V2 ABI, registers r3-r12 are volatile and available for use without save/restore
3939     Register s      = R3_ARG1; // source starting address of Base64 characters
3940     Register sp     = R4_ARG2; // source offset
3941     Register sl     = R5_ARG3; // source length = # of Base64 characters to be processed
3942     Register d      = R6_ARG4; // destination address
3943     Register dp     = R7_ARG5; // destination offset
3944     Register isURL  = R8_ARG6; // boolean, if non-zero indicates use of RFC 4648 base64url encoding
3945     Register isMIME = R9_ARG7; // boolean, if non-zero indicates use of RFC 2045 MIME encoding - not used
3946 
3947     // Local variables
3948     Register const_ptr     = R9;  // used for loading constants
3949     Register tmp_reg       = R10; // used for speeding up load_const_optimized()
3950 
3951     // Re-use R9 and R10 to avoid using non-volatile registers (requires save/restore)
3952     Register out           = R9;  // moving out (destination) pointer
3953     Register in            = R10; // moving in (source) pointer
3954 
3955     // Volatile VSRs are 0..13, 32..51 (VR0..VR13)
3956     // VR Constants
3957     VectorRegister  vec_0s                  = VR0;
3958     VectorRegister  vec_4s                  = VR1;
3959     VectorRegister  vec_8s                  = VR2;
3960     VectorRegister  vec_special_case_char   = VR3;
3961     VectorRegister  pack_rshift             = VR4;
3962     VectorRegister  pack_lshift             = VR5;
3963 
3964     // VSR Constants
3965     VectorSRegister offsetLUT               = VSR0;
3966     VectorSRegister maskLUT                 = VSR1;
3967     VectorSRegister bitposLUT               = VSR2;
3968     VectorSRegister vec_0xfs                = VSR3;
3969     VectorSRegister vec_special_case_offset = VSR4;
3970     VectorSRegister pack_permute            = VSR5;
3971 
3972     // P10 (or later) VSR lookup constants
3973     VectorSRegister table_32_47             = VSR0;
3974     VectorSRegister table_48_63             = VSR1;
3975     VectorSRegister table_64_79             = VSR2;
3976     VectorSRegister table_80_95             = VSR3;
3977     VectorSRegister table_96_111            = VSR4;
3978     VectorSRegister table_112_127           = VSR6;
3979 
3980     // Data read in and later converted
3981     VectorRegister  input                   = VR6;
3982     // Variable for testing Base64 validity
3983     VectorRegister  non_match               = VR10;
3984 
3985     // P9 VR Variables for lookup
3986     VectorRegister  higher_nibble           = VR7;
3987     VectorRegister  eq_special_case_char    = VR8;
3988     VectorRegister  offsets                 = VR9;
3989 
3990     // P9 VSR lookup variables
3991     VectorSRegister bit                     = VSR6;
3992     VectorSRegister lower_nibble            = VSR7;
3993     VectorSRegister M                       = VSR8;
3994 
3995     // P10 (or later) VSR lookup variables
3996     VectorSRegister  xlate_a                = VSR7;
3997     VectorSRegister  xlate_b                = VSR8;
3998 
3999     // Variables for pack
4000     // VR
4001     VectorRegister  l                       = VR7;  // reuse higher_nibble's register
4002     VectorRegister  r                       = VR8;  // reuse eq_special_case_char's register
4003     VectorRegister  gathered                = VR10; // reuse non_match's register
4004 
4005     Label not_URL, calculate_size, loop_start, loop_exit, return_zero;
4006 
4007     // The upper 32 bits of the non-pointer parameter registers are not
4008     // guaranteed to be zero, so mask off those upper bits.
4009     __ clrldi(sp, sp, 32);
4010     __ clrldi(sl, sl, 32);
4011 
4012     // Don't handle the last 4 characters of the source, because this
4013     // VSX-based algorithm doesn't handle padding characters.  Also the
4014     // vector code will always write 16 bytes of decoded data on each pass,
4015     // but only the first 12 of those 16 bytes are valid data (16 base64
4016     // characters become 12 bytes of binary data), so for this reason we
4017     // need to subtract an additional 8 bytes from the source length, in
4018     // order not to write past the end of the destination buffer.  The
4019     // result of this subtraction implies that a Java function in the
4020     // Base64 class will be used to process the last 12 characters.
4021     __ sub(sl, sl, sp);
4022     __ subi(sl, sl, 12);
4023 
4024     // Load CTR with the number of passes through the loop
4025     // = sl >> block_size_shift.  After the shift, if sl <= 0, there's too
4026     // little data to be processed by this intrinsic.
4027     __ srawi_(sl, sl, block_size_shift);
4028     __ ble(CR0, return_zero);
4029     __ mtctr(sl);
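         // (With N characters available (sl - sp on entry), the loop below runs
         // floor((N - 12) / 16) times, consuming 16 characters and producing 12
         // bytes per pass; the remaining 12 to 27 characters, including any
         // padding, are left to the Java code.)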
4030 
4031     // Clear the upper 32 bits of the other two parameter registers.
4032     __ clrldi(isURL, isURL, 32);
4033     __ clrldi(dp, dp, 32);
4034 
4035     // Load constant vec registers that need to be loaded from memory
4036     __ load_const_optimized(const_ptr, (address)&const_block, tmp_reg);
4037     __ lxv(bitposLUT, BLK_OFFSETOF(bitposLUT_val), const_ptr);
4038     __ lxv(pack_rshift->to_vsr(), BLK_OFFSETOF(pack_rshift_val), const_ptr);
4039     __ lxv(pack_lshift->to_vsr(), BLK_OFFSETOF(pack_lshift_val), const_ptr);
4040     __ lxv(pack_permute, BLK_OFFSETOF(pack_permute_val), const_ptr);
4041 
4042     // Splat the constants that can use xxspltib
4043     __ xxspltib(vec_0s->to_vsr(), 0);
4044     __ xxspltib(vec_8s->to_vsr(), 8);
4045     if (PowerArchitecturePPC64 >= 10) {
4046       // Using VALID_B64 for the offsets effectively strips the upper bit
4047       // of each byte that was selected from the table.  Setting the upper
4048       // bit gives us a way to distinguish the 6-bit value of 0
4049       // from an error code of 0, which will happen if the character is
4050       // outside the range of the lookup, or is an illegal Base64
4051       // character, such as %.
4052       __ xxspltib(offsets->to_vsr(), VALID_B64);
4053 
4054       __ lxv(table_48_63, BLK_OFFSETOF(table_48_63_val), const_ptr);
4055       __ lxv(table_64_79, BLK_OFFSETOF(table_64_79_val), const_ptr);
4056       __ lxv(table_80_95, BLK_OFFSETOF(table_80_95_val), const_ptr);
4057       __ lxv(table_96_111, BLK_OFFSETOF(table_96_111_val), const_ptr);
4058       __ lxv(table_112_127, BLK_OFFSETOF(table_112_127_val), const_ptr);
4059     } else {
4060       __ xxspltib(vec_4s->to_vsr(), 4);
4061       __ xxspltib(vec_0xfs, 0xf);
4062       __ lxv(bitposLUT, BLK_OFFSETOF(bitposLUT_val), const_ptr);
4063     }
4064 
4065     // The rest of the constants use different values depending on the
4066     // setting of isURL
4067     __ cmpwi(CR0, isURL, 0);
4068     __ beq(CR0, not_URL);
4069 
4070     // isURL != 0 (true)
4071     if (PowerArchitecturePPC64 >= 10) {
4072       __ lxv(table_32_47, BLK_OFFSETOF(table_32_47_URL_val), const_ptr);
4073       __ lxv(table_80_95, BLK_OFFSETOF(table_80_95_URL_val), const_ptr);
4074     } else {
4075       __ lxv(offsetLUT, BLK_OFFSETOF(offsetLUT_URL_val), const_ptr);
4076       __ lxv(maskLUT, BLK_OFFSETOF(maskLUT_URL_val), const_ptr);
4077       __ xxspltib(vec_special_case_char->to_vsr(), '_');
4078       __ xxspltib(vec_special_case_offset, (unsigned char)US);
4079     }
4080     __ b(calculate_size);
4081 
4082     // isURL = 0 (false)
4083     __ bind(not_URL);
4084     if (PowerArchitecturePPC64 >= 10) {
4085       __ lxv(table_32_47, BLK_OFFSETOF(table_32_47_val), const_ptr);
4086       __ lxv(table_80_95, BLK_OFFSETOF(table_80_95_val), const_ptr);
4087     } else {
4088       __ lxv(offsetLUT, BLK_OFFSETOF(offsetLUT_val), const_ptr);
4089       __ lxv(maskLUT, BLK_OFFSETOF(maskLUT_val), const_ptr);
4090       __ xxspltib(vec_special_case_char->to_vsr(), '/');
4091       __ xxspltib(vec_special_case_offset, (unsigned char)SLS);
4092     }
4093 
4094     __ bind(calculate_size);
4095 
4096     // out starts at d + dp
4097     __ add(out, d, dp);
4098 
4099     // in starts at s + sp
4100     __ add(in, s, sp);
4101 
4102     __ align(32);
4103     __ bind(loop_start);
4104     __ lxv(input->to_vsr(), 0, in); // offset=0
4105 
4106     //
4107     // Lookup
4108     //
4109     if (PowerArchitecturePPC64 >= 10) {
4110       // Use xxpermx to do a lookup of each Base64 character in the
4111       // input vector and translate it to a 6-bit value + 0x80.
4112       // Characters which are not valid Base64 characters will result
4113       // in a zero in the corresponding byte.
4114       //
4115       // Note that due to the align(32) call above, the xxpermx instructions do
4116       // not require align_prefix() calls, since the final xxpermx
4117       // prefix+opcode is at byte 24.
4118       __ xxpermx(xlate_a, table_32_47, table_48_63, input->to_vsr(), 1);    // offset=4
4119       __ xxpermx(xlate_b, table_64_79, table_80_95, input->to_vsr(), 2);    // offset=12
4120       __ xxlor(xlate_b, xlate_a, xlate_b);                                  // offset=20
4121       __ xxpermx(xlate_a, table_96_111, table_112_127, input->to_vsr(), 3); // offset=24
4122       __ xxlor(input->to_vsr(), xlate_a, xlate_b);
4123       // Check for non-Base64 characters by comparing each byte to zero.
4124       __ vcmpequb_(non_match, input, vec_0s);
4125     } else {
4126       // Isolate the upper 4 bits of each character by shifting it right 4 bits
4127       __ vsrb(higher_nibble, input, vec_4s);
4128       // Isolate the lower 4 bits by masking
4129       __ xxland(lower_nibble, input->to_vsr(), vec_0xfs);
4130 
4131       // Get the offset (the value to subtract from the byte) by using
4132       // a lookup table indexed by the upper 4 bits of the character
4133       __ xxperm(offsets->to_vsr(), offsetLUT, higher_nibble->to_vsr());
4134 
4135       // Find out which elements are the special case character (isURL ? '/' : '-')
4136       __ vcmpequb(eq_special_case_char, input, vec_special_case_char);
4137 
4138       // For each character in the input which is a special case
4139       // character, replace its offset with one that is special for that
4140       // character.
4141       __ xxsel(offsets->to_vsr(), offsets->to_vsr(), vec_special_case_offset, eq_special_case_char->to_vsr());
4142 
4143       // Use the lower_nibble to select a mask "M" from the lookup table.
4144       __ xxperm(M, maskLUT, lower_nibble);
4145 
4146       // "bit" is used to isolate which of the bits in M is relevant.
4147       __ xxperm(bit, bitposLUT, higher_nibble->to_vsr());
4148 
4149       // Each element of non_match corresponds to one of the 16 input
4150       // characters.  Those elements that become 0x00 after the xxland
4151       // instruction are invalid Base64 characters.
4152       __ xxland(non_match->to_vsr(), M, bit);
4153 
4154       // Compare each element to zero
4155       //
4156       __ vcmpequb_(non_match, non_match, vec_0s);
4157     }
4158     // vcmpequb_ sets the EQ bit of CR6 if no elements compare equal.
4159     // Any element comparing equal to zero means there is an error in
4160     // that element.  Note that the comparison result register
4161     // non_match is not referenced again.  Only CR6-EQ matters.
4162     __ bne_predict_not_taken(CR6, loop_exit);
4163 
4164     // The Base64 characters had no errors, so add the offsets, which in
4165     // the case of Power10 is a constant vector of all 0x80's (see earlier
4166     // comment where the offsets register is loaded).
4167     __ vaddubm(input, input, offsets);
4168 
4169     // Pack
4170     //
4171     // In the tables below, b0, b1, .. b15 are the bytes of decoded
4172     // binary data, the first line of each of the cells (except for
4173     // the constants) uses the bit-field nomenclature from the
4174     // above-linked paper, whereas the second line is more specific
4175     // about which exact bits are present, and is constructed using the
4176     // Power ISA 3.x document style, where:
4177     //
4178     // * The specifier after the colon depicts which bits are there.
4179     // * The bit numbering is big endian style (bit 0 is the most
4180     //   significant).
4181     // * || is a concatenate operator.
4182     // * Strings of 0's are a field of zeros with the shown length, and
4183     //   likewise for strings of 1's.
4184 
4185     // Note that only e12..e15 are shown here because the shifting
4186     // and OR'ing pattern replicates for e8..e11, e4..7, and
4187     // e0..e3.
4188     //
4189     // +======================+=================+======================+======================+=============+
4190     // |        Vector        |       e12       |         e13          |         e14          |     e15     |
4191     // |       Element        |                 |                      |                      |             |
4192     // +======================+=================+======================+======================+=============+
4193     // |    after vaddubm     |    00dddddd     |       00cccccc       |       00bbbbbb       |  00aaaaaa   |
4194     // |                      |   00||b2:2..7   | 00||b1:4..7||b2:0..1 | 00||b0:6..7||b1:0..3 | 00||b0:0..5 |
4195     // +----------------------+-----------------+----------------------+----------------------+-------------+
4196     // |     pack_lshift      |                 |         << 6         |         << 4         |    << 2     |
4197     // +----------------------+-----------------+----------------------+----------------------+-------------+
4198     // |     l after vslb     |    00dddddd     |       cc000000       |       bbbb0000       |  aaaaaa00   |
4199     // |                      |   00||b2:2..7   |   b2:0..1||000000    |    b1:0..3||0000     | b0:0..5||00 |
4200     // +----------------------+-----------------+----------------------+----------------------+-------------+
4201     // |     l after vslo     |    cc000000     |       bbbb0000       |       aaaaaa00       |  00000000   |
4202     // |                      | b2:0..1||000000 |    b1:0..3||0000     |     b0:0..5||00      |  00000000   |
4203     // +----------------------+-----------------+----------------------+----------------------+-------------+
4204     // |     pack_rshift      |                 |         >> 2         |         >> 4         |             |
4205     // +----------------------+-----------------+----------------------+----------------------+-------------+
4206     // |     r after vsrb     |    00dddddd     |       0000cccc       |       000000bb       |  00aaaaaa   |
4207     // |                      |   00||b2:2..7   |    0000||b1:4..7     |   000000||b0:6..7    | 00||b0:0..5 |
4208     // +----------------------+-----------------+----------------------+----------------------+-------------+
4209     // | gathered after xxlor |    ccdddddd     |       bbbbcccc       |       aaaaaabb       |  00aaaaaa   |
4210     // |                      |     b2:0..7     |       b1:0..7        |       b0:0..7        | 00||b0:0..5 |
4211     // +======================+=================+======================+======================+=============+
4212     //
4213     // Note: the above-linked paper has a typo; it shows the result of the gathering process as:
4214     // [ddddddcc|bbbbcccc|aaaaaabb]
4215     // but it should be:
4216     // [ccdddddd|bbbbcccc|aaaaaabb]
4217     //
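         // As a concrete worked example (an illustration added here, not taken from
         // the paper): for the 4-character group "TWFu" the looked-up 6-bit values
         // are 'T'=19, 'W'=22, 'F'=5, 'u'=46, i.e. aaaaaa=010011, bbbbbb=010110,
         // cccccc=000101, dddddd=101110, and the gathered bytes are
         //   b0 = aaaaaabb = 01001101 = 'M'
         //   b1 = bbbbcccc = 01100001 = 'a'
         //   b2 = ccdddddd = 01101110 = 'n'
         // i.e. "TWFu" decodes back to "Man".
         //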
4218     __ vslb(l, input, pack_lshift);
4219     // vslo of vec_8s shifts the vector by one octet toward lower
4220     // element numbers, discarding element 0.  This means it actually
4221     // shifts to the right (not left) according to the order of the
4222     // table above.
4223     __ vslo(l, l, vec_8s);
4224     __ vsrb(r, input, pack_rshift);
4225     __ xxlor(gathered->to_vsr(), l->to_vsr(), r->to_vsr());
4226 
4227     // Final rearrangement of bytes into their correct positions.
4228     // +==============+======+======+======+======+=====+=====+====+====+====+====+=====+=====+=====+=====+=====+=====+
4229     // |    Vector    |  e0  |  e1  |  e2  |  e3  | e4  | e5  | e6 | e7 | e8 | e9 | e10 | e11 | e12 | e13 | e14 | e15 |
4230     // |   Elements   |      |      |      |      |     |     |    |    |    |    |     |     |     |     |     |     |
4231     // +==============+======+======+======+======+=====+=====+====+====+====+====+=====+=====+=====+=====+=====+=====+
4232     // | after xxlor  | b11  | b10  |  b9  |  xx  | b8  | b7  | b6 | xx | b5 | b4 | b3  | xx  | b2  | b1  | b0  | xx  |
4233     // +--------------+------+------+------+------+-----+-----+----+----+----+----+-----+-----+-----+-----+-----+-----+
4234     // | pack_permute |  0   |  0   |  0   |  0   |  0  |  1  | 2  | 4  | 5  | 6  |  8  |  9  | 10  | 12  | 13  | 14  |
4235     // +--------------+------+------+------+------+-----+-----+----+----+----+----+-----+-----+-----+-----+-----+-----+
4236     // | after xxperm | b11* | b11* | b11* | b11* | b11 | b10 | b9 | b8 | b7 | b6 | b5  | b4  | b3  | b2  | b1  | b0  |
4237     // +==============+======+======+======+======+=====+=====+====+====+====+====+=====+=====+=====+=====+=====+=====+
4238     // xx bytes are not used to form the final data
4239     // b0..b11 are the decoded and reassembled 8-bit bytes of data
4240     // b11 with asterisk is a "don't care", because these bytes will be
4241     // overwritten on the next iteration.
4242     __ xxperm(gathered->to_vsr(), gathered->to_vsr(), pack_permute);
4243 
4244     // We cannot use a static displacement on the store: out advances by 12 bytes
4245     // per iteration, and stxv requires a displacement that is a multiple of 16.
4246     // Note that this stxv actually writes 16 bytes; only the first 12 are valid data.
4247     __ stxv(gathered->to_vsr(), 0, out);
4248     __ addi(out, out, 12);
4249     __ addi(in, in, 16);
4250     __ bdnz(loop_start);
4251 
4252     __ bind(loop_exit);
4253 
4254     // Return the number of out bytes produced, which is (out - (d + dp)) == out - d - dp;
4255     __ sub(R3_RET, out, d);
4256     __ sub(R3_RET, R3_RET, dp);
4257 
4258     __ blr();
4259 
4260     __ bind(return_zero);
4261     __ li(R3_RET, 0);
4262     __ blr();
4263 
4264     return start;
4265   }
4266 
4267 #undef UC
4268 #undef LC
4269 #undef DIG
4270 #undef PLS
4271 #undef HYP
4272 #undef SLS
4273 #undef US
4274 
4275 // This algorithm is based on the methods described in this paper:
4276 // http://0x80.pl/notesen/2016-01-12-sse-base64-encoding.html
4277 //
4278 // The details of this implementation vary from the paper due to the
4279 // difference in the ISA between SSE and AltiVec, especially in the
4280 // splitting bytes section where there is no need on Power to mask after
4281 // the shift because the shift is byte-wise rather than across the entire
4282 // 128-bit word.
4283 //
4284 // For the lookup part of the algorithm, different logic is used than
4285 // described in the paper because of the availability of vperm, which can
4286 // do a 64-byte table lookup in four instructions, while preserving the
4287 // branchless nature.
4288 //
4289 // Description of the ENCODE_CORE macro
4290 //
4291 // Expand first 12 x 8-bit data bytes into 16 x 6-bit bytes (upper 2
4292 // bits of each byte are zeros)
4293 //
4294 // (Note: e7..e0 are not shown because they follow the same pattern as
4295 // e8..e15)
4296 //
4297 // In the table below, b0, b1, .. b15 are the bytes of unencoded
4298 // binary data, the first line of each of the cells (except for
4299 // the constants) uses the bit-field nomenclature from the
4300 // above-linked paper, whereas the second line is more specific
4301 // about which exact bits are present, and is constructed using the
4302 // Power ISA 3.x document style, where:
4303 //
4304 // * The specifier after the colon depicts which bits are there.
4305 // * The bit numbering is big endian style (bit 0 is the most
4306 //   significant).
4307 // * || is a concatenate operator.
4308 // * Strings of 0's are a field of zeros with the shown length, and
4309 //   likewise for strings of 1's.
4310 //
4311 // +==========================+=============+======================+======================+=============+=============+======================+======================+=============+
4312 // |          Vector          |     e8      |          e9          |         e10          |     e11     |     e12     |         e13          |         e14          |     e15     |
4313 // |         Element          |             |                      |                      |             |             |                      |                      |             |
4314 // +==========================+=============+======================+======================+=============+=============+======================+======================+=============+
4315 // |        after lxv         |  jjjjkkkk   |       iiiiiijj       |       gghhhhhh       |  ffffgggg   |  eeeeeeff   |       ccdddddd       |       bbbbcccc       |  aaaaaabb   |
4316 // |                          |     b7      |          b6          |          b5          |     b4      |     b3      |          b2          |          b1          |     b0      |
4317 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
4318 // |      xxperm indexes      |      0      |          10          |          11          |     12      |      0      |          13          |          14          |     15      |
4319 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
4320 // |     (1) after xxperm     |             |       gghhhhhh       |       ffffgggg       |  eeeeeeff   |             |       ccdddddd       |       bbbbcccc       |  aaaaaabb   |
4321 // |                          |    (b15)    |          b5          |          b4          |     b3      |    (b15)    |          b2          |          b1          |     b0      |
4322 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
4323 // |      rshift_amount       |      0      |          6           |          4           |      2      |      0      |          6           |          4           |      2      |
4324 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
4325 // |        after vsrb        |             |       000000gg       |       0000ffff       |  00eeeeee   |             |       000000cc       |       0000bbbb       |  00aaaaaa   |
4326 // |                          |    (b15)    |   000000||b5:0..1    |    0000||b4:0..3     | 00||b3:0..5 |    (b15)    |   000000||b2:0..1    |    0000||b1:0..3     | 00||b0:0..5 |
4327 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
4328 // |       rshift_mask        |  00000000   |      000000||11      |      0000||1111      | 00||111111  |  00000000   |      000000||11      |      0000||1111      | 00||111111  |
4329 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
4330 // |    rshift after vand     |  00000000   |       000000gg       |       0000ffff       |  00eeeeee   |  00000000   |       000000cc       |       0000bbbb       |  00aaaaaa   |
4331 // |                          |  00000000   |   000000||b5:0..1    |    0000||b4:0..3     | 00||b3:0..5 |  00000000   |   000000||b2:0..1    |    0000||b1:0..3     | 00||b0:0..5 |
4332 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
4333 // |    1 octet lshift (1)    |  gghhhhhh   |       ffffgggg       |       eeeeeeff       |             |  ccdddddd   |       bbbbcccc       |       aaaaaabb       |  00000000   |
4334 // |                          |     b5      |          b4          |          b3          |    (b15)    |     b2      |          b1          |          b0          |  00000000   |
4335 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
4336 // |      lshift_amount       |      0      |          2           |          4           |      0      |      0      |          2           |          4           |      0      |
4337 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
4338 // |        after vslb        |  gghhhhhh   |       ffgggg00       |       eeff0000       |             |  ccdddddd   |       bbcccc00       |       aabb0000       |  00000000   |
4339 // |                          |     b5      |     b4:2..7||00      |    b3:4..7||0000     |    (b15)    |   b2:0..7   |     b1:2..7||00      |    b0:4..7||0000     |  00000000   |
4340 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
4341 // |       lshift_mask        | 00||111111  |     00||1111||00     |     00||11||0000     |  00000000   | 00||111111  |     00||1111||00     |     00||11||0000     |  00000000   |
4342 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
4343 // |    lshift after vand     |  00hhhhhh   |       00gggg00       |       00ff0000       |  00000000   |  00dddddd   |       00cccc00       |       00bb0000       |  00000000   |
4344 // |                          | 00||b5:2..7 |   00||b4:4..7||00    |  00||b3:6..7||0000   |  00000000   | 00||b2:2..7 |   00||b1:4..7||00    |  00||b0:6..7||0000   |  00000000   |
4345 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
4346 // | after vor lshift, rshift |  00hhhhhh   |       00gggggg       |       00ffffff       |  00eeeeee   |  00dddddd   |       00cccccc       |       00bbbbbb       |  00aaaaaa   |
4347 // |                          | 00||b5:2..7 | 00||b4:4..7||b5:0..1 | 00||b3:6..7||b4:0..3 | 00||b3:0..5 | 00||b2:2..7 | 00||b1:4..7||b2:0..1 | 00||b0:6..7||b1:0..3 | 00||b0:0..5 |
4348 // +==========================+=============+======================+======================+=============+=============+======================+======================+=============+
4349 //
4350 // Expand the first 12 bytes into 16 bytes, leaving every 4th byte
4351 // blank for now.
4352 // __ xxperm(input->to_vsr(), input->to_vsr(), expand_permute);
4353 //
4354 // Generate two bit-shifted pieces - rshift and lshift - that will
4355 // later be OR'd together.
4356 //
4357 // First the right-shifted piece
4358 // __ vsrb(rshift, input, expand_rshift);
4359 // __ vand(rshift, rshift, expand_rshift_mask);
4360 //
4361 // Now the left-shifted piece, which is done by octet shifting
4362 // the input one byte to the left, then doing a variable shift,
4363 // followed by a mask operation.
4364 //
4365 // __ vslo(lshift, input, vec_8s);
4366 // __ vslb(lshift, lshift, expand_lshift);
4367 // __ vand(lshift, lshift, expand_lshift_mask);
4368 //
4369 // Combine the two pieces by OR'ing
4370 // __ vor(expanded, rshift, lshift);
4371 //
4372 // At this point, expanded is a vector containing a 6-bit value in each
4373 // byte.  These values are used as indexes into a 64-byte lookup table that
4374 // is contained in four vector registers.  The lookup operation is done
4375 // using vperm instructions with the same indexes for the lower 32 and
4376 // upper 32 bytes.  To figure out which of the two looked-up bytes to use
4377 // at each location, all values in expanded are compared to 31.  Using
4378 // vsel, values higher than 31 use the results from the upper 32 bytes of
4379 // the lookup operation, while values less than or equal to 31 use the
4380 // lower 32 bytes of the lookup operation.
4381 //
4382 // Note: it's tempting to use a xxpermx,xxpermx,vor sequence here on
4383 // Power10 (or later), but experiments doing so on Power10 yielded a slight
4384 // performance drop, perhaps due to the need for xxpermx instruction
4385 // prefixes.
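     //
     // As an illustrative scalar summary (a sketch added here, not code that is
     // assembled): for each 3 input bytes b0, b1, b2 the four 6-bit indexes
     // produced by the expansion are
     //   i0 = b0 >> 2
     //   i1 = ((b0 & 0x3) << 4) | (b1 >> 4)
     //   i2 = ((b1 & 0xf) << 2) | (b2 >> 6)
     //   i3 = b2 & 0x3f
     // and the lookup then maps each index through the 64-entry alphabet, e.g.
     // "Man" (0x4D, 0x61, 0x6E) yields indexes 19, 22, 5, 46, which encode to "TWFu".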
4386 
4387 #define ENCODE_CORE                                                        \
4388     __ xxperm(input->to_vsr(), input->to_vsr(), expand_permute);           \
4389     __ vsrb(rshift, input, expand_rshift);                                 \
4390     __ vand(rshift, rshift, expand_rshift_mask);                           \
4391     __ vslo(lshift, input, vec_8s);                                        \
4392     __ vslb(lshift, lshift, expand_lshift);                                \
4393     __ vand(lshift, lshift, expand_lshift_mask);                           \
4394     __ vor(expanded, rshift, lshift);                                      \
4395     __ vperm(encoded_00_31, vec_base64_00_15, vec_base64_16_31, expanded); \
4396     __ vperm(encoded_32_63, vec_base64_32_47, vec_base64_48_63, expanded); \
4397     __ vcmpgtub(gt_31, expanded, vec_31s);                                 \
4398     __ vsel(expanded, encoded_00_31, encoded_32_63, gt_31);
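     // ENCODE_CORE leaves the 16 encoded Base64 characters in 'expanded'; the
     // callers below store them with stxv or stxvl.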
4399 
4400 // Intrinsic function prototype in Base64.java:
4401 // private void encodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL) {
4402 
4403   address generate_base64_encodeBlock() {
4404     __ align(CodeEntryAlignment);
4405     StubGenStubId stub_id = StubGenStubId::base64_encodeBlock_id;
4406     StubCodeMark mark(this, stub_id);
4407     address start   = __ function_entry();
4408 
4409     typedef struct {
4410       unsigned char expand_permute_val[16];
4411       unsigned char expand_rshift_val[16];
4412       unsigned char expand_rshift_mask_val[16];
4413       unsigned char expand_lshift_val[16];
4414       unsigned char expand_lshift_mask_val[16];
4415       unsigned char base64_00_15_val[16];
4416       unsigned char base64_16_31_val[16];
4417       unsigned char base64_32_47_val[16];
4418       unsigned char base64_48_63_val[16];
4419       unsigned char base64_48_63_URL_val[16];
4420     } constant_block;
4421 
4422     alignas(16) static const constant_block const_block = {
4423       .expand_permute_val = {
4424         ARRAY_TO_LXV_ORDER(
4425         0,  4,  5,  6,
4426         0,  7,  8,  9,
4427         0, 10, 11, 12,
4428         0, 13, 14, 15 ) },
4429 
4430       .expand_rshift_val = {
4431         ARRAY_TO_LXV_ORDER(
4432         0, 6, 4, 2,
4433         0, 6, 4, 2,
4434         0, 6, 4, 2,
4435         0, 6, 4, 2 ) },
4436 
4437       .expand_rshift_mask_val = {
4438         ARRAY_TO_LXV_ORDER(
4439         0b00000000, 0b00000011, 0b00001111, 0b00111111,
4440         0b00000000, 0b00000011, 0b00001111, 0b00111111,
4441         0b00000000, 0b00000011, 0b00001111, 0b00111111,
4442         0b00000000, 0b00000011, 0b00001111, 0b00111111 ) },
4443 
4444       .expand_lshift_val = {
4445         ARRAY_TO_LXV_ORDER(
4446         0, 2, 4, 0,
4447         0, 2, 4, 0,
4448         0, 2, 4, 0,
4449         0, 2, 4, 0 ) },
4450 
4451       .expand_lshift_mask_val = {
4452         ARRAY_TO_LXV_ORDER(
4453         0b00111111, 0b00111100, 0b00110000, 0b00000000,
4454         0b00111111, 0b00111100, 0b00110000, 0b00000000,
4455         0b00111111, 0b00111100, 0b00110000, 0b00000000,
4456         0b00111111, 0b00111100, 0b00110000, 0b00000000 ) },
4457 
4458       .base64_00_15_val = {
4459         ARRAY_TO_LXV_ORDER(
4460         'A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P' ) },
4461 
4462       .base64_16_31_val = {
4463         ARRAY_TO_LXV_ORDER(
4464         'Q','R','S','T','U','V','W','X','Y','Z','a','b','c','d','e','f' ) },
4465 
4466       .base64_32_47_val = {
4467         ARRAY_TO_LXV_ORDER(
4468         'g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v' ) },
4469 
4470       .base64_48_63_val = {
4471         ARRAY_TO_LXV_ORDER(
4472         'w','x','y','z','0','1','2','3','4','5','6','7','8','9','+','/' ) },
4473 
4474       .base64_48_63_URL_val = {
4475         ARRAY_TO_LXV_ORDER(
4476         'w','x','y','z','0','1','2','3','4','5','6','7','8','9','-','_' ) }
4477     };
4478 
4479     // Number of bytes to process in each pass through the main loop.
4480     // 12 of the 16 bytes from each lxv are encoded to 16 Base64 bytes.
4481     const unsigned block_size = 12;
4482 
4483     // According to the ELF V2 ABI, registers r3-r12 are volatile and available for use without save/restore
4484     Register src       = R3_ARG1; // source starting address of Base64 characters
4485     Register sp        = R4_ARG2; // source starting position
4486     Register sl        = R5_ARG3; // total source length of the Base64 characters to be processed
4487     Register dst       = R6_ARG4; // destination address
4488     Register dp        = R7_ARG5; // destination starting position
4489     Register isURL     = R8_ARG6; // boolean, if non-zero indicates use of RFC 4648 base64url encoding
4490 
4491     // Local variables
4492     Register const_ptr     = R12; // used for loading constants (reuses isURL's register)
4493     Register tmp_reg       = R9;  // used for speeding up load_const_optimized()
4494 
4495     Register size           = R9;  // number of bytes to process (reuses tmp_reg's register)
4496     Register blocked_size   = R10; // number of bytes to process a block at a time
4497     Register block_modulo   = R12; // == block_size (reuse const_ptr)
4498     Register remaining      = R12; // bytes remaining to process after the blocks are completed (reuse block_modulo's reg)
4499     Register in             = R4;  // current input (source) pointer (reuse sp's register)
4500     Register num_blocks     = R11; // number of blocks to be processed by the loop
4501     Register out            = R8;  // current output (destination) pointer (reuse const_ptr's register)
4502     Register three          = R9;  // constant divisor (reuse size's register)
4503     Register bytes_to_write = R10; // number of bytes to write with the stxvl instr (reuse blocked_size's register)
4504     Register tmp1           = R7;  // temp register for lxvl length (reuse dp's register)
4505     Register modulo_chars   = R7;  // number of bytes written during the final write % 4 (reuse tmp1's register)
4506     Register pad_char       = R6;  // literal '=' (reuse dst's register)
4507 
4508     // Volatile VSRs are 0..13, 32..51 (VR0..VR19)
4509     // VR Constants
4510     VectorRegister  vec_8s             = VR0;
4511     VectorRegister  vec_31s            = VR1;
4512     VectorRegister  vec_base64_00_15   = VR2;
4513     VectorRegister  vec_base64_16_31   = VR3;
4514     VectorRegister  vec_base64_32_47   = VR4;
4515     VectorRegister  vec_base64_48_63   = VR5;
4516     VectorRegister  expand_rshift      = VR6;
4517     VectorRegister  expand_rshift_mask = VR7;
4518     VectorRegister  expand_lshift      = VR8;
4519     VectorRegister  expand_lshift_mask = VR9;
4520 
4521     // VR variables for expand
4522     VectorRegister  input              = VR10;
4523     VectorRegister  rshift             = VR11;
4524     VectorRegister  lshift             = VR12;
4525     VectorRegister  expanded           = VR13;
4526 
4527     // VR variables for lookup
4528     VectorRegister  encoded_00_31      = VR10; // (reuse input)
4529     VectorRegister  encoded_32_63      = VR11; // (reuse rshift)
4530     VectorRegister  gt_31              = VR12; // (reuse lshift)
4531 
4532     // VSR Constants
4533     VectorSRegister expand_permute     = VSR0;
4534 
4535     Label not_URL, calculate_size, calculate_blocked_size, skip_loop;
4536     Label loop_start, le_16_to_write, no_pad, one_pad_char;
4537 
4538     // The upper 32 bits of the non-pointer parameter registers are not
4539     // guaranteed to be zero, so mask off those upper bits.
4540     __ clrldi(sp, sp, 32);
4541     __ clrldi(sl, sl, 32);
4542     __ clrldi(dp, dp, 32);
4543     __ clrldi(isURL, isURL, 32);
4544 
4545     // load up the constants
4546     __ load_const_optimized(const_ptr, (address)&const_block, tmp_reg);
4547     __ lxv(expand_permute,               BLK_OFFSETOF(expand_permute_val),     const_ptr);
4548     __ lxv(expand_rshift->to_vsr(),      BLK_OFFSETOF(expand_rshift_val),      const_ptr);
4549     __ lxv(expand_rshift_mask->to_vsr(), BLK_OFFSETOF(expand_rshift_mask_val), const_ptr);
4550     __ lxv(expand_lshift->to_vsr(),      BLK_OFFSETOF(expand_lshift_val),      const_ptr);
4551     __ lxv(expand_lshift_mask->to_vsr(), BLK_OFFSETOF(expand_lshift_mask_val), const_ptr);
4552     __ lxv(vec_base64_00_15->to_vsr(),   BLK_OFFSETOF(base64_00_15_val),       const_ptr);
4553     __ lxv(vec_base64_16_31->to_vsr(),   BLK_OFFSETOF(base64_16_31_val),       const_ptr);
4554     __ lxv(vec_base64_32_47->to_vsr(),   BLK_OFFSETOF(base64_32_47_val),       const_ptr);
4555 
4556     // Splat the constants that can use xxspltib
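         // (vec_8s is the one-octet shift operand used by vslo in ENCODE_CORE;
         //  vec_31s is the vcmpgtub threshold that selects between the lower and
         //  upper halves of the 64-byte Base64 alphabet.)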
4557     __ xxspltib(vec_8s->to_vsr(), 8);
4558     __ xxspltib(vec_31s->to_vsr(), 31);
4559 
4560 
4561     // Use a different translation lookup table depending on the
4562     // setting of isURL
4563     __ cmpdi(CR0, isURL, 0);
4564     __ beq(CR0, not_URL);
4565     __ lxv(vec_base64_48_63->to_vsr(), BLK_OFFSETOF(base64_48_63_URL_val), const_ptr);
4566     __ b(calculate_size);
4567 
4568     __ bind(not_URL);
4569     __ lxv(vec_base64_48_63->to_vsr(), BLK_OFFSETOF(base64_48_63_val), const_ptr);
4570 
4571     __ bind(calculate_size);
4572 
4573     // size = sl - sp - 4 (*)
4574     // (*) Don't process the last four bytes in the main loop because
4575     // we don't want the lxv instruction to read past the end of the src
4576     // data, in case those four bytes are on the start of an unmapped or
4577     // otherwise inaccessible page.
4578     //
4579     __ sub(size, sl, sp);
4580     __ subi(size, size, 4);
4581     __ cmpdi(CR7, size, block_size);
4582     __ bgt(CR7, calculate_blocked_size);
4583     __ mr(remaining, size);
4584     // Add the 4 back into remaining again
4585     __ addi(remaining, remaining, 4);
4586     // make "in" point to the beginning of the source data: in = src + sp
4587     __ add(in, src, sp);
4588     // out = dst + dp
4589     __ add(out, dst, dp);
4590     __ b(skip_loop);
4591 
4592     __ bind(calculate_blocked_size);
4593     __ li(block_modulo, block_size);
4594     // num_blocks = size / block_modulo
4595     __ divwu(num_blocks, size, block_modulo);
4596     // blocked_size = num_blocks * block_modulo
4597     __ mullw(blocked_size, num_blocks, block_modulo);
4598     // remaining = size - blocked_size
4599     __ sub(remaining, size, blocked_size);
4600     __ mtctr(num_blocks);
4601 
4602     // Add the 4 back in to remaining again
4603     __ addi(remaining, remaining, 4);
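         // Example: size = 100 gives num_blocks = 8, blocked_size = 96, and
         // remaining = 100 - 96 + 4 = 8.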
4604 
4605     // make "in" point to the beginning of the source data: in = src + sp
4606     __ add(in, src, sp);
4607 
4608     // out = dst + dp
4609     __ add(out, dst, dp);
4610 
4611     __ align(32);
4612     __ bind(loop_start);
4613 
4614     __ lxv(input->to_vsr(), 0, in);
4615 
4616     ENCODE_CORE
4617 
4618     __ stxv(expanded->to_vsr(), 0, out);
4619     __ addi(in, in, 12);
4620     __ addi(out, out, 16);
4621     __ bdnz(loop_start);
4622 
4623     __ bind(skip_loop);
4624 
4625     // When there are less than 16 bytes left, we need to be careful not to
4626     // read beyond the end of the src buffer, which might be in an unmapped
4627     // page.
4628     // Load the remaining bytes using lxvl.
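         // lxvl takes the byte count from the most significant byte (bits 0..7)
         // of the length register, hence the shift left by 56 bits below.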
4629     __ rldicr(tmp1, remaining, 56, 7);
4630     __ lxvl(input->to_vsr(), in, tmp1);
4631 
4632     ENCODE_CORE
4633 
4634     // bytes_to_write = ((remaining * 4) + 2) / 3
4635     __ li(three, 3);
4636     __ rlwinm(bytes_to_write, remaining, 2, 0, 29); // remaining * 4
4637     __ addi(bytes_to_write, bytes_to_write, 2);
4638     __ divwu(bytes_to_write, bytes_to_write, three);
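         // This equals ceil(remaining * 4 / 3): the number of unpadded Base64
         // characters needed for 'remaining' input bytes (see the examples table below).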
4639 
4640     __ cmpwi(CR7, bytes_to_write, 16);
4641     __ ble_predict_taken(CR7, le_16_to_write);
4642     __ stxv(expanded->to_vsr(), 0, out);
4643 
4644     // We've processed 12 of the 13-15 data bytes, so advance the pointers,
4645     // and do one final pass for the remaining 1-3 bytes.
4646     __ addi(in, in, 12);
4647     __ addi(out, out, 16);
4648     __ subi(remaining, remaining, 12);
4649     __ subi(bytes_to_write, bytes_to_write, 16);
4650     __ rldicr(tmp1, bytes_to_write, 56, 7);
4651     __ lxvl(input->to_vsr(), in, tmp1);
4652 
4653     ENCODE_CORE
4654 
4655     __ bind(le_16_to_write);
4656     // shift bytes_to_write into the upper 8 bits of tmp1 for use by stxvl
4657     __ rldicr(tmp1, bytes_to_write, 56, 7);
4658     __ stxvl(expanded->to_vsr(), out, tmp1);
4659     __ add(out, out, bytes_to_write);
4660 
4661     __ li(pad_char, '=');
4662     __ rlwinm_(modulo_chars, bytes_to_write, 0, 30, 31); // bytes_to_write % 4, set CR0
4663     // Examples:
4664     //    remaining  bytes_to_write  modulo_chars  num pad chars
4665     //        0            0               0            0
4666     //        1            2               2            2
4667     //        2            3               3            1
4668     //        3            4               0            0
4669     //        4            6               2            2
4670     //        5            7               3            1
4671     //        ...
4672     //       12           16               0            0
4673     //       13           18               2            2
4674     //       14           19               3            1
4675     //       15           20               0            0
4676     __ beq(CR0, no_pad);
4677     __ cmpwi(CR7, modulo_chars, 3);
4678     __ beq(CR7, one_pad_char);
4679 
4680     // two pad chars
4681     __ stb(pad_char, out);
4682     __ addi(out, out, 1);
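         // fall through to store the second pad character at one_pad_char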
4683 
4684     __ bind(one_pad_char);
4685     __ stb(pad_char, out);
4686 
4687     __ bind(no_pad);
4688 
4689     __ blr();
4690     return start;
4691   }
4692 
4693 #endif // VM_LITTLE_ENDIAN
4694 
4695   void generate_lookup_secondary_supers_table_stub() {
4696     StubGenStubId stub_id = StubGenStubId::lookup_secondary_supers_table_id;
4697     StubCodeMark mark(this, stub_id);
4698 
4699     const Register
4700       r_super_klass  = R4_ARG2,
4701       r_array_base   = R3_ARG1,
4702       r_array_length = R7_ARG5,
4703       r_array_index  = R6_ARG4,
4704       r_sub_klass    = R5_ARG3,
4705       r_bitmap       = R11_scratch1,
4706       result         = R8_ARG6;
4707 
4708     for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
4709       StubRoutines::_lookup_secondary_supers_table_stubs[slot] = __ pc();
4710       __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass,
4711                                              r_array_base, r_array_length, r_array_index,
4712                                              r_bitmap, result, slot);
4713       __ blr();
4714     }
4715   }
4716 
4717   // Slow path implementation for UseSecondarySupersTable.
4718   address generate_lookup_secondary_supers_table_slow_path_stub() {
4719     StubGenStubId stub_id = StubGenStubId::lookup_secondary_supers_table_slow_path_id;
4720     StubCodeMark mark(this, stub_id);
4721 
4722     address start = __ pc();
4723     const Register
4724       r_super_klass  = R4_ARG2,
4725       r_array_base   = R3_ARG1,
4726       temp1          = R7_ARG5,
4727       r_array_index  = R6_ARG4,
4728       r_bitmap       = R11_scratch1,
4729       result         = R8_ARG6;
4730 
4731     __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, result, temp1);
4732     __ blr();
4733 
4734     return start;
4735   }
4736 
4737   address generate_cont_thaw(StubGenStubId stub_id) {
4738     if (!Continuations::enabled()) return nullptr;
4739 
4740     Continuation::thaw_kind kind;
4741     bool return_barrier;
4742     bool return_barrier_exception;
4743 
4744     switch (stub_id) {
4745     case cont_thaw_id:
4746       kind = Continuation::thaw_top;
4747       return_barrier = false;
4748       return_barrier_exception = false;
4749       break;
4750     case cont_returnBarrier_id:
4751       kind = Continuation::thaw_return_barrier;
4752       return_barrier = true;
4753       return_barrier_exception = false;
4754       break;
4755     case cont_returnBarrierExc_id:
4756       kind = Continuation::thaw_return_barrier_exception;
4757       return_barrier = true;
4758       return_barrier_exception = true;
4759       break;
4760     default:
4761       ShouldNotReachHere();
4762     }
4763     StubCodeMark mark(this, stub_id);
4764 
4765     Register tmp1 = R10_ARG8;
4766     Register tmp2 = R9_ARG7;
4767     Register tmp3 = R8_ARG6;
4768     Register nvtmp = R15_esp;   // nonvolatile tmp register
4769     FloatRegister nvftmp = F20; // nonvolatile fp tmp register
4770 
4771     address start = __ pc();
4772 
4773     if (kind == Continuation::thaw_top) {
4774       __ clobber_nonvolatile_registers(); // Except R16_thread and R29_TOC
4775     }
4776 
4777     if (return_barrier) {
4778       __ mr(nvtmp, R3_RET); __ fmr(nvftmp, F1_RET); // preserve possible return value from a method returning to the return barrier
4779       DEBUG_ONLY(__ ld_ptr(tmp1, _abi0(callers_sp), R1_SP);)
4780       __ ld_ptr(R1_SP, JavaThread::cont_entry_offset(), R16_thread);
4781 #ifdef ASSERT
4782       __ ld_ptr(tmp2, _abi0(callers_sp), R1_SP);
4783       __ cmpd(CR0, tmp1, tmp2);
4784       __ asm_assert_eq(FILE_AND_LINE ": callers sp is corrupt");
4785 #endif
4786     }
4787 #ifdef ASSERT
4788     __ ld_ptr(tmp1, JavaThread::cont_entry_offset(), R16_thread);
4789     __ cmpd(CR0, R1_SP, tmp1);
4790     __ asm_assert_eq(FILE_AND_LINE ": incorrect R1_SP");
4791 #endif
4792 
4793     __ li(R4_ARG2, return_barrier ? 1 : 0);
4794     __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), R16_thread, R4_ARG2);
4795 
4796 #ifdef ASSERT
4797     DEBUG_ONLY(__ ld_ptr(tmp1, JavaThread::cont_entry_offset(), R16_thread));
4798     DEBUG_ONLY(__ cmpd(CR0, R1_SP, tmp1));
4799     __ asm_assert_eq(FILE_AND_LINE ": incorrect R1_SP");
4800 #endif
4801 
4802     // R3_RET contains the size of the frames to thaw, 0 if overflow or no more frames
4803     Label thaw_success;
4804     __ cmpdi(CR0, R3_RET, 0);
4805     __ bne(CR0, thaw_success);
4806     __ load_const_optimized(tmp1, (SharedRuntime::throw_StackOverflowError_entry()), R0);
4807     __ mtctr(tmp1); __ bctr();
4808     __ bind(thaw_success);
4809 
4810     __ addi(R3_RET, R3_RET, frame::native_abi_reg_args_size); // Large abi required for C++ calls.
4811     __ neg(R3_RET, R3_RET);
4812     // align down, resulting in a smaller (more negative) offset
4813     __ clrrdi(R3_RET, R3_RET, exact_log2(frame::alignment_in_bytes));
4814     DEBUG_ONLY(__ mr(tmp1, R1_SP);)
4815     __ resize_frame(R3_RET, tmp2);  // make room for the thawed frames
4816 
4817     __ li(R4_ARG2, kind);
4818     __ call_VM_leaf(Continuation::thaw_entry(), R16_thread, R4_ARG2);
4819     __ mr(R1_SP, R3_RET); // R3_RET contains the SP of the thawed top frame
4820 
4821     if (return_barrier) {
4822       // we're now in the caller of the frame that returned to the barrier
4823       __ mr(R3_RET, nvtmp); __ fmr(F1_RET, nvftmp); // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
4824     } else {
4825       // we're now on the yield frame (which is at an address above us because the SP has been pushed down)
4826       __ li(R3_RET, 0); // return 0 (success) from doYield
4827     }
4828 
4829     if (return_barrier_exception) {
4830       Register ex_pc = R17_tos;   // nonvolatile register
4831       __ ld(ex_pc, _abi0(lr), R1_SP); // LR
4832       __ mr(nvtmp, R3_RET); // save return value containing the exception oop
4833       // The thawed top frame has a frame::java_abi. This is not sufficient for the runtime call.
4834       __ push_frame_reg_args(0, tmp1);
4835       __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), R16_thread, ex_pc);
4836       __ mtlr(R3_RET); // the exception handler
4837       __ pop_frame();
4838       // See OptoRuntime::generate_exception_blob for register arguments
4839       __ mr(R3_ARG1, nvtmp); // exception oop
4840       __ mr(R4_ARG2, ex_pc); // exception pc
4841     } else {
4842       // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
4843       __ ld(R0, _abi0(lr), R1_SP); // LR
4844       __ mtlr(R0);
4845     }
4846     __ blr();
4847 
4848     return start;
4849   }
4850 
4851   address generate_cont_thaw() {
4852     return generate_cont_thaw(StubGenStubId::cont_thaw_id);
4853   }
4854 
4855   // TODO: will probably need multiple return barriers depending on return type
4856 
4857   address generate_cont_returnBarrier() {
4858     return generate_cont_thaw(StubGenStubId::cont_returnBarrier_id);
4859   }
4860 
4861   address generate_cont_returnBarrier_exception() {
4862     return generate_cont_thaw(StubGenStubId::cont_returnBarrierExc_id);
4863   }
4864 
4865   address generate_cont_preempt_stub() {
4866     if (!Continuations::enabled()) return nullptr;
4867     StubGenStubId stub_id = StubGenStubId::cont_preempt_id;
4868     StubCodeMark mark(this, stub_id);
4869     address start = __ pc();
4870 
4871     __ clobber_nonvolatile_registers(); // Except R16_thread and R29_TOC
4872 
4873     __ reset_last_Java_frame(false /*check_last_java_sp*/);
4874 
4875     // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap.
4876     __ ld_ptr(R1_SP, JavaThread::cont_entry_offset(), R16_thread);
4877 
4878     Label preemption_cancelled;
4879     __ lbz(R11_scratch1, in_bytes(JavaThread::preemption_cancelled_offset()), R16_thread);
4880     __ cmpwi(CR0, R11_scratch1, 0);
4881     __ bne(CR0, preemption_cancelled);
4882 
4883     // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount.
4884     SharedRuntime::continuation_enter_cleanup(_masm);
4885     __ pop_frame();
4886     __ restore_LR(R11_scratch1);
4887     __ blr();
4888 
4889     // We acquired the monitor after freezing the frames, so call thaw to continue execution.
4890     __ bind(preemption_cancelled);
4891     __ li(R11_scratch1, 0); // false
4892     __ stb(R11_scratch1, in_bytes(JavaThread::preemption_cancelled_offset()), R16_thread);
4893     int simm16_offs = __ load_const_optimized(R11_scratch1, ContinuationEntry::thaw_call_pc_address(), R0, true);
4894     __ ld(R11_scratch1, simm16_offs, R11_scratch1);
4895     __ mtctr(R11_scratch1);
4896     __ bctr();
4897 
4898     return start;
4899   }
4900 
4901   // exception handler for upcall stubs
4902   address generate_upcall_stub_exception_handler() {
4903     StubGenStubId stub_id = StubGenStubId::upcall_stub_exception_handler_id;
4904     StubCodeMark mark(this, stub_id);
4905     address start = __ pc();
4906 
4907     // Native caller has no idea how to handle exceptions,
4908     // so we just crash here. Up to callee to catch exceptions.
4909     __ verify_oop(R3_ARG1);
4910     __ load_const_optimized(R12_scratch2, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception), R0);
4911     __ call_c(R12_scratch2);
4912     __ should_not_reach_here();
4913 
4914     return start;
4915   }
4916 
4917   // load Method* target of MethodHandle
4918   // R3_ARG1 = jobject receiver
4919   // R19_method = result Method*
4920   address generate_upcall_stub_load_target() {
4921 
4922     StubGenStubId stub_id = StubGenStubId::upcall_stub_load_target_id;
4923     StubCodeMark mark(this, stub_id);
4924     address start = __ pc();
4925 
4926     __ resolve_global_jobject(R3_ARG1, R22_tmp2, R23_tmp3, MacroAssembler::PRESERVATION_FRAME_LR_GP_FP_REGS);
4927     // Load target method from receiver
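         // (follows MethodHandle.form -> LambdaForm.vmentry -> MemberName.method
         //  -> ResolvedMethodName.vmtarget, matching the offsets used below)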
4928     __ load_heap_oop(R19_method, java_lang_invoke_MethodHandle::form_offset(), R3_ARG1,
4929                      R22_tmp2, R23_tmp3, MacroAssembler::PRESERVATION_FRAME_LR_GP_FP_REGS, IS_NOT_NULL);
4930     __ load_heap_oop(R19_method, java_lang_invoke_LambdaForm::vmentry_offset(), R19_method,
4931                      R22_tmp2, R23_tmp3, MacroAssembler::PRESERVATION_FRAME_LR_GP_FP_REGS, IS_NOT_NULL);
4932     __ load_heap_oop(R19_method, java_lang_invoke_MemberName::method_offset(), R19_method,
4933                      R22_tmp2, R23_tmp3, MacroAssembler::PRESERVATION_FRAME_LR_GP_FP_REGS, IS_NOT_NULL);
4934     __ ld(R19_method, java_lang_invoke_ResolvedMethodName::vmtarget_offset(), R19_method);
4935     __ std(R19_method, in_bytes(JavaThread::callee_target_offset()), R16_thread); // just in case callee is deoptimized
4936 
4937     __ blr();
4938 
4939     return start;
4940   }
4941 
4942   // Initialization
4943   void generate_preuniverse_stubs() {
4944     // preuniverse stubs are not needed for ppc
4945   }
4946 
4947   void generate_initial_stubs() {
4948     // Generates all stubs and initializes the entry points
4949 
4950     // Entry points that exist in all platforms.
4951     // Note: This is code that could be shared among different platforms - however the
4952     // benefit seems to be smaller than the disadvantage of having a
4953     // much more complicated generator structure. See also comment in
4954     // stubRoutines.hpp.
4955 
4956     StubRoutines::_forward_exception_entry          = generate_forward_exception();
4957     StubRoutines::_call_stub_entry                  = generate_call_stub(StubRoutines::_call_stub_return_address);
4958     StubRoutines::_catch_exception_entry            = generate_catch_exception();
4959 
4960     if (UnsafeMemoryAccess::_table == nullptr) {
4961       UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
4962     }
4963 
4964     // CRC32 Intrinsics.
4965     if (UseCRC32Intrinsics) {
4966       StubRoutines::_crc_table_adr = StubRoutines::ppc::generate_crc_constants(REVERSE_CRC32_POLY);
4967       StubRoutines::_updateBytesCRC32 = generate_CRC32_updateBytes(StubGenStubId::updateBytesCRC32_id);
4968     }
4969 
4970     // CRC32C Intrinsics.
4971     if (UseCRC32CIntrinsics) {
4972       StubRoutines::_crc32c_table_addr = StubRoutines::ppc::generate_crc_constants(REVERSE_CRC32C_POLY);
4973       StubRoutines::_updateBytesCRC32C = generate_CRC32_updateBytes(StubGenStubId::updateBytesCRC32C_id);
4974     }
4975 
4976     if (VM_Version::supports_float16()) {
4977       // For results consistency both intrinsics should be enabled.
4978       StubRoutines::_hf2f = generate_float16ToFloat();
4979       StubRoutines::_f2hf = generate_floatToFloat16();
4980     }
4981   }
4982 
4983   void generate_continuation_stubs() {
4984     // Continuation stubs:
4985     StubRoutines::_cont_thaw          = generate_cont_thaw();
4986     StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
4987     StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
4988     StubRoutines::_cont_preempt_stub  = generate_cont_preempt_stub();
4989   }
4990 
4991   void generate_final_stubs() {
4992     // Generates all stubs and initializes the entry points
4993 
4994     // support for verify_oop (must happen after universe_init)
4995     StubRoutines::_verify_oop_subroutine_entry             = generate_verify_oop();
4996 
4997     // nmethod entry barriers for concurrent class unloading
4998     StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
4999 
5000     // arraycopy stubs used by compilers
5001     generate_arraycopy_stubs();
5002 
5003 #ifdef COMPILER2
5004     if (UseSecondarySupersTable) {
5005       StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
5006       if (!InlineSecondarySupersTest) {
5007         generate_lookup_secondary_supers_table_stub();
5008       }
5009     }
5010 #endif // COMPILER2
5011 
5012     StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
5013     StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();
5014   }
5015 
5016   void generate_compiler_stubs() {
5017 #if COMPILER2_OR_JVMCI
5018 
5019 #ifdef COMPILER2
5020     if (UseMultiplyToLenIntrinsic) {
5021       StubRoutines::_multiplyToLen = generate_multiplyToLen();
5022     }
5023     if (UseSquareToLenIntrinsic) {
5024       StubRoutines::_squareToLen = generate_squareToLen();
5025     }
5026     if (UseMulAddIntrinsic) {
5027       StubRoutines::_mulAdd = generate_mulAdd();
5028     }
5029     if (UseMontgomeryMultiplyIntrinsic) {
5030       StubRoutines::_montgomeryMultiply
5031         = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_multiply);
5032     }
5033     if (UseMontgomerySquareIntrinsic) {
5034       StubRoutines::_montgomerySquare
5035         = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square);
5036     }
5037 #endif
5038 
5039     // data cache line writeback
5040     if (VM_Version::supports_data_cache_line_flush()) {
5041       StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
5042       StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
5043     }
5044 
5045     if (UseGHASHIntrinsics) {
5046       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
5047     }
5048 
5049     if (UseAESIntrinsics) {
5050       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
5051       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
5052     }
5053 
5054     if (UseSHA256Intrinsics) {
5055       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(StubGenStubId::sha256_implCompress_id);
5056       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(StubGenStubId::sha256_implCompressMB_id);
5057     }
5058     if (UseSHA512Intrinsics) {
5059       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(StubGenStubId::sha512_implCompress_id);
5060       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(StubGenStubId::sha512_implCompressMB_id);
5061     }
5062 
5063 #ifdef VM_LITTLE_ENDIAN
5064     // Currently supported on PPC64LE only
5065     if (UseBASE64Intrinsics) {
5066       StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
5067       StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
5068     }
5069 #endif
5070 #endif // COMPILER2_OR_JVMCI
5071   }
5072 
5073  public:
5074   StubGenerator(CodeBuffer* code, StubGenBlobId blob_id) : StubCodeGenerator(code, blob_id) {
5075     switch(blob_id) {
5076     case preuniverse_id:
5077       generate_preuniverse_stubs();
5078       break;
5079     case initial_id:
5080       generate_initial_stubs();
5081       break;
5082     case continuation_id:
5083       generate_continuation_stubs();
5084       break;
5085     case compiler_id:
5086       generate_compiler_stubs();
5087       break;
5088     case final_id:
5089       generate_final_stubs();
5090       break;
5091     default:
5092       fatal("unexpected blob id: %d", blob_id);
5093       break;
5094     };
5095   }
5096 };
5097 
5098 void StubGenerator_generate(CodeBuffer* code, StubGenBlobId blob_id) {
5099   StubGenerator g(code, blob_id);
5100 }
5101