1 /*
   2  * Copyright (c) 1997, 2021, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2012, 2021 SAP SE. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.inline.hpp"
  28 #include "compiler/oopMap.hpp"
  29 #include "gc/shared/barrierSet.hpp"
  30 #include "gc/shared/barrierSetAssembler.hpp"
  31 #include "gc/shared/barrierSetNMethod.hpp"
  32 #include "interpreter/interpreter.hpp"
  33 #include "nativeInst_ppc.hpp"
  34 #include "oops/instanceOop.hpp"
  35 #include "oops/method.hpp"
  36 #include "oops/objArrayKlass.hpp"
  37 #include "oops/oop.inline.hpp"
  38 #include "prims/methodHandles.hpp"
  39 #include "runtime/frame.inline.hpp"
  40 #include "runtime/handles.inline.hpp"
  41 #include "runtime/sharedRuntime.hpp"
  42 #include "runtime/stubCodeGenerator.hpp"
  43 #include "runtime/stubRoutines.hpp"
  44 #include "runtime/thread.inline.hpp"
  45 #include "runtime/vm_version.hpp"
  46 #include "utilities/align.hpp"
  47 #include "utilities/powerOfTwo.hpp"
  48 
  49 // Declaration and definition of StubGenerator (no .hpp file).
  50 // For a more detailed description of the stub routine structure
  51 // see the comment in stubRoutines.hpp.
  52 
  53 #define __ _masm->
  54 
  55 #ifdef PRODUCT
  56 #define BLOCK_COMMENT(str) // nothing
  57 #else
  58 #define BLOCK_COMMENT(str) __ block_comment(str)
  59 #endif
  60 
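      // On the ELFv1 ABI a stub routine's address is a function descriptor; STUB_ENTRY
      // dereferences it to obtain the actual code entry point. On ELFv2 the stub
      // address already is the entry point.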
  61 #if defined(ABI_ELFv2)
  62 #define STUB_ENTRY(name) StubRoutines::name()
  63 #else
  64 #define STUB_ENTRY(name) ((FunctionDescriptor*)StubRoutines::name())->entry()
  65 #endif
  66 
  67 class StubGenerator: public StubCodeGenerator {
  68  private:
  69 
  70   // Call stubs are used to call Java from C
  71   //
  72   // Arguments:
  73   //
  74   //   R3  - call wrapper address     : address
  75   //   R4  - result                   : intptr_t*
  76   //   R5  - result type              : BasicType
  77   //   R6  - method                   : Method
  78   //   R7  - frame mgr entry point    : address
  79   //   R8  - parameter block          : intptr_t*
  80   //   R9  - parameter count in words : int
  81   //   R10 - thread                   : Thread*
  82   //
  83   address generate_call_stub(address& return_address) {
   84     // Set up a new C frame, copy the Java arguments, call the frame manager or
   85     // native_entry, and process the result.
  86 
  87     StubCodeMark mark(this, "StubRoutines", "call_stub");
  88 
  89     address start = __ function_entry();
  90 
  91     // some sanity checks
  92     assert((sizeof(frame::abi_minframe) % 16) == 0,           "unaligned");
  93     assert((sizeof(frame::abi_reg_args) % 16) == 0,           "unaligned");
  94     assert((sizeof(frame::spill_nonvolatiles) % 16) == 0,     "unaligned");
  95     assert((sizeof(frame::parent_ijava_frame_abi) % 16) == 0, "unaligned");
  96     assert((sizeof(frame::entry_frame_locals) % 16) == 0,     "unaligned");
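          // The PPC64 ABIs require the stack pointer to stay 16-byte (quadword) aligned,
          // so every frame building block must be a multiple of 16.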
  97 
  98     Register r_arg_call_wrapper_addr        = R3;
  99     Register r_arg_result_addr              = R4;
 100     Register r_arg_result_type              = R5;
 101     Register r_arg_method                   = R6;
 102     Register r_arg_entry                    = R7;
 103     Register r_arg_thread                   = R10;
 104 
 105     Register r_temp                         = R24;
 106     Register r_top_of_arguments_addr        = R25;
 107     Register r_entryframe_fp                = R26;
 108 
 109     {
 110       // Stack on entry to call_stub:
 111       //
 112       //      F1      [C_FRAME]
 113       //              ...
 114 
 115       Register r_arg_argument_addr          = R8;
 116       Register r_arg_argument_count         = R9;
 117       Register r_frame_alignment_in_bytes   = R27;
 118       Register r_argument_addr              = R28;
 119       Register r_argumentcopy_addr          = R29;
 120       Register r_argument_size_in_bytes     = R30;
 121       Register r_frame_size                 = R23;
 122 
 123       Label arguments_copied;
 124 
 125       // Save LR/CR to caller's C_FRAME.
 126       __ save_LR_CR(R0);
 127 
 128       // Zero extend arg_argument_count.
 129       __ clrldi(r_arg_argument_count, r_arg_argument_count, 32);
 130 
  131       // Save non-volatile GPRs to the ENTRY_FRAME (not yet pushed, but it's safe).
 132       __ save_nonvolatile_gprs(R1_SP, _spill_nonvolatiles_neg(r14));
 133 
 134       // Keep copy of our frame pointer (caller's SP).
 135       __ mr(r_entryframe_fp, R1_SP);
 136 
 137       BLOCK_COMMENT("Push ENTRY_FRAME including arguments");
 138       // Push ENTRY_FRAME including arguments:
 139       //
 140       //      F0      [TOP_IJAVA_FRAME_ABI]
 141       //              alignment (optional)
 142       //              [outgoing Java arguments]
 143       //              [ENTRY_FRAME_LOCALS]
 144       //      F1      [C_FRAME]
 145       //              ...
 146 
 147       // calculate frame size
 148 
 149       // unaligned size of arguments
 150       __ sldi(r_argument_size_in_bytes,
 151                   r_arg_argument_count, Interpreter::logStackElementSize);
 152       // arguments alignment (max 1 slot)
 153       // FIXME: use round_to() here
 154       __ andi_(r_frame_alignment_in_bytes, r_arg_argument_count, 1);
 155       __ sldi(r_frame_alignment_in_bytes,
 156               r_frame_alignment_in_bytes, Interpreter::logStackElementSize);
 157 
 158       // size = unaligned size of arguments + top abi's size
 159       __ addi(r_frame_size, r_argument_size_in_bytes,
 160               frame::top_ijava_frame_abi_size);
 161       // size += arguments alignment
 162       __ add(r_frame_size,
 163              r_frame_size, r_frame_alignment_in_bytes);
 164       // size += size of call_stub locals
 165       __ addi(r_frame_size,
 166               r_frame_size, frame::entry_frame_locals_size);
 167 
 168       // push ENTRY_FRAME
 169       __ push_frame(r_frame_size, r_temp);
 170 
 171       // initialize call_stub locals (step 1)
 172       __ std(r_arg_call_wrapper_addr,
 173              _entry_frame_locals_neg(call_wrapper_address), r_entryframe_fp);
 174       __ std(r_arg_result_addr,
 175              _entry_frame_locals_neg(result_address), r_entryframe_fp);
 176       __ std(r_arg_result_type,
 177              _entry_frame_locals_neg(result_type), r_entryframe_fp);
 178       // we will save arguments_tos_address later
 179 
 180 
 181       BLOCK_COMMENT("Copy Java arguments");
 182       // copy Java arguments
 183 
  184       // Calculate top_of_arguments_addr, which will become tos (R15_esp, not prepushed) later.
 185       // FIXME: why not simply use SP+frame::top_ijava_frame_size?
 186       __ addi(r_top_of_arguments_addr,
 187               R1_SP, frame::top_ijava_frame_abi_size);
 188       __ add(r_top_of_arguments_addr,
 189              r_top_of_arguments_addr, r_frame_alignment_in_bytes);
 190 
 191       // any arguments to copy?
 192       __ cmpdi(CCR0, r_arg_argument_count, 0);
 193       __ beq(CCR0, arguments_copied);
 194 
 195       // prepare loop and copy arguments in reverse order
 196       {
 197         // init CTR with arg_argument_count
 198         __ mtctr(r_arg_argument_count);
 199 
  200         // let r_argumentcopy_addr point to the last outgoing Java argument
 201         __ mr(r_argumentcopy_addr, r_top_of_arguments_addr);
 202 
 203         // let r_argument_addr point to last incoming java argument
 204         __ add(r_argument_addr,
 205                    r_arg_argument_addr, r_argument_size_in_bytes);
 206         __ addi(r_argument_addr, r_argument_addr, -BytesPerWord);
 207 
 208         // now loop while CTR > 0 and copy arguments
 209         {
 210           Label next_argument;
 211           __ bind(next_argument);
 212 
 213           __ ld(r_temp, 0, r_argument_addr);
 214           // argument_addr--;
 215           __ addi(r_argument_addr, r_argument_addr, -BytesPerWord);
 216           __ std(r_temp, 0, r_argumentcopy_addr);
 217           // argumentcopy_addr++;
 218           __ addi(r_argumentcopy_addr, r_argumentcopy_addr, BytesPerWord);
 219 
 220           __ bdnz(next_argument);
 221         }
 222       }
 223 
 224       // Arguments copied, continue.
 225       __ bind(arguments_copied);
 226     }
 227 
 228     {
 229       BLOCK_COMMENT("Call frame manager or native entry.");
 230       // Call frame manager or native entry.
 231       Register r_new_arg_entry = R14;
 232       assert_different_registers(r_new_arg_entry, r_top_of_arguments_addr,
 233                                  r_arg_method, r_arg_thread);
 234 
 235       __ mr(r_new_arg_entry, r_arg_entry);
 236 
 237       // Register state on entry to frame manager / native entry:
 238       //
 239       //   tos         -  intptr_t*    sender tos (prepushed) Lesp = (SP) + copied_arguments_offset - 8
 240       //   R19_method  -  Method
 241       //   R16_thread  -  JavaThread*
 242 
 243       // Tos must point to last argument - element_size.
 244       const Register tos = R15_esp;
 245 
 246       __ addi(tos, r_top_of_arguments_addr, -Interpreter::stackElementSize);
 247 
 248       // initialize call_stub locals (step 2)
 249       // now save tos as arguments_tos_address
 250       __ std(tos, _entry_frame_locals_neg(arguments_tos_address), r_entryframe_fp);
 251 
 252       // load argument registers for call
 253       __ mr(R19_method, r_arg_method);
 254       __ mr(R16_thread, r_arg_thread);
 255       assert(tos != r_arg_method, "trashed r_arg_method");
 256       assert(tos != r_arg_thread && R19_method != r_arg_thread, "trashed r_arg_thread");
 257 
  258       // Initialize R25_templateTableBase with the interpreter's dispatch table to simplify dispatch in the callee.
 259       __ load_const_optimized(R25_templateTableBase, (address)Interpreter::dispatch_table((TosState)0), R11_scratch1);
 260       // Stack on entry to frame manager / native entry:
 261       //
 262       //      F0      [TOP_IJAVA_FRAME_ABI]
 263       //              alignment (optional)
 264       //              [outgoing Java arguments]
 265       //              [ENTRY_FRAME_LOCALS]
 266       //      F1      [C_FRAME]
 267       //              ...
 268       //
 269 
 270       // global toc register
 271       __ load_const_optimized(R29_TOC, MacroAssembler::global_toc(), R11_scratch1);
  272       // Remember the senderSP so the interpreter can pop c2i arguments off the stack
  273       // when called via a c2i adapter.
 274 
  275       // Pass initial_caller_sp to the frame manager.
 276       __ mr(R21_sender_SP, R1_SP);
 277 
  278       // Do a light-weight C-call here: r_new_arg_entry holds the address
  279       // of the interpreter entry point (frame manager or native entry),
  280       // and the runtime value of LR is saved in return_address.
 281       assert(r_new_arg_entry != tos && r_new_arg_entry != R19_method && r_new_arg_entry != R16_thread,
 282              "trashed r_new_arg_entry");
 283       return_address = __ call_stub(r_new_arg_entry);
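            // return_address now holds the PC right behind the branch. It is exported via the
            // out parameter (used for StubRoutines::_call_stub_return_address, see the assert in
            // generate_catch_exception) so the VM can recognize returns into the call stub.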
 284     }
 285 
 286     {
 287       BLOCK_COMMENT("Returned from frame manager or native entry.");
 288       // Returned from frame manager or native entry.
 289       // Now pop frame, process result, and return to caller.
 290 
 291       // Stack on exit from frame manager / native entry:
 292       //
 293       //      F0      [ABI]
 294       //              ...
 295       //              [ENTRY_FRAME_LOCALS]
 296       //      F1      [C_FRAME]
 297       //              ...
 298       //
 299       // Just pop the topmost frame ...
 300       //
 301 
 302       Label ret_is_object;
 303       Label ret_is_long;
 304       Label ret_is_float;
 305       Label ret_is_double;
 306 
 307       Register r_entryframe_fp = R30;
 308       Register r_lr            = R7_ARG5;
 309       Register r_cr            = R8_ARG6;
 310 
 311       // Reload some volatile registers which we've spilled before the call
 312       // to frame manager / native entry.
 313       // Access all locals via frame pointer, because we know nothing about
 314       // the topmost frame's size.
 315       __ ld(r_entryframe_fp, _abi0(callers_sp), R1_SP);
 316       assert_different_registers(r_entryframe_fp, R3_RET, r_arg_result_addr, r_arg_result_type, r_cr, r_lr);
 317       __ ld(r_arg_result_addr,
 318             _entry_frame_locals_neg(result_address), r_entryframe_fp);
 319       __ ld(r_arg_result_type,
 320             _entry_frame_locals_neg(result_type), r_entryframe_fp);
 321       __ ld(r_cr, _abi0(cr), r_entryframe_fp);
 322       __ ld(r_lr, _abi0(lr), r_entryframe_fp);
 323 
 324       // pop frame and restore non-volatiles, LR and CR
 325       __ mr(R1_SP, r_entryframe_fp);
 326       __ mtcr(r_cr);
 327       __ mtlr(r_lr);
 328 
 329       // Store result depending on type. Everything that is not
 330       // T_OBJECT, T_LONG, T_FLOAT, or T_DOUBLE is treated as T_INT.
 331       __ cmpwi(CCR0, r_arg_result_type, T_OBJECT);
 332       __ cmpwi(CCR1, r_arg_result_type, T_LONG);
 333       __ cmpwi(CCR5, r_arg_result_type, T_FLOAT);
 334       __ cmpwi(CCR6, r_arg_result_type, T_DOUBLE);
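            // The four compares go into distinct CR fields, which are unaffected by the
            // GPR restore below, so only one conditional branch per field is needed afterwards.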
 335 
 336       // restore non-volatile registers
 337       __ restore_nonvolatile_gprs(R1_SP, _spill_nonvolatiles_neg(r14));
 338 
 339 
 340       // Stack on exit from call_stub:
 341       //
 342       //      0       [C_FRAME]
 343       //              ...
 344       //
 345       //  no call_stub frames left.
 346 
 347       // All non-volatiles have been restored at this point!!
 348       assert(R3_RET == R3, "R3_RET should be R3");
 349 
 350       __ beq(CCR0, ret_is_object);
 351       __ beq(CCR1, ret_is_long);
 352       __ beq(CCR5, ret_is_float);
 353       __ beq(CCR6, ret_is_double);
 354 
 355       // default:
 356       __ stw(R3_RET, 0, r_arg_result_addr);
 357       __ blr(); // return to caller
 358 
 359       // case T_OBJECT:
 360       __ bind(ret_is_object);
 361       __ std(R3_RET, 0, r_arg_result_addr);
 362       __ blr(); // return to caller
 363 
 364       // case T_LONG:
 365       __ bind(ret_is_long);
 366       __ std(R3_RET, 0, r_arg_result_addr);
 367       __ blr(); // return to caller
 368 
 369       // case T_FLOAT:
 370       __ bind(ret_is_float);
 371       __ stfs(F1_RET, 0, r_arg_result_addr);
 372       __ blr(); // return to caller
 373 
 374       // case T_DOUBLE:
 375       __ bind(ret_is_double);
 376       __ stfd(F1_RET, 0, r_arg_result_addr);
 377       __ blr(); // return to caller
 378     }
 379 
 380     return start;
 381   }
 382 
 383   // Return point for a Java call if there's an exception thrown in
 384   // Java code.  The exception is caught and transformed into a
 385   // pending exception stored in JavaThread that can be tested from
 386   // within the VM.
 387   //
 388   address generate_catch_exception() {
 389     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 390 
 391     address start = __ pc();
 392 
 393     // Registers alive
 394     //
 395     //  R16_thread
 396     //  R3_ARG1 - address of pending exception
 397     //  R4_ARG2 - return address in call stub
 398 
 399     const Register exception_file = R21_tmp1;
 400     const Register exception_line = R22_tmp2;
 401 
 402     __ load_const(exception_file, (void*)__FILE__);
 403     __ load_const(exception_line, (void*)__LINE__);
 404 
 405     __ std(R3_ARG1, in_bytes(JavaThread::pending_exception_offset()), R16_thread);
 406     // store into `char *'
 407     __ std(exception_file, in_bytes(JavaThread::exception_file_offset()), R16_thread);
 408     // store into `int'
 409     __ stw(exception_line, in_bytes(JavaThread::exception_line_offset()), R16_thread);
 410 
 411     // complete return to VM
 412     assert(StubRoutines::_call_stub_return_address != NULL, "must have been generated before");
 413 
 414     __ mtlr(R4_ARG2);
 415     // continue in call stub
 416     __ blr();
 417 
 418     return start;
 419   }
 420 
 421   // Continuation point for runtime calls returning with a pending
 422   // exception.  The pending exception check happened in the runtime
 423   // or native call stub.  The pending exception in Thread is
 424   // converted into a Java-level exception.
 425   //
 426   // Read:
 427   //
 428   //   LR:     The pc the runtime library callee wants to return to.
 429   //           Since the exception occurred in the callee, the return pc
 430   //           from the point of view of Java is the exception pc.
 431   //   thread: Needed for method handles.
 432   //
 433   // Invalidate:
 434   //
 435   //   volatile registers (except below).
 436   //
 437   // Update:
 438   //
 439   //   R4_ARG2: exception
 440   //
 441   // (LR is unchanged and is live out).
 442   //
 443   address generate_forward_exception() {
 444     StubCodeMark mark(this, "StubRoutines", "forward_exception");
 445     address start = __ pc();
 446 
 447     if (VerifyOops) {
 448       // Get pending exception oop.
 449       __ ld(R3_ARG1,
 450                 in_bytes(Thread::pending_exception_offset()),
 451                 R16_thread);
 452       // Make sure that this code is only executed if there is a pending exception.
 453       {
 454         Label L;
 455         __ cmpdi(CCR0, R3_ARG1, 0);
 456         __ bne(CCR0, L);
 457         __ stop("StubRoutines::forward exception: no pending exception (1)");
 458         __ bind(L);
 459       }
 460       __ verify_oop(R3_ARG1, "StubRoutines::forward exception: not an oop");
 461     }
 462 
 463     // Save LR/CR and copy exception pc (LR) into R4_ARG2.
 464     __ save_LR_CR(R4_ARG2);
 465     __ push_frame_reg_args(0, R0);
 466     // Find exception handler.
 467     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 468                      SharedRuntime::exception_handler_for_return_address),
 469                     R16_thread,
 470                     R4_ARG2);
 471     // Copy handler's address.
 472     __ mtctr(R3_RET);
 473     __ pop_frame();
 474     __ restore_LR_CR(R0);
 475 
 476     // Set up the arguments for the exception handler:
 477     //  - R3_ARG1: exception oop
 478     //  - R4_ARG2: exception pc.
 479 
 480     // Load pending exception oop.
 481     __ ld(R3_ARG1,
 482               in_bytes(Thread::pending_exception_offset()),
 483               R16_thread);
 484 
 485     // The exception pc is the return address in the caller.
 486     // Must load it into R4_ARG2.
 487     __ mflr(R4_ARG2);
 488 
 489 #ifdef ASSERT
 490     // Make sure exception is set.
 491     {
 492       Label L;
 493       __ cmpdi(CCR0, R3_ARG1, 0);
 494       __ bne(CCR0, L);
 495       __ stop("StubRoutines::forward exception: no pending exception (2)");
 496       __ bind(L);
 497     }
 498 #endif
 499 
 500     // Clear the pending exception.
 501     __ li(R0, 0);
 502     __ std(R0,
 503                in_bytes(Thread::pending_exception_offset()),
 504                R16_thread);
 505     // Jump to exception handler.
 506     __ bctr();
 507 
 508     return start;
 509   }
 510 
 511 #undef __
 512 #define __ masm->
 513   // Continuation point for throwing of implicit exceptions that are
 514   // not handled in the current activation. Fabricates an exception
 515   // oop and initiates normal exception dispatching in this
 516   // frame. Only callee-saved registers are preserved (through the
 517   // normal register window / RegisterMap handling).  If the compiler
 518   // needs all registers to be preserved between the fault point and
 519   // the exception handler then it must assume responsibility for that
 520   // in AbstractCompiler::continuation_for_implicit_null_exception or
 521   // continuation_for_implicit_division_by_zero_exception. All other
 522   // implicit exceptions (e.g., NullPointerException or
 523   // AbstractMethodError on entry) are either at call sites or
 524   // otherwise assume that stack unwinding will be initiated, so
 525   // caller saved registers were assumed volatile in the compiler.
 526   //
 527   // Note that we generate only this stub into a RuntimeStub, because
 528   // it needs to be properly traversed and ignored during GC, so we
 529   // change the meaning of the "__" macro within this method.
 530   //
 531   // Note: the routine set_pc_not_at_call_for_caller in
 532   // SharedRuntime.cpp requires that this code be generated into a
 533   // RuntimeStub.
 534   address generate_throw_exception(const char* name, address runtime_entry, bool restore_saved_exception_pc,
 535                                    Register arg1 = noreg, Register arg2 = noreg) {
 536     CodeBuffer code(name, 1024 DEBUG_ONLY(+ 512), 0);
 537     MacroAssembler* masm = new MacroAssembler(&code);
 538 
 539     OopMapSet* oop_maps  = new OopMapSet();
 540     int frame_size_in_bytes = frame::abi_reg_args_size;
 541     OopMap* map = new OopMap(frame_size_in_bytes / sizeof(jint), 0);
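          // No oops live in this frame, but an (empty) oop map is still needed so the
          // runtime stub frame can be traversed during stack walking / GC.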
 542 
 543     address start = __ pc();
 544 
 545     __ save_LR_CR(R11_scratch1);
 546 
 547     // Push a frame.
 548     __ push_frame_reg_args(0, R11_scratch1);
 549 
 550     address frame_complete_pc = __ pc();
 551 
 552     if (restore_saved_exception_pc) {
 553       __ unimplemented("StubGenerator::throw_exception with restore_saved_exception_pc");
 554     }
 555 
 556     // Note that we always have a runtime stub frame on the top of
 557     // stack by this point. Remember the offset of the instruction
 558     // whose address will be moved to R11_scratch1.
 559     address gc_map_pc = __ get_PC_trash_LR(R11_scratch1);
 560 
 561     __ set_last_Java_frame(/*sp*/R1_SP, /*pc*/R11_scratch1);
 562 
 563     __ mr(R3_ARG1, R16_thread);
 564     if (arg1 != noreg) {
 565       __ mr(R4_ARG2, arg1);
 566     }
 567     if (arg2 != noreg) {
 568       __ mr(R5_ARG3, arg2);
 569     }
 570 #if defined(ABI_ELFv2)
 571     __ call_c(runtime_entry, relocInfo::none);
 572 #else
 573     __ call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, runtime_entry), relocInfo::none);
 574 #endif
 575 
 576     // Set an oopmap for the call site.
 577     oop_maps->add_gc_map((int)(gc_map_pc - start), map);
 578 
 579     __ reset_last_Java_frame();
 580 
 581 #ifdef ASSERT
 582     // Make sure that this code is only executed if there is a pending
 583     // exception.
 584     {
 585       Label L;
 586       __ ld(R0,
 587                 in_bytes(Thread::pending_exception_offset()),
 588                 R16_thread);
 589       __ cmpdi(CCR0, R0, 0);
 590       __ bne(CCR0, L);
 591       __ stop("StubRoutines::throw_exception: no pending exception");
 592       __ bind(L);
 593     }
 594 #endif
 595 
 596     // Pop frame.
 597     __ pop_frame();
 598 
 599     __ restore_LR_CR(R11_scratch1);
 600 
 601     __ load_const(R11_scratch1, StubRoutines::forward_exception_entry());
 602     __ mtctr(R11_scratch1);
 603     __ bctr();
 604 
 605     // Create runtime stub with OopMap.
 606     RuntimeStub* stub =
 607       RuntimeStub::new_runtime_stub(name, &code,
 608                                     /*frame_complete=*/ (int)(frame_complete_pc - start),
 609                                     frame_size_in_bytes/wordSize,
 610                                     oop_maps,
 611                                     false);
 612     return stub->entry_point();
 613   }
 614 #undef __
 615 #define __ _masm->
 616 
 617 
 618   // Support for void zero_words_aligned8(HeapWord* to, size_t count)
 619   //
 620   // Arguments:
  621   //   to:    R3_ARG1, 8-byte aligned base address
  622   //   count: R4_ARG2, number of 8-byte words (dwords) to clear
  623   //
  624   // Destroys:
  625   //   R3_ARG1-R7_ARG5, CTR, CCR0, CCR1
 626   address generate_zero_words_aligned8() {
 627     StubCodeMark mark(this, "StubRoutines", "zero_words_aligned8");
 628 
 629     // Implemented as in ClearArray.
 630     address start = __ function_entry();
 631 
 632     Register base_ptr_reg   = R3_ARG1; // tohw (needs to be 8b aligned)
 633     Register cnt_dwords_reg = R4_ARG2; // count (in dwords)
 634     Register tmp1_reg       = R5_ARG3;
 635     Register tmp2_reg       = R6_ARG4;
 636     Register zero_reg       = R7_ARG5;
 637 
 638     // Procedure for large arrays (uses data cache block zero instruction).
 639     Label dwloop, fast, fastloop, restloop, lastdword, done;
 640     int cl_size = VM_Version::L1_data_cache_line_size();
 641     int cl_dwords = cl_size >> 3;
 642     int cl_dwordaddr_bits = exact_log2(cl_dwords);
 643     int min_dcbz = 2; // Needs to be positive, apply dcbz only to at least min_dcbz cache lines.
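          // Example: with a 128-byte L1 line, cl_dwords = 16 and cl_dwordaddr_bits = 4.
          // Strategy: clear dword-by-dword up to the next cache line boundary (dwloop),
          // then clear whole cache lines with dcbz (fastloop), then clear the remaining
          // dwords (restloop) and a possible trailing dword (lastdword).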
 644 
 645     // Clear up to 128byte boundary if long enough, dword_cnt=(16-(base>>3))%16.
 646     __ dcbtst(base_ptr_reg);                    // Indicate write access to first cache line ...
 647     __ andi(tmp2_reg, cnt_dwords_reg, 1);       // to check if number of dwords is even.
 648     __ srdi_(tmp1_reg, cnt_dwords_reg, 1);      // number of double dwords
 649     __ load_const_optimized(zero_reg, 0L);      // Use as zero register.
 650 
 651     __ cmpdi(CCR1, tmp2_reg, 0);                // cnt_dwords even?
 652     __ beq(CCR0, lastdword);                    // size <= 1
 653     __ mtctr(tmp1_reg);                         // Speculatively preload counter for rest loop (>0).
 654     __ cmpdi(CCR0, cnt_dwords_reg, (min_dcbz+1)*cl_dwords-1); // Big enough to ensure >=min_dcbz cache lines are included?
 655     __ neg(tmp1_reg, base_ptr_reg);             // bit 0..58: bogus, bit 57..60: (16-(base>>3))%16, bit 61..63: 000
 656 
 657     __ blt(CCR0, restloop);                     // Too small. (<31=(2*cl_dwords)-1 is sufficient, but bigger performs better.)
 658     __ rldicl_(tmp1_reg, tmp1_reg, 64-3, 64-cl_dwordaddr_bits); // Extract number of dwords to 128byte boundary=(16-(base>>3))%16.
 659 
 660     __ beq(CCR0, fast);                         // already 128byte aligned
 661     __ mtctr(tmp1_reg);                         // Set ctr to hit 128byte boundary (0<ctr<cnt).
 662     __ subf(cnt_dwords_reg, tmp1_reg, cnt_dwords_reg); // rest (>0 since size>=256-8)
 663 
 664     // Clear in first cache line dword-by-dword if not already 128byte aligned.
 665     __ bind(dwloop);
 666       __ std(zero_reg, 0, base_ptr_reg);        // Clear 8byte aligned block.
 667       __ addi(base_ptr_reg, base_ptr_reg, 8);
 668     __ bdnz(dwloop);
 669 
 670     // clear 128byte blocks
 671     __ bind(fast);
 672     __ srdi(tmp1_reg, cnt_dwords_reg, cl_dwordaddr_bits); // loop count for 128byte loop (>0 since size>=256-8)
 673     __ andi(tmp2_reg, cnt_dwords_reg, 1);       // to check if rest even
 674 
 675     __ mtctr(tmp1_reg);                         // load counter
 676     __ cmpdi(CCR1, tmp2_reg, 0);                // rest even?
 677     __ rldicl_(tmp1_reg, cnt_dwords_reg, 63, 65-cl_dwordaddr_bits); // rest in double dwords
 678 
 679     __ bind(fastloop);
 680       __ dcbz(base_ptr_reg);                    // Clear 128byte aligned block.
 681       __ addi(base_ptr_reg, base_ptr_reg, cl_size);
 682     __ bdnz(fastloop);
 683 
 684     //__ dcbtst(base_ptr_reg);                  // Indicate write access to last cache line.
 685     __ beq(CCR0, lastdword);                    // rest<=1
 686     __ mtctr(tmp1_reg);                         // load counter
 687 
 688     // Clear rest.
 689     __ bind(restloop);
 690       __ std(zero_reg, 0, base_ptr_reg);        // Clear 8byte aligned block.
 691       __ std(zero_reg, 8, base_ptr_reg);        // Clear 8byte aligned block.
 692       __ addi(base_ptr_reg, base_ptr_reg, 16);
 693     __ bdnz(restloop);
 694 
 695     __ bind(lastdword);
 696     __ beq(CCR1, done);
 697     __ std(zero_reg, 0, base_ptr_reg);
 698     __ bind(done);
 699     __ blr();                                   // return
 700 
 701     return start;
 702   }
 703 
 704 #if !defined(PRODUCT)
 705   // Wrapper which calls oopDesc::is_oop_or_null()
 706   // Only called by MacroAssembler::verify_oop
 707   static void verify_oop_helper(const char* message, oopDesc* o) {
 708     if (!oopDesc::is_oop_or_null(o)) {
 709       fatal("%s. oop: " PTR_FORMAT, message, p2i(o));
 710     }
 711     ++ StubRoutines::_verify_oop_count;
 712   }
 713 #endif
 714 
 715   // Return address of code to be called from code generated by
 716   // MacroAssembler::verify_oop.
 717   //
  718   // Don't generate a stub; the C++ helper above is used instead.
 719   address generate_verify_oop() {
 720     // this is actually a `FunctionDescriptor*'.
 721     address start = 0;
 722 
 723 #if !defined(PRODUCT)
 724     start = CAST_FROM_FN_PTR(address, verify_oop_helper);
 725 #endif
 726 
 727     return start;
 728   }
 729 
 730   // -XX:+OptimizeFill : convert fill/copy loops into intrinsic
 731   //
  732   // The code is implemented (ported from SPARC) because we believe it benefits JVM98; however,
  733   // tracing (-XX:+TraceOptimizeFill) shows that the intrinsic replacement doesn't happen at all!
  734   //
  735   // Source code in function is_range_check_if() shows that OptimizeFill relaxed the condition for
  736   // turning on loop predication, and hence the behavior of the "array range check" and
  737   // "loop invariant check" could be influenced, which potentially boosted JVM98.
 738   //
 739   // Generate stub for disjoint short fill. If "aligned" is true, the
 740   // "to" address is assumed to be heapword aligned.
 741   //
 742   // Arguments for generated stub:
 743   //   to:    R3_ARG1
 744   //   value: R4_ARG2
 745   //   count: R5_ARG3 treated as signed
 746   //
 747   address generate_fill(BasicType t, bool aligned, const char* name) {
 748     StubCodeMark mark(this, "StubRoutines", name);
 749     address start = __ function_entry();
 750 
  751     const Register to    = R3_ARG1;   // destination array address
 752     const Register value = R4_ARG2;   // fill value
 753     const Register count = R5_ARG3;   // elements count
 754     const Register temp  = R6_ARG4;   // temp register
 755 
 756     //assert_clean_int(count, O3);    // Make sure 'count' is clean int.
 757 
 758     Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte;
 759     Label L_fill_2_bytes, L_fill_4_bytes, L_fill_elements, L_fill_32_bytes;
 760 
 761     int shift = -1;
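          // shift is chosen such that (1 << shift) elements occupy 4 bytes
          // (T_BYTE: 4, T_SHORT: 2, T_INT: 1); the element-count thresholds and
          // decrements below are expressed in terms of this shift.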
 762     switch (t) {
 763        case T_BYTE:
 764         shift = 2;
 765         // Clone bytes (zero extend not needed because store instructions below ignore high order bytes).
 766         __ rldimi(value, value, 8, 48);     // 8 bit -> 16 bit
 767         __ cmpdi(CCR0, count, 2<<shift);    // Short arrays (< 8 bytes) fill by element.
 768         __ blt(CCR0, L_fill_elements);
 769         __ rldimi(value, value, 16, 32);    // 16 bit -> 32 bit
 770         break;
 771        case T_SHORT:
 772         shift = 1;
 773         // Clone bytes (zero extend not needed because store instructions below ignore high order bytes).
 774         __ rldimi(value, value, 16, 32);    // 16 bit -> 32 bit
 775         __ cmpdi(CCR0, count, 2<<shift);    // Short arrays (< 8 bytes) fill by element.
 776         __ blt(CCR0, L_fill_elements);
 777         break;
 778       case T_INT:
 779         shift = 0;
 780         __ cmpdi(CCR0, count, 2<<shift);    // Short arrays (< 8 bytes) fill by element.
 781         __ blt(CCR0, L_fill_4_bytes);
 782         break;
 783       default: ShouldNotReachHere();
 784     }
 785 
 786     if (!aligned && (t == T_BYTE || t == T_SHORT)) {
 787       // Align source address at 4 bytes address boundary.
 788       if (t == T_BYTE) {
 789         // One byte misalignment happens only for byte arrays.
 790         __ andi_(temp, to, 1);
 791         __ beq(CCR0, L_skip_align1);
 792         __ stb(value, 0, to);
 793         __ addi(to, to, 1);
 794         __ addi(count, count, -1);
 795         __ bind(L_skip_align1);
 796       }
 797       // Two bytes misalignment happens only for byte and short (char) arrays.
 798       __ andi_(temp, to, 2);
 799       __ beq(CCR0, L_skip_align2);
 800       __ sth(value, 0, to);
 801       __ addi(to, to, 2);
 802       __ addi(count, count, -(1 << (shift - 1)));
 803       __ bind(L_skip_align2);
 804     }
 805 
 806     if (!aligned) {
 807       // Align to 8 bytes, we know we are 4 byte aligned to start.
 808       __ andi_(temp, to, 7);
 809       __ beq(CCR0, L_fill_32_bytes);
 810       __ stw(value, 0, to);
 811       __ addi(to, to, 4);
 812       __ addi(count, count, -(1 << shift));
 813       __ bind(L_fill_32_bytes);
 814     }
 815 
 816     __ li(temp, 8<<shift);                  // Prepare for 32 byte loop.
 817     // Clone bytes int->long as above.
 818     __ rldimi(value, value, 32, 0);         // 32 bit -> 64 bit
 819 
 820     Label L_check_fill_8_bytes;
 821     // Fill 32-byte chunks.
 822     __ subf_(count, temp, count);
 823     __ blt(CCR0, L_check_fill_8_bytes);
 824 
 825     Label L_fill_32_bytes_loop;
 826     __ align(32);
 827     __ bind(L_fill_32_bytes_loop);
 828 
 829     __ std(value, 0, to);
 830     __ std(value, 8, to);
 831     __ subf_(count, temp, count);           // Update count.
 832     __ std(value, 16, to);
 833     __ std(value, 24, to);
 834 
 835     __ addi(to, to, 32);
 836     __ bge(CCR0, L_fill_32_bytes_loop);
 837 
 838     __ bind(L_check_fill_8_bytes);
 839     __ add_(count, temp, count);
 840     __ beq(CCR0, L_exit);
 841     __ addic_(count, count, -(2 << shift));
 842     __ blt(CCR0, L_fill_4_bytes);
 843 
 844     //
 845     // Length is too short, just fill 8 bytes at a time.
 846     //
 847     Label L_fill_8_bytes_loop;
 848     __ bind(L_fill_8_bytes_loop);
 849     __ std(value, 0, to);
 850     __ addic_(count, count, -(2 << shift));
 851     __ addi(to, to, 8);
 852     __ bge(CCR0, L_fill_8_bytes_loop);
 853 
 854     // Fill trailing 4 bytes.
 855     __ bind(L_fill_4_bytes);
 856     __ andi_(temp, count, 1<<shift);
 857     __ beq(CCR0, L_fill_2_bytes);
 858 
 859     __ stw(value, 0, to);
 860     if (t == T_BYTE || t == T_SHORT) {
 861       __ addi(to, to, 4);
 862       // Fill trailing 2 bytes.
 863       __ bind(L_fill_2_bytes);
 864       __ andi_(temp, count, 1<<(shift-1));
 865       __ beq(CCR0, L_fill_byte);
 866       __ sth(value, 0, to);
 867       if (t == T_BYTE) {
 868         __ addi(to, to, 2);
 869         // Fill trailing byte.
 870         __ bind(L_fill_byte);
 871         __ andi_(count, count, 1);
 872         __ beq(CCR0, L_exit);
 873         __ stb(value, 0, to);
 874       } else {
 875         __ bind(L_fill_byte);
 876       }
 877     } else {
 878       __ bind(L_fill_2_bytes);
 879     }
 880     __ bind(L_exit);
 881     __ blr();
 882 
  883     // Handle fills of less than 8 bytes. T_INT is handled elsewhere.
 884     if (t == T_BYTE) {
 885       __ bind(L_fill_elements);
 886       Label L_fill_2, L_fill_4;
 887       __ andi_(temp, count, 1);
 888       __ beq(CCR0, L_fill_2);
 889       __ stb(value, 0, to);
 890       __ addi(to, to, 1);
 891       __ bind(L_fill_2);
 892       __ andi_(temp, count, 2);
 893       __ beq(CCR0, L_fill_4);
 894       __ stb(value, 0, to);
  895       __ stb(value, 1, to);
 896       __ addi(to, to, 2);
 897       __ bind(L_fill_4);
 898       __ andi_(temp, count, 4);
 899       __ beq(CCR0, L_exit);
 900       __ stb(value, 0, to);
 901       __ stb(value, 1, to);
 902       __ stb(value, 2, to);
 903       __ stb(value, 3, to);
 904       __ blr();
 905     }
 906 
 907     if (t == T_SHORT) {
 908       Label L_fill_2;
 909       __ bind(L_fill_elements);
 910       __ andi_(temp, count, 1);
 911       __ beq(CCR0, L_fill_2);
 912       __ sth(value, 0, to);
 913       __ addi(to, to, 2);
 914       __ bind(L_fill_2);
 915       __ andi_(temp, count, 2);
 916       __ beq(CCR0, L_exit);
 917       __ sth(value, 0, to);
 918       __ sth(value, 2, to);
 919       __ blr();
 920     }
 921     return start;
 922   }
 923 
 924   inline void assert_positive_int(Register count) {
 925 #ifdef ASSERT
 926     __ srdi_(R0, count, 31);
 927     __ asm_assert_eq("missing zero extend");
 928 #endif
 929   }
 930 
 931   // Generate overlap test for array copy stubs.
 932   //
 933   // Input:
 934   //   R3_ARG1    -  from
 935   //   R4_ARG2    -  to
 936   //   R5_ARG3    -  element count
 937   //
 938   void array_overlap_test(address no_overlap_target, int log2_elem_size) {
 939     Register tmp1 = R6_ARG4;
 940     Register tmp2 = R7_ARG5;
 941 
 942     assert_positive_int(R5_ARG3);
 943 
 944     __ subf(tmp1, R3_ARG1, R4_ARG2); // distance in bytes
 945     __ sldi(tmp2, R5_ARG3, log2_elem_size); // size in bytes
 946     __ cmpld(CCR0, R3_ARG1, R4_ARG2); // Use unsigned comparison!
 947     __ cmpld(CCR1, tmp1, tmp2);
 948     __ crnand(CCR0, Assembler::less, CCR1, Assembler::less);
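          // CCR0.lt := !(CCR0.lt && CCR1.lt), i.e. 1 unless src is below dst AND the
          // distance is smaller than the copy size in bytes.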
  949     // Overlaps if src is before dst and the distance is smaller than the size in bytes.
  950     // Branch to the forward copy routine otherwise (target must be within 32 kB).
 951     __ bc(Assembler::bcondCRbiIs1, Assembler::bi0(CCR0, Assembler::less), no_overlap_target);
 952 
 953     // need to copy backwards
 954   }
 955 
  956   // This is the common error exit stub for UnsafeCopyMemory.
 957   address generate_unsafecopy_common_error_exit() {
 958     address start_pc = __ pc();
 959     Register tmp1 = R6_ARG4;
  960     // The copy stub may have changed the DSCR value; reset it to the default.
 961     if (VM_Version::has_mfdscr()) {
 962       __ load_const_optimized(tmp1, VM_Version::_dscr_val);
 963       __ mtdscr(tmp1);
 964     }
 965     __ li(R3_RET, 0); // return 0
 966     __ blr();
 967     return start_pc;
 968   }
 969 
 970   // The guideline in the implementations of generate_disjoint_xxx_copy
 971   // (xxx=byte,short,int,long,oop) is to copy as many elements as possible with
 972   // single instructions, but to avoid alignment interrupts (see subsequent
  973   // comment). Furthermore, we try to minimize misaligned accesses, even
 974   // though they cause no alignment interrupt.
 975   //
 976   // In Big-Endian mode, the PowerPC architecture requires implementations to
 977   // handle automatically misaligned integer halfword and word accesses,
 978   // word-aligned integer doubleword accesses, and word-aligned floating-point
 979   // accesses. Other accesses may or may not generate an Alignment interrupt
 980   // depending on the implementation.
 981   // Alignment interrupt handling may require on the order of hundreds of cycles,
  982   // so every effort should be made to avoid misaligned memory accesses.
 983   //
 984   //
 985   // Generate stub for disjoint byte copy.  If "aligned" is true, the
 986   // "from" and "to" addresses are assumed to be heapword aligned.
 987   //
 988   // Arguments for generated stub:
 989   //      from:  R3_ARG1
 990   //      to:    R4_ARG2
 991   //      count: R5_ARG3 treated as signed
 992   //
 993   address generate_disjoint_byte_copy(bool aligned, const char * name) {
 994     StubCodeMark mark(this, "StubRoutines", name);
 995     address start = __ function_entry();
 996     assert_positive_int(R5_ARG3);
 997 
 998     Register tmp1 = R6_ARG4;
 999     Register tmp2 = R7_ARG5;
1000     Register tmp3 = R8_ARG6;
1001     Register tmp4 = R9_ARG7;
1002 
1003     VectorSRegister tmp_vsr1  = VSR1;
1004     VectorSRegister tmp_vsr2  = VSR2;
1005 
1006     Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9, l_10;
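          // Rough layout: l_9 aligns to 4 bytes byte-by-byte, l_7/l_8/l_10 handle the
          // 32-byte main loops (scalar resp. VSX), l_6/l_3 copy 4 bytes at a time,
          // l_1/l_5 copy the single-byte tail, and l_4 is the exit.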
1007     {
1008       // UnsafeCopyMemory page error: continue at UnsafeCopyMemory common_error_exit
1009       UnsafeCopyMemoryMark ucmm(this, !aligned, false);
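            // A fault inside the marked range (e.g. Unsafe.copyMemory touching an
            // unmapped page) resumes at the common error exit instead of crashing.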
1010 
1011       // Don't try anything fancy if arrays don't have many elements.
1012       __ li(tmp3, 0);
1013       __ cmpwi(CCR0, R5_ARG3, 17);
1014       __ ble(CCR0, l_6); // copy 4 at a time
1015 
1016       if (!aligned) {
1017         __ xorr(tmp1, R3_ARG1, R4_ARG2);
1018         __ andi_(tmp1, tmp1, 3);
1019         __ bne(CCR0, l_6); // If arrays don't have the same alignment mod 4, do 4 element copy.
1020 
1021         // Copy elements if necessary to align to 4 bytes.
1022         __ neg(tmp1, R3_ARG1); // Compute distance to alignment boundary.
1023         __ andi_(tmp1, tmp1, 3);
1024         __ beq(CCR0, l_2);
1025 
1026         __ subf(R5_ARG3, tmp1, R5_ARG3);
1027         __ bind(l_9);
1028         __ lbz(tmp2, 0, R3_ARG1);
1029         __ addic_(tmp1, tmp1, -1);
1030         __ stb(tmp2, 0, R4_ARG2);
1031         __ addi(R3_ARG1, R3_ARG1, 1);
1032         __ addi(R4_ARG2, R4_ARG2, 1);
1033         __ bne(CCR0, l_9);
1034 
1035         __ bind(l_2);
1036       }
1037 
1038       // copy 8 elements at a time
1039       __ xorr(tmp2, R3_ARG1, R4_ARG2); // skip if src & dest have differing alignment mod 8
1040       __ andi_(tmp1, tmp2, 7);
1041       __ bne(CCR0, l_7); // not same alignment -> to or from is aligned -> copy 8
1042 
1043       // copy a 2-element word if necessary to align to 8 bytes
1044       __ andi_(R0, R3_ARG1, 7);
1045       __ beq(CCR0, l_7);
1046 
1047       __ lwzx(tmp2, R3_ARG1, tmp3);
1048       __ addi(R5_ARG3, R5_ARG3, -4);
1049       __ stwx(tmp2, R4_ARG2, tmp3);
1050       { // FasterArrayCopy
1051         __ addi(R3_ARG1, R3_ARG1, 4);
1052         __ addi(R4_ARG2, R4_ARG2, 4);
1053       }
1054       __ bind(l_7);
1055 
1056       { // FasterArrayCopy
1057         __ cmpwi(CCR0, R5_ARG3, 31);
1058         __ ble(CCR0, l_6); // copy 2 at a time if less than 32 elements remain
1059 
1060         __ srdi(tmp1, R5_ARG3, 5);
1061         __ andi_(R5_ARG3, R5_ARG3, 31);
1062         __ mtctr(tmp1);
1063 
1064        if (!VM_Version::has_vsx()) {
1065 
1066         __ bind(l_8);
1067         // Use unrolled version for mass copying (copy 32 elements a time)
1068         // Load feeding store gets zero latency on Power6, however not on Power5.
1069         // Therefore, the following sequence is made for the good of both.
1070         __ ld(tmp1, 0, R3_ARG1);
1071         __ ld(tmp2, 8, R3_ARG1);
1072         __ ld(tmp3, 16, R3_ARG1);
1073         __ ld(tmp4, 24, R3_ARG1);
1074         __ std(tmp1, 0, R4_ARG2);
1075         __ std(tmp2, 8, R4_ARG2);
1076         __ std(tmp3, 16, R4_ARG2);
1077         __ std(tmp4, 24, R4_ARG2);
1078         __ addi(R3_ARG1, R3_ARG1, 32);
1079         __ addi(R4_ARG2, R4_ARG2, 32);
1080         __ bdnz(l_8);
1081 
1082       } else { // Processor supports VSX, so use it to mass copy.
1083 
1084         // Prefetch the data into the L2 cache.
1085         __ dcbt(R3_ARG1, 0);
1086 
1087         // If supported set DSCR pre-fetch to deepest.
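              // The low-order DPFD bits of the DSCR select the default prefetch depth;
              // 7 is the deepest setting.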
1088         if (VM_Version::has_mfdscr()) {
1089           __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
1090           __ mtdscr(tmp2);
1091         }
1092 
1093         __ li(tmp1, 16);
1094 
 1095         // Align the backbranch target to 32 bytes (not just 16) so that the
 1096         // loop, which contains < 8 instructions, fits inside a single
 1097         // i-cache sector.
1098         __ align(32);
1099 
1100         __ bind(l_10);
1101         // Use loop with VSX load/store instructions to
1102         // copy 32 elements a time.
1103         __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load src
1104         __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst
1105         __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src + 16
1106         __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
1107         __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32
 1108         __ addi(R4_ARG2, R4_ARG2, 32);       // Update dst+=32
1109         __ bdnz(l_10);                       // Dec CTR and loop if not zero.
1110 
1111         // Restore DSCR pre-fetch value.
1112         if (VM_Version::has_mfdscr()) {
1113           __ load_const_optimized(tmp2, VM_Version::_dscr_val);
1114           __ mtdscr(tmp2);
1115         }
1116 
1117       } // VSX
1118      } // FasterArrayCopy
1119 
1120       __ bind(l_6);
1121 
1122       // copy 4 elements at a time
1123       __ cmpwi(CCR0, R5_ARG3, 4);
1124       __ blt(CCR0, l_1);
1125       __ srdi(tmp1, R5_ARG3, 2);
1126       __ mtctr(tmp1); // is > 0
1127       __ andi_(R5_ARG3, R5_ARG3, 3);
1128 
1129       { // FasterArrayCopy
1130         __ addi(R3_ARG1, R3_ARG1, -4);
1131         __ addi(R4_ARG2, R4_ARG2, -4);
1132         __ bind(l_3);
1133         __ lwzu(tmp2, 4, R3_ARG1);
1134         __ stwu(tmp2, 4, R4_ARG2);
1135         __ bdnz(l_3);
1136         __ addi(R3_ARG1, R3_ARG1, 4);
1137         __ addi(R4_ARG2, R4_ARG2, 4);
1138       }
1139 
1140       // do single element copy
1141       __ bind(l_1);
1142       __ cmpwi(CCR0, R5_ARG3, 0);
1143       __ beq(CCR0, l_4);
1144 
1145       { // FasterArrayCopy
1146         __ mtctr(R5_ARG3);
1147         __ addi(R3_ARG1, R3_ARG1, -1);
1148         __ addi(R4_ARG2, R4_ARG2, -1);
1149 
1150         __ bind(l_5);
1151         __ lbzu(tmp2, 1, R3_ARG1);
1152         __ stbu(tmp2, 1, R4_ARG2);
1153         __ bdnz(l_5);
1154       }
1155     }
1156 
1157     __ bind(l_4);
1158     __ li(R3_RET, 0); // return 0
1159     __ blr();
1160 
1161     return start;
1162   }
1163 
1164   // Generate stub for conjoint byte copy.  If "aligned" is true, the
1165   // "from" and "to" addresses are assumed to be heapword aligned.
1166   //
1167   // Arguments for generated stub:
1168   //      from:  R3_ARG1
1169   //      to:    R4_ARG2
1170   //      count: R5_ARG3 treated as signed
1171   //
1172   address generate_conjoint_byte_copy(bool aligned, const char * name) {
1173     StubCodeMark mark(this, "StubRoutines", name);
1174     address start = __ function_entry();
1175     assert_positive_int(R5_ARG3);
1176 
1177     Register tmp1 = R6_ARG4;
1178     Register tmp2 = R7_ARG5;
1179     Register tmp3 = R8_ARG6;
1180 
1181     address nooverlap_target = aligned ?
1182       STUB_ENTRY(arrayof_jbyte_disjoint_arraycopy) :
1183       STUB_ENTRY(jbyte_disjoint_arraycopy);
1184 
1185     array_overlap_test(nooverlap_target, 0);
1186     // Do reverse copy. We assume the case of actual overlap is rare enough
1187     // that we don't have to optimize it.
1188     Label l_1, l_2;
1189     {
1190       // UnsafeCopyMemory page error: continue at UnsafeCopyMemory common_error_exit
1191       UnsafeCopyMemoryMark ucmm(this, !aligned, false);
1192       __ b(l_2);
1193       __ bind(l_1);
1194       __ stbx(tmp1, R4_ARG2, R5_ARG3);
1195       __ bind(l_2);
1196       __ addic_(R5_ARG3, R5_ARG3, -1);
1197       __ lbzx(tmp1, R3_ARG1, R5_ARG3);
1198       __ bge(CCR0, l_1);
1199     }
1200     __ li(R3_RET, 0); // return 0
1201     __ blr();
1202 
1203     return start;
1204   }
1205 
1206   // Generate stub for disjoint short copy.  If "aligned" is true, the
1207   // "from" and "to" addresses are assumed to be heapword aligned.
1208   //
1209   // Arguments for generated stub:
1210   //      from:  R3_ARG1
1211   //      to:    R4_ARG2
1212   //  elm.count: R5_ARG3 treated as signed
1213   //
1214   // Strategy for aligned==true:
1215   //
1216   //  If length <= 9:
1217   //     1. copy 2 elements at a time (l_6)
1218   //     2. copy last element if original element count was odd (l_1)
1219   //
1220   //  If length > 9:
1221   //     1. copy 4 elements at a time until less than 4 elements are left (l_7)
1222   //     2. copy 2 elements at a time until less than 2 elements are left (l_6)
1223   //     3. copy last element if one was left in step 2. (l_1)
1224   //
1225   //
1226   // Strategy for aligned==false:
1227   //
1228   //  If length <= 9: same as aligned==true case, but NOTE: load/stores
1229   //                  can be unaligned (see comment below)
1230   //
1231   //  If length > 9:
1232   //     1. continue with step 6. if the alignment of from and to mod 4
1233   //        is different.
1234   //     2. align from and to to 4 bytes by copying 1 element if necessary
1235   //     3. at l_2 from and to are 4 byte aligned; continue with
1236   //        5. if they cannot be aligned to 8 bytes because they have
1237   //        got different alignment mod 8.
1238   //     4. at this point we know that both, from and to, have the same
1239   //        alignment mod 8, now copy one element if necessary to get
1240   //        8 byte alignment of from and to.
1241   //     5. copy 4 elements at a time until less than 4 elements are
1242   //        left; depending on step 3. all load/stores are aligned or
1243   //        either all loads or all stores are unaligned.
1244   //     6. copy 2 elements at a time until less than 2 elements are
1245   //        left (l_6); arriving here from step 1., there is a chance
1246   //        that all accesses are unaligned.
1247   //     7. copy last element if one was left in step 6. (l_1)
1248   //
1249   //  There are unaligned data accesses using integer load/store
1250   //  instructions in this stub. POWER allows such accesses.
1251   //
1252   //  According to the manuals (PowerISA_V2.06_PUBLIC, Book II,
1253   //  Chapter 2: Effect of Operand Placement on Performance) unaligned
1254   //  integer load/stores have good performance. Only unaligned
1255   //  floating point load/stores can have poor performance.
1256   //
1257   //  TODO:
1258   //
1259   //  1. check if aligning the backbranch target of loops is beneficial
1260   //
1261   address generate_disjoint_short_copy(bool aligned, const char * name) {
1262     StubCodeMark mark(this, "StubRoutines", name);
1263 
1264     Register tmp1 = R6_ARG4;
1265     Register tmp2 = R7_ARG5;
1266     Register tmp3 = R8_ARG6;
1267     Register tmp4 = R9_ARG7;
1268 
1269     VectorSRegister tmp_vsr1  = VSR1;
1270     VectorSRegister tmp_vsr2  = VSR2;
1271 
1272     address start = __ function_entry();
1273     assert_positive_int(R5_ARG3);
1274 
1275     Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9;
1276     {
1277       // UnsafeCopyMemory page error: continue at UnsafeCopyMemory common_error_exit
1278       UnsafeCopyMemoryMark ucmm(this, !aligned, false);
1279       // don't try anything fancy if arrays don't have many elements
1280       __ li(tmp3, 0);
1281       __ cmpwi(CCR0, R5_ARG3, 9);
1282       __ ble(CCR0, l_6); // copy 2 at a time
1283 
1284       if (!aligned) {
1285         __ xorr(tmp1, R3_ARG1, R4_ARG2);
1286         __ andi_(tmp1, tmp1, 3);
1287         __ bne(CCR0, l_6); // if arrays don't have the same alignment mod 4, do 2 element copy
1288 
1289         // At this point it is guaranteed that both, from and to have the same alignment mod 4.
1290 
1291         // Copy 1 element if necessary to align to 4 bytes.
1292         __ andi_(tmp1, R3_ARG1, 3);
1293         __ beq(CCR0, l_2);
1294 
1295         __ lhz(tmp2, 0, R3_ARG1);
1296         __ addi(R3_ARG1, R3_ARG1, 2);
1297         __ sth(tmp2, 0, R4_ARG2);
1298         __ addi(R4_ARG2, R4_ARG2, 2);
1299         __ addi(R5_ARG3, R5_ARG3, -1);
1300         __ bind(l_2);
1301 
1302         // At this point the positions of both, from and to, are at least 4 byte aligned.
1303 
1304         // Copy 4 elements at a time.
1305         // Align to 8 bytes, but only if both, from and to, have same alignment mod 8.
1306         __ xorr(tmp2, R3_ARG1, R4_ARG2);
1307         __ andi_(tmp1, tmp2, 7);
1308         __ bne(CCR0, l_7); // not same alignment mod 8 -> copy 4, either from or to will be unaligned
1309 
1310         // Copy a 2-element word if necessary to align to 8 bytes.
1311         __ andi_(R0, R3_ARG1, 7);
1312         __ beq(CCR0, l_7);
1313 
1314         __ lwzx(tmp2, R3_ARG1, tmp3);
1315         __ addi(R5_ARG3, R5_ARG3, -2);
1316         __ stwx(tmp2, R4_ARG2, tmp3);
1317         { // FasterArrayCopy
1318           __ addi(R3_ARG1, R3_ARG1, 4);
1319           __ addi(R4_ARG2, R4_ARG2, 4);
1320         }
1321       }
1322 
1323       __ bind(l_7);
1324 
1325       // Copy 4 elements at a time; either the loads or the stores can
1326       // be unaligned if aligned == false.
1327 
1328       { // FasterArrayCopy
1329         __ cmpwi(CCR0, R5_ARG3, 15);
1330         __ ble(CCR0, l_6); // copy 2 at a time if less than 16 elements remain
1331 
1332         __ srdi(tmp1, R5_ARG3, 4);
1333         __ andi_(R5_ARG3, R5_ARG3, 15);
1334         __ mtctr(tmp1);
1335 
1336         if (!VM_Version::has_vsx()) {
1337 
1338           __ bind(l_8);
1339           // Use unrolled version for mass copying (copy 16 elements a time).
1340           // Load feeding store gets zero latency on Power6, however not on Power5.
1341           // Therefore, the following sequence is made for the good of both.
1342           __ ld(tmp1, 0, R3_ARG1);
1343           __ ld(tmp2, 8, R3_ARG1);
1344           __ ld(tmp3, 16, R3_ARG1);
1345           __ ld(tmp4, 24, R3_ARG1);
1346           __ std(tmp1, 0, R4_ARG2);
1347           __ std(tmp2, 8, R4_ARG2);
1348           __ std(tmp3, 16, R4_ARG2);
1349           __ std(tmp4, 24, R4_ARG2);
1350           __ addi(R3_ARG1, R3_ARG1, 32);
1351           __ addi(R4_ARG2, R4_ARG2, 32);
1352           __ bdnz(l_8);
1353 
1354         } else { // Processor supports VSX, so use it to mass copy.
1355 
1356           // Prefetch src data into L2 cache.
1357           __ dcbt(R3_ARG1, 0);
1358 
1359           // If supported set DSCR pre-fetch to deepest.
1360           if (VM_Version::has_mfdscr()) {
1361             __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
1362             __ mtdscr(tmp2);
1363           }
1364           __ li(tmp1, 16);
1365 
 1366           // Align the backbranch target to 32 bytes (not just 16) so that the
 1367           // loop, which contains < 8 instructions, fits inside a single
 1368           // i-cache sector.
1369           __ align(32);
1370 
1371           __ bind(l_9);
1372           // Use loop with VSX load/store instructions to
1373           // copy 16 elements a time.
1374           __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load from src.
1375           __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst.
1376           __ lxvd2x(tmp_vsr2, R3_ARG1, tmp1);  // Load from src + 16.
1377           __ stxvd2x(tmp_vsr2, R4_ARG2, tmp1); // Store to dst + 16.
1378           __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32.
 1379           __ addi(R4_ARG2, R4_ARG2, 32);       // Update dst+=32.
1380           __ bdnz(l_9);                        // Dec CTR and loop if not zero.
1381 
1382           // Restore DSCR pre-fetch value.
1383           if (VM_Version::has_mfdscr()) {
1384             __ load_const_optimized(tmp2, VM_Version::_dscr_val);
1385             __ mtdscr(tmp2);
1386           }
1387 
1388         }
1389       } // FasterArrayCopy
1390       __ bind(l_6);
1391 
1392       // copy 2 elements at a time
1393       { // FasterArrayCopy
1394         __ cmpwi(CCR0, R5_ARG3, 2);
1395         __ blt(CCR0, l_1);
1396         __ srdi(tmp1, R5_ARG3, 1);
1397         __ andi_(R5_ARG3, R5_ARG3, 1);
1398 
1399         __ addi(R3_ARG1, R3_ARG1, -4);
1400         __ addi(R4_ARG2, R4_ARG2, -4);
1401         __ mtctr(tmp1);
1402 
1403         __ bind(l_3);
1404         __ lwzu(tmp2, 4, R3_ARG1);
1405         __ stwu(tmp2, 4, R4_ARG2);
1406         __ bdnz(l_3);
1407 
1408         __ addi(R3_ARG1, R3_ARG1, 4);
1409         __ addi(R4_ARG2, R4_ARG2, 4);
1410       }
1411 
1412       // do single element copy
1413       __ bind(l_1);
1414       __ cmpwi(CCR0, R5_ARG3, 0);
1415       __ beq(CCR0, l_4);
1416 
1417       { // FasterArrayCopy
1418         __ mtctr(R5_ARG3);
1419         __ addi(R3_ARG1, R3_ARG1, -2);
1420         __ addi(R4_ARG2, R4_ARG2, -2);
1421 
1422         __ bind(l_5);
1423         __ lhzu(tmp2, 2, R3_ARG1);
1424         __ sthu(tmp2, 2, R4_ARG2);
1425         __ bdnz(l_5);
1426       }
1427     }
1428 
1429     __ bind(l_4);
1430     __ li(R3_RET, 0); // return 0
1431     __ blr();
1432 
1433     return start;
1434   }
1435 
1436   // Generate stub for conjoint short copy.  If "aligned" is true, the
1437   // "from" and "to" addresses are assumed to be heapword aligned.
1438   //
1439   // Arguments for generated stub:
1440   //      from:  R3_ARG1
1441   //      to:    R4_ARG2
1442   //      count: R5_ARG3 treated as signed
1443   //
1444   address generate_conjoint_short_copy(bool aligned, const char * name) {
1445     StubCodeMark mark(this, "StubRoutines", name);
1446     address start = __ function_entry();
1447     assert_positive_int(R5_ARG3);
1448 
1449     Register tmp1 = R6_ARG4;
1450     Register tmp2 = R7_ARG5;
1451     Register tmp3 = R8_ARG6;
1452 
1453     address nooverlap_target = aligned ?
1454       STUB_ENTRY(arrayof_jshort_disjoint_arraycopy) :
1455       STUB_ENTRY(jshort_disjoint_arraycopy);
1456 
1457     array_overlap_test(nooverlap_target, 1);
1458 
1459     Label l_1, l_2;
1460     {
1461       // UnsafeCopyMemory page error: continue at UnsafeCopyMemory common_error_exit
1462       UnsafeCopyMemoryMark ucmm(this, !aligned, false);
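           // Copy backwards: tmp1 starts at count*2 (bytes) and is decremented
           // before each load, so elements are copied from the last one down to
           // the first; a destination that overlaps the upper part of the source
           // is therefore handled correctly.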
1463       __ sldi(tmp1, R5_ARG3, 1);
1464       __ b(l_2);
1465       __ bind(l_1);
1466       __ sthx(tmp2, R4_ARG2, tmp1);
1467       __ bind(l_2);
1468       __ addic_(tmp1, tmp1, -2);
1469       __ lhzx(tmp2, R3_ARG1, tmp1);
1470       __ bge(CCR0, l_1);
1471     }
1472     __ li(R3_RET, 0); // return 0
1473     __ blr();
1474 
1475     return start;
1476   }
1477 
1478   // Generate core code for disjoint int copy (and oop copy on 32-bit).  If "aligned"
1479   // is true, the "from" and "to" addresses are assumed to be heapword aligned.
1480   //
1481   // Arguments:
1482   //      from:  R3_ARG1
1483   //      to:    R4_ARG2
1484   //      count: R5_ARG3 treated as signed
1485   //
1486   void generate_disjoint_int_copy_core(bool aligned) {
1487     Register tmp1 = R6_ARG4;
1488     Register tmp2 = R7_ARG5;
1489     Register tmp3 = R8_ARG6;
1490     Register tmp4 = R0;
1491 
1492     VectorSRegister tmp_vsr1  = VSR1;
1493     VectorSRegister tmp_vsr2  = VSR2;
1494 
1495     Label l_1, l_2, l_3, l_4, l_5, l_6, l_7;
1496 
1497     // for short arrays, just do single element copy
1498     __ li(tmp3, 0);
1499     __ cmpwi(CCR0, R5_ARG3, 5);
1500     __ ble(CCR0, l_2);
1501 
1502     if (!aligned) {
1503         // check if arrays have same alignment mod 8.
1504         __ xorr(tmp1, R3_ARG1, R4_ARG2);
1505         __ andi_(R0, tmp1, 7);
1506         // Not the same alignment, but ld and std just need to be 4 byte aligned.
1507         __ bne(CCR0, l_4); // to OR from is 8 byte aligned -> copy 2 at a time
1508 
1509         // copy 1 element to align to and from on an 8 byte boundary
1510         __ andi_(R0, R3_ARG1, 7);
1511         __ beq(CCR0, l_4);
1512 
1513         __ lwzx(tmp2, R3_ARG1, tmp3);
1514         __ addi(R5_ARG3, R5_ARG3, -1);
1515         __ stwx(tmp2, R4_ARG2, tmp3);
1516         { // FasterArrayCopy
1517           __ addi(R3_ARG1, R3_ARG1, 4);
1518           __ addi(R4_ARG2, R4_ARG2, 4);
1519         }
1520         __ bind(l_4);
1521       }
1522 
1523     { // FasterArrayCopy
1524       __ cmpwi(CCR0, R5_ARG3, 7);
1525       __ ble(CCR0, l_2); // copy 1 at a time if less than 8 elements remain
1526 
1527       __ srdi(tmp1, R5_ARG3, 3);
1528       __ andi_(R5_ARG3, R5_ARG3, 7);
1529       __ mtctr(tmp1);
1530 
1531      if (!VM_Version::has_vsx()) {
1532 
1533       __ bind(l_6);
1534       // Use unrolled version for mass copying (copy 8 elements at a time).
1535       // A load feeding a store gets zero latency on Power6, but not on Power5.
1536       // Therefore, the following sequence is made for the good of both.
1537       __ ld(tmp1, 0, R3_ARG1);
1538       __ ld(tmp2, 8, R3_ARG1);
1539       __ ld(tmp3, 16, R3_ARG1);
1540       __ ld(tmp4, 24, R3_ARG1);
1541       __ std(tmp1, 0, R4_ARG2);
1542       __ std(tmp2, 8, R4_ARG2);
1543       __ std(tmp3, 16, R4_ARG2);
1544       __ std(tmp4, 24, R4_ARG2);
1545       __ addi(R3_ARG1, R3_ARG1, 32);
1546       __ addi(R4_ARG2, R4_ARG2, 32);
1547       __ bdnz(l_6);
1548 
1549     } else { // Processor supports VSX, so use it to mass copy.
1550 
1551       // Prefetch the data into the L2 cache.
1552       __ dcbt(R3_ARG1, 0);
1553 
1554       // If supported set DSCR pre-fetch to deepest.
1555       if (VM_Version::has_mfdscr()) {
1556         __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
1557         __ mtdscr(tmp2);
1558       }
1559 
1560       __ li(tmp1, 16);
1561 
1562       // The backbranch target is aligned to 32 bytes rather than 16:
1563       // the loop contains fewer than 8 instructions (at most 32 bytes),
1564       // so 32-byte alignment keeps it within a single i-cache sector.
1565       __ align(32);
1566 
1567       __ bind(l_7);
1568       // Use a loop with VSX load/store instructions to
1569       // copy 8 elements at a time.
1570       __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load src
1571       __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst
1572       __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src + 16
1573       __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
1574       __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32
1575       __ addi(R4_ARG2, R4_ARG2, 32);       // Update dst+=32
1576       __ bdnz(l_7);                        // Dec CTR and loop if not zero.
1577 
1578       // Restore DSCR pre-fetch value.
1579       if (VM_Version::has_mfdscr()) {
1580         __ load_const_optimized(tmp2, VM_Version::_dscr_val);
1581         __ mtdscr(tmp2);
1582       }
1583 
1584     } // VSX
1585    } // FasterArrayCopy
1586 
1587     // copy 1 element at a time
1588     __ bind(l_2);
1589     __ cmpwi(CCR0, R5_ARG3, 0);
1590     __ beq(CCR0, l_1);
1591 
1592     { // FasterArrayCopy
1593       __ mtctr(R5_ARG3);
1594       __ addi(R3_ARG1, R3_ARG1, -4);
1595       __ addi(R4_ARG2, R4_ARG2, -4);
1596 
1597       __ bind(l_3);
1598       __ lwzu(tmp2, 4, R3_ARG1);
1599       __ stwu(tmp2, 4, R4_ARG2);
1600       __ bdnz(l_3);
1601     }
1602 
1603     __ bind(l_1);
1604     return;
1605   }
1606 
1607   // Generate stub for disjoint int copy.  If "aligned" is true, the
1608   // "from" and "to" addresses are assumed to be heapword aligned.
1609   //
1610   // Arguments for generated stub:
1611   //      from:  R3_ARG1
1612   //      to:    R4_ARG2
1613   //      count: R5_ARG3 treated as signed
1614   //
1615   address generate_disjoint_int_copy(bool aligned, const char * name) {
1616     StubCodeMark mark(this, "StubRoutines", name);
1617     address start = __ function_entry();
1618     assert_positive_int(R5_ARG3);
1619     {
1620       // UnsafeCopyMemory page error: continue at UnsafeCopyMemory common_error_exit
1621       UnsafeCopyMemoryMark ucmm(this, !aligned, false);
1622       generate_disjoint_int_copy_core(aligned);
1623     }
1624     __ li(R3_RET, 0); // return 0
1625     __ blr();
1626     return start;
1627   }
1628 
1629   // Generate core code for conjoint int copy (and oop copy on
1630   // 32-bit).  If "aligned" is true, the "from" and "to" addresses
1631   // are assumed to be heapword aligned.
1632   //
1633   // Arguments:
1634   //      from:  R3_ARG1
1635   //      to:    R4_ARG2
1636   //      count: R5_ARG3 treated as signed
1637   //
1638   void generate_conjoint_int_copy_core(bool aligned) {
1639     // Do reverse copy.  We assume the case of actual overlap is rare enough
1640     // that we don't have to optimize it.
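         // Conceptually (illustrative sketch only):
         //   for (int i = count - 1; i >= 0; i--) to[i] = from[i];
         // i.e. copy from the highest element downwards so that an overlapping
         // destination above the source is never clobbered before it is read.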
1641 
1642     Label l_1, l_2, l_3, l_4, l_5, l_6, l_7;
1643 
1644     Register tmp1 = R6_ARG4;
1645     Register tmp2 = R7_ARG5;
1646     Register tmp3 = R8_ARG6;
1647     Register tmp4 = R0;
1648 
1649     VectorSRegister tmp_vsr1  = VSR1;
1650     VectorSRegister tmp_vsr2  = VSR2;
1651 
1652     { // FasterArrayCopy
1653       __ cmpwi(CCR0, R5_ARG3, 0);
1654       __ beq(CCR0, l_6);
1655 
1656       __ sldi(R5_ARG3, R5_ARG3, 2);
1657       __ add(R3_ARG1, R3_ARG1, R5_ARG3);
1658       __ add(R4_ARG2, R4_ARG2, R5_ARG3);
1659       __ srdi(R5_ARG3, R5_ARG3, 2);
1660 
1661       if (!aligned) {
1662         // check if arrays have same alignment mod 8.
1663         __ xorr(tmp1, R3_ARG1, R4_ARG2);
1664         __ andi_(R0, tmp1, 7);
1665         // Not the same alignment, but ld and std just need to be 4 byte aligned.
1666         __ bne(CCR0, l_7); // to OR from is 8 byte aligned -> copy 2 at a time
1667 
1668         // copy 1 element to align to and from on an 8 byte boundary
1669         __ andi_(R0, R3_ARG1, 7);
1670         __ beq(CCR0, l_7);
1671 
1672         __ addi(R3_ARG1, R3_ARG1, -4);
1673         __ addi(R4_ARG2, R4_ARG2, -4);
1674         __ addi(R5_ARG3, R5_ARG3, -1);
1675         __ lwzx(tmp2, R3_ARG1);
1676         __ stwx(tmp2, R4_ARG2);
1677         __ bind(l_7);
1678       }
1679 
1680       __ cmpwi(CCR0, R5_ARG3, 7);
1681       __ ble(CCR0, l_5); // copy 1 at a time if less than 8 elements remain
1682 
1683       __ srdi(tmp1, R5_ARG3, 3);
1684       __ andi(R5_ARG3, R5_ARG3, 7);
1685       __ mtctr(tmp1);
1686 
1687      if (!VM_Version::has_vsx()) {
1688       __ bind(l_4);
1689       // Use unrolled version for mass copying (copy 8 elements at a time).
1690       // A load feeding a store gets zero latency on Power6, but not on Power5.
1691       // Therefore, the following sequence is made for the good of both.
1692       __ addi(R3_ARG1, R3_ARG1, -32);
1693       __ addi(R4_ARG2, R4_ARG2, -32);
1694       __ ld(tmp4, 24, R3_ARG1);
1695       __ ld(tmp3, 16, R3_ARG1);
1696       __ ld(tmp2, 8, R3_ARG1);
1697       __ ld(tmp1, 0, R3_ARG1);
1698       __ std(tmp4, 24, R4_ARG2);
1699       __ std(tmp3, 16, R4_ARG2);
1700       __ std(tmp2, 8, R4_ARG2);
1701       __ std(tmp1, 0, R4_ARG2);
1702       __ bdnz(l_4);
1703      } else {  // Processor supports VSX, so use it to mass copy.
1704       // Prefetch the data into the L2 cache.
1705       __ dcbt(R3_ARG1, 0);
1706 
1707       // If supported set DSCR pre-fetch to deepest.
1708       if (VM_Version::has_mfdscr()) {
1709         __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
1710         __ mtdscr(tmp2);
1711       }
1712 
1713       __ li(tmp1, 16);
1714 
1715       // The backbranch target is aligned to 32 bytes rather than 16:
1716       // the loop contains fewer than 8 instructions (at most 32 bytes),
1717       // so 32-byte alignment keeps it within a single i-cache sector.
1718       __ align(32);
1719 
1720       __ bind(l_4);
1721       // Use a loop with VSX load/store instructions to
1722       // copy 8 elements at a time.
1723       __ addi(R3_ARG1, R3_ARG1, -32);      // Update src-=32
1724       __ addi(R4_ARG2, R4_ARG2, -32);      // Update dst-=32
1725       __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src+16
1726       __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load src
1727       __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst+16
1728       __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst
1729       __ bdnz(l_4);
1730 
1731       // Restore DSCR pre-fetch value.
1732       if (VM_Version::has_mfdscr()) {
1733         __ load_const_optimized(tmp2, VM_Version::_dscr_val);
1734         __ mtdscr(tmp2);
1735       }
1736      }
1737 
1738       __ cmpwi(CCR0, R5_ARG3, 0);
1739       __ beq(CCR0, l_6);
1740 
1741       __ bind(l_5);
1742       __ mtctr(R5_ARG3);
1743       __ bind(l_3);
1744       __ lwz(R0, -4, R3_ARG1);
1745       __ stw(R0, -4, R4_ARG2);
1746       __ addi(R3_ARG1, R3_ARG1, -4);
1747       __ addi(R4_ARG2, R4_ARG2, -4);
1748       __ bdnz(l_3);
1749 
1750       __ bind(l_6);
1751     }
1752   }
1753 
1754   // Generate stub for conjoint int copy.  If "aligned" is true, the
1755   // "from" and "to" addresses are assumed to be heapword aligned.
1756   //
1757   // Arguments for generated stub:
1758   //      from:  R3_ARG1
1759   //      to:    R4_ARG2
1760   //      count: R5_ARG3 treated as signed
1761   //
1762   address generate_conjoint_int_copy(bool aligned, const char * name) {
1763     StubCodeMark mark(this, "StubRoutines", name);
1764     address start = __ function_entry();
1765     assert_positive_int(R5_ARG3);
1766     address nooverlap_target = aligned ?
1767       STUB_ENTRY(arrayof_jint_disjoint_arraycopy) :
1768       STUB_ENTRY(jint_disjoint_arraycopy);
1769 
1770     array_overlap_test(nooverlap_target, 2);
1771     {
1772       // UnsafeCopyMemory page error: continue at UnsafeCopyMemory common_error_exit
1773       UnsafeCopyMemoryMark ucmm(this, !aligned, false);
1774       generate_conjoint_int_copy_core(aligned);
1775     }
1776 
1777     __ li(R3_RET, 0); // return 0
1778     __ blr();
1779 
1780     return start;
1781   }
1782 
1783   // Generate core code for disjoint long copy (and oop copy on
1784   // 64-bit).  If "aligned" is true, the "from" and "to" addresses
1785   // are assumed to be heapword aligned.
1786   //
1787   // Arguments:
1788   //      from:  R3_ARG1
1789   //      to:    R4_ARG2
1790   //      count: R5_ARG3 treated as signed
1791   //
1792   void generate_disjoint_long_copy_core(bool aligned) {
1793     Register tmp1 = R6_ARG4;
1794     Register tmp2 = R7_ARG5;
1795     Register tmp3 = R8_ARG6;
1796     Register tmp4 = R0;
1797 
1798     Label l_1, l_2, l_3, l_4, l_5;
1799 
1800     VectorSRegister tmp_vsr1  = VSR1;
1801     VectorSRegister tmp_vsr2  = VSR2;
1802 
1803     { // FasterArrayCopy
1804       __ cmpwi(CCR0, R5_ARG3, 3);
1805       __ ble(CCR0, l_3); // copy 1 at a time if less than 4 elements remain
1806 
1807       __ srdi(tmp1, R5_ARG3, 2);
1808       __ andi_(R5_ARG3, R5_ARG3, 3);
1809       __ mtctr(tmp1);
1810 
1811     if (!VM_Version::has_vsx()) {
1812       __ bind(l_4);
1813       // Use unrolled version for mass copying (copy 4 elements at a time).
1814       // A load feeding a store gets zero latency on Power6, but not on Power5.
1815       // Therefore, the following sequence is made for the good of both.
1816       __ ld(tmp1, 0, R3_ARG1);
1817       __ ld(tmp2, 8, R3_ARG1);
1818       __ ld(tmp3, 16, R3_ARG1);
1819       __ ld(tmp4, 24, R3_ARG1);
1820       __ std(tmp1, 0, R4_ARG2);
1821       __ std(tmp2, 8, R4_ARG2);
1822       __ std(tmp3, 16, R4_ARG2);
1823       __ std(tmp4, 24, R4_ARG2);
1824       __ addi(R3_ARG1, R3_ARG1, 32);
1825       __ addi(R4_ARG2, R4_ARG2, 32);
1826       __ bdnz(l_4);
1827 
1828     } else { // Processor supports VSX, so use it to mass copy.
1829 
1830       // Prefetch the data into the L2 cache.
1831       __ dcbt(R3_ARG1, 0);
1832 
1833       // If supported set DSCR pre-fetch to deepest.
1834       if (VM_Version::has_mfdscr()) {
1835         __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
1836         __ mtdscr(tmp2);
1837       }
1838 
1839       __ li(tmp1, 16);
1840 
1841       // The backbranch target is aligned to 32 bytes rather than 16:
1842       // the loop contains fewer than 8 instructions (at most 32 bytes),
1843       // so 32-byte alignment keeps it within a single i-cache sector.
1844       __ align(32);
1845 
1846       __ bind(l_5);
1847       // Use a loop with VSX load/store instructions to
1848       // copy 4 elements at a time.
1849       __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load src
1850       __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst
1851       __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src + 16
1852       __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
1853       __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32
1854       __ addi(R4_ARG2, R4_ARG2, 32);       // Update dst+=32
1855       __ bdnz(l_5);                        // Dec CTR and loop if not zero.
1856 
1857       // Restore DSCR pre-fetch value.
1858       if (VM_Version::has_mfdscr()) {
1859         __ load_const_optimized(tmp2, VM_Version::_dscr_val);
1860         __ mtdscr(tmp2);
1861       }
1862 
1863     } // VSX
1864    } // FasterArrayCopy
1865 
1866     // copy 1 element at a time
1867     __ bind(l_3);
1868     __ cmpwi(CCR0, R5_ARG3, 0);
1869     __ beq(CCR0, l_1);
1870 
1871     { // FasterArrayCopy
1872       __ mtctr(R5_ARG3);
1873       __ addi(R3_ARG1, R3_ARG1, -8);
1874       __ addi(R4_ARG2, R4_ARG2, -8);
1875 
1876       __ bind(l_2);
1877       __ ldu(R0, 8, R3_ARG1);
1878       __ stdu(R0, 8, R4_ARG2);
1879       __ bdnz(l_2);
1880 
1881     }
1882     __ bind(l_1);
1883   }
1884 
1885   // Generate stub for disjoint long copy.  If "aligned" is true, the
1886   // "from" and "to" addresses are assumed to be heapword aligned.
1887   //
1888   // Arguments for generated stub:
1889   //      from:  R3_ARG1
1890   //      to:    R4_ARG2
1891   //      count: R5_ARG3 treated as signed
1892   //
1893   address generate_disjoint_long_copy(bool aligned, const char * name) {
1894     StubCodeMark mark(this, "StubRoutines", name);
1895     address start = __ function_entry();
1896     assert_positive_int(R5_ARG3);
1897     {
1898       // UnsafeCopyMemory page error: continue at UnsafeCopyMemory common_error_exit
1899       UnsafeCopyMemoryMark ucmm(this, !aligned, false);
1900       generate_disjoint_long_copy_core(aligned);
1901     }
1902     __ li(R3_RET, 0); // return 0
1903     __ blr();
1904 
1905     return start;
1906   }
1907 
1908   // Generate core code for conjoint long copy (and oop copy on
1909   // 64-bit).  If "aligned" is true, the "from" and "to" addresses
1910   // are assumed to be heapword aligned.
1911   //
1912   // Arguments:
1913   //      from:  R3_ARG1
1914   //      to:    R4_ARG2
1915   //      count: R5_ARG3 treated as signed
1916   //
1917   void generate_conjoint_long_copy_core(bool aligned) {
1918     Register tmp1 = R6_ARG4;
1919     Register tmp2 = R7_ARG5;
1920     Register tmp3 = R8_ARG6;
1921     Register tmp4 = R0;
1922 
1923     VectorSRegister tmp_vsr1  = VSR1;
1924     VectorSRegister tmp_vsr2  = VSR2;
1925 
1926     Label l_1, l_2, l_3, l_4, l_5;
1927 
1928     __ cmpwi(CCR0, R5_ARG3, 0);
1929     __ beq(CCR0, l_1);
1930 
1931     { // FasterArrayCopy
1932       __ sldi(R5_ARG3, R5_ARG3, 3);
1933       __ add(R3_ARG1, R3_ARG1, R5_ARG3);
1934       __ add(R4_ARG2, R4_ARG2, R5_ARG3);
1935       __ srdi(R5_ARG3, R5_ARG3, 3);
1936 
1937       __ cmpwi(CCR0, R5_ARG3, 3);
1938       __ ble(CCR0, l_5); // copy 1 at a time if less than 4 elements remain
1939 
1940       __ srdi(tmp1, R5_ARG3, 2);
1941       __ andi(R5_ARG3, R5_ARG3, 3);
1942       __ mtctr(tmp1);
1943 
1944      if (!VM_Version::has_vsx()) {
1945       __ bind(l_4);
1946       // Use unrolled version for mass copying (copy 4 elements at a time).
1947       // A load feeding a store gets zero latency on Power6, but not on Power5.
1948       // Therefore, the following sequence is made for the good of both.
1949       __ addi(R3_ARG1, R3_ARG1, -32);
1950       __ addi(R4_ARG2, R4_ARG2, -32);
1951       __ ld(tmp4, 24, R3_ARG1);
1952       __ ld(tmp3, 16, R3_ARG1);
1953       __ ld(tmp2, 8, R3_ARG1);
1954       __ ld(tmp1, 0, R3_ARG1);
1955       __ std(tmp4, 24, R4_ARG2);
1956       __ std(tmp3, 16, R4_ARG2);
1957       __ std(tmp2, 8, R4_ARG2);
1958       __ std(tmp1, 0, R4_ARG2);
1959       __ bdnz(l_4);
1960      } else { // Processor supports VSX, so use it to mass copy.
1961       // Prefetch the data into the L2 cache.
1962       __ dcbt(R3_ARG1, 0);
1963 
1964       // If supported set DSCR pre-fetch to deepest.
1965       if (VM_Version::has_mfdscr()) {
1966         __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
1967         __ mtdscr(tmp2);
1968       }
1969 
1970       __ li(tmp1, 16);
1971 
1972       // The backbranch target is aligned to 32 bytes rather than 16:
1973       // the loop contains fewer than 8 instructions (at most 32 bytes),
1974       // so 32-byte alignment keeps it within a single i-cache sector.
1975       __ align(32);
1976 
1977       __ bind(l_4);
1978       // Use a loop with VSX load/store instructions to
1979       // copy 4 elements at a time.
1980       __ addi(R3_ARG1, R3_ARG1, -32);      // Update src-=32
1981       __ addi(R4_ARG2, R4_ARG2, -32);      // Update dst-=32
1982       __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src+16
1983       __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load src
1984       __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst+16
1985       __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst
1986       __ bdnz(l_4);
1987 
1988       // Restore DSCR pre-fetch value.
1989       if (VM_Version::has_mfdscr()) {
1990         __ load_const_optimized(tmp2, VM_Version::_dscr_val);
1991         __ mtdscr(tmp2);
1992       }
1993      }
1994 
1995       __ cmpwi(CCR0, R5_ARG3, 0);
1996       __ beq(CCR0, l_1);
1997 
1998       __ bind(l_5);
1999       __ mtctr(R5_ARG3);
2000       __ bind(l_3);
2001       __ ld(R0, -8, R3_ARG1);
2002       __ std(R0, -8, R4_ARG2);
2003       __ addi(R3_ARG1, R3_ARG1, -8);
2004       __ addi(R4_ARG2, R4_ARG2, -8);
2005       __ bdnz(l_3);
2006 
2007     }
2008     __ bind(l_1);
2009   }
2010 
2011   // Generate stub for conjoint long copy.  If "aligned" is true, the
2012   // "from" and "to" addresses are assumed to be heapword aligned.
2013   //
2014   // Arguments for generated stub:
2015   //      from:  R3_ARG1
2016   //      to:    R4_ARG2
2017   //      count: R5_ARG3 treated as signed
2018   //
2019   address generate_conjoint_long_copy(bool aligned, const char * name) {
2020     StubCodeMark mark(this, "StubRoutines", name);
2021     address start = __ function_entry();
2022     assert_positive_int(R5_ARG3);
2023     address nooverlap_target = aligned ?
2024       STUB_ENTRY(arrayof_jlong_disjoint_arraycopy) :
2025       STUB_ENTRY(jlong_disjoint_arraycopy);
2026 
2027     array_overlap_test(nooverlap_target, 3);
2028     {
2029       // UnsafeCopyMemory page error: continue at UnsafeCopyMemory common_error_exit
2030       UnsafeCopyMemoryMark ucmm(this, !aligned, false);
2031       generate_conjoint_long_copy_core(aligned);
2032     }
2033     __ li(R3_RET, 0); // return 0
2034     __ blr();
2035 
2036     return start;
2037   }
2038 
2039   // Generate stub for conjoint oop copy.  If "aligned" is true, the
2040   // "from" and "to" addresses are assumed to be heapword aligned.
2041   //
2042   // Arguments for generated stub:
2043   //      from:  R3_ARG1
2044   //      to:    R4_ARG2
2045   //      count: R5_ARG3 treated as signed
2046   //      dest_uninitialized: G1 support
2047   //
2048   address generate_conjoint_oop_copy(bool aligned, const char * name, bool dest_uninitialized) {
2049     StubCodeMark mark(this, "StubRoutines", name);
2050 
2051     address start = __ function_entry();
2052     assert_positive_int(R5_ARG3);
2053     address nooverlap_target = aligned ?
2054       STUB_ENTRY(arrayof_oop_disjoint_arraycopy) :
2055       STUB_ENTRY(oop_disjoint_arraycopy);
2056 
2057     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
2058     if (dest_uninitialized) {
2059       decorators |= IS_DEST_UNINITIALIZED;
2060     }
2061     if (aligned) {
2062       decorators |= ARRAYCOPY_ALIGNED;
2063     }
2064 
2065     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2066     bs->arraycopy_prologue(_masm, decorators, T_OBJECT, R3_ARG1, R4_ARG2, R5_ARG3, noreg, noreg);
2067 
2068     if (UseCompressedOops) {
2069       array_overlap_test(nooverlap_target, 2);
2070       generate_conjoint_int_copy_core(aligned);
2071     } else {
2072       array_overlap_test(nooverlap_target, 3);
2073       generate_conjoint_long_copy_core(aligned);
2074     }
2075 
2076     bs->arraycopy_epilogue(_masm, decorators, T_OBJECT, R4_ARG2, R5_ARG3, noreg);
2077     __ li(R3_RET, 0); // return 0
2078     __ blr();
2079     return start;
2080   }
2081 
2082   // Generate stub for disjoint oop copy.  If "aligned" is true, the
2083   // "from" and "to" addresses are assumed to be heapword aligned.
2084   //
2085   // Arguments for generated stub:
2086   //      from:  R3_ARG1
2087   //      to:    R4_ARG2
2088   //      count: R5_ARG3 treated as signed
2089   //      dest_uninitialized: G1 support
2090   //
2091   address generate_disjoint_oop_copy(bool aligned, const char * name, bool dest_uninitialized) {
2092     StubCodeMark mark(this, "StubRoutines", name);
2093     address start = __ function_entry();
2094     assert_positive_int(R5_ARG3);
2095 
2096     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
2097     if (dest_uninitialized) {
2098       decorators |= IS_DEST_UNINITIALIZED;
2099     }
2100     if (aligned) {
2101       decorators |= ARRAYCOPY_ALIGNED;
2102     }
2103 
2104     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2105     bs->arraycopy_prologue(_masm, decorators, T_OBJECT, R3_ARG1, R4_ARG2, R5_ARG3, noreg, noreg);
2106 
2107     if (UseCompressedOops) {
2108       generate_disjoint_int_copy_core(aligned);
2109     } else {
2110       generate_disjoint_long_copy_core(aligned);
2111     }
2112 
2113     bs->arraycopy_epilogue(_masm, decorators, T_OBJECT, R4_ARG2, R5_ARG3, noreg);
2114     __ li(R3_RET, 0); // return 0
2115     __ blr();
2116 
2117     return start;
2118   }
2119 
2120 
2121   // Helper for generating a dynamic type check.
2122   // Smashes only the given temp registers.
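       // Roughly: the fast path covers the common cases (exact klass match and
       // the super_check_offset slot); the slow path scans the secondary supers
       // array. Only if both fail do we fall through to L_miss.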
2123   void generate_type_check(Register sub_klass,
2124                            Register super_check_offset,
2125                            Register super_klass,
2126                            Register temp,
2127                            Label& L_success) {
2128     assert_different_registers(sub_klass, super_check_offset, super_klass);
2129 
2130     BLOCK_COMMENT("type_check:");
2131 
2132     Label L_miss;
2133 
2134     __ check_klass_subtype_fast_path(sub_klass, super_klass, temp, R0, &L_success, &L_miss, NULL,
2135                                      super_check_offset);
2136     __ check_klass_subtype_slow_path(sub_klass, super_klass, temp, R0, &L_success, NULL);
2137 
2138     // Fall through on failure!
2139     __ bind(L_miss);
2140   }
2141 
2142 
2143   //  Generate stub for checked oop copy.
2144   //
2145   // Arguments for generated stub:
2146   //      from:  R3
2147   //      to:    R4
2148   //      count: R5 treated as signed
2149   //      ckoff: R6 (super_check_offset)
2150   //      ckval: R7 (super_klass)
2151   //      ret:   R3 zero for success; (-1^K) where K is partial transfer count
2152   //
2153   address generate_checkcast_copy(const char *name, bool dest_uninitialized) {
2154 
2155     const Register R3_from   = R3_ARG1;      // source array address
2156     const Register R4_to     = R4_ARG2;      // destination array address
2157     const Register R5_count  = R5_ARG3;      // elements count
2158     const Register R6_ckoff  = R6_ARG4;      // super_check_offset
2159     const Register R7_ckval  = R7_ARG5;      // super_klass
2160 
2161     const Register R8_offset = R8_ARG6;      // loop var, with stride wordSize
2162     const Register R9_remain = R9_ARG7;      // loop var, with stride -1
2163     const Register R10_oop   = R10_ARG8;     // actual oop copied
2164     const Register R11_klass = R11_scratch1; // oop._klass
2165     const Register R12_tmp   = R12_scratch2;
2166 
2167     const Register R2_minus1 = R2;
2168 
2169     //__ align(CodeEntryAlignment);
2170     StubCodeMark mark(this, "StubRoutines", name);
2171     address start = __ function_entry();
2172 
2173     // Assert that the count is a sign-extended 64-bit int and that the arrays do not overlap.
2174 #ifdef ASSERT
2175     {
2176     assert_positive_int(R5_ARG3);
2177     const Register tmp1 = R11_scratch1, tmp2 = R12_scratch2;
2178     Label no_overlap;
2179     __ subf(tmp1, R3_ARG1, R4_ARG2); // distance in bytes
2180     __ sldi(tmp2, R5_ARG3, LogBytesPerHeapOop); // size in bytes
2181     __ cmpld(CCR0, R3_ARG1, R4_ARG2); // Use unsigned comparison!
2182     __ cmpld(CCR1, tmp1, tmp2);
2183     __ crnand(CCR0, Assembler::less, CCR1, Assembler::less);
2184     // The regions overlap iff src is below dst and the distance (dst - src) is
2185     // smaller than the copy size; otherwise branch past the stop below.
2186     __ blt(CCR0, no_overlap);
2187     __ stop("overlap in checkcast_copy");
2188     __ bind(no_overlap);
2189     }
2190 #endif
2191 
2192     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST;
2193     if (dest_uninitialized) {
2194       decorators |= IS_DEST_UNINITIALIZED;
2195     }
2196 
2197     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2198     bs->arraycopy_prologue(_masm, decorators, T_OBJECT, R3_from, R4_to, R5_count, /* preserve: */ R6_ckoff, R7_ckval);
2199 
2200     //inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr, R12_tmp, R3_RET);
2201 
2202     Label load_element, store_element, store_null, success, do_epilogue;
2203     __ or_(R9_remain, R5_count, R5_count); // Initialize loop index, and test it.
2204     __ li(R8_offset, 0);                   // Offset from start of arrays.
2205     __ li(R2_minus1, -1);
2206     __ bne(CCR0, load_element);
2207 
2208     // Empty array: Nothing to do.
2209     __ li(R3_RET, 0);           // Return 0 on (trivial) success.
2210     __ blr();
2211 
2212     // ======== begin loop ========
2213     // (Entry is load_element.)
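         // Loop structure: load the next oop (a null oop branches straight to
         // store_null), check its klass against the destination element klass,
         // then store it. A failed check falls out of the loop into the code
         // below, which reports the partial transfer count to the caller.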
2214     __ align(OptoLoopAlignment);
2215     __ bind(store_element);
2216     if (UseCompressedOops) {
2217       __ encode_heap_oop_not_null(R10_oop);
2218       __ bind(store_null);
2219       __ stw(R10_oop, R8_offset, R4_to);
2220     } else {
2221       __ bind(store_null);
2222       __ std(R10_oop, R8_offset, R4_to);
2223     }
2224 
2225     __ addi(R8_offset, R8_offset, heapOopSize);   // Step to next offset.
2226     __ add_(R9_remain, R2_minus1, R9_remain);     // Decrement the count.
2227     __ beq(CCR0, success);
2228 
2229     // ======== loop entry is here ========
2230     __ bind(load_element);
2231     __ load_heap_oop(R10_oop, R8_offset, R3_from,
2232                      R11_scratch1, R12_tmp,
2233                      MacroAssembler::PRESERVATION_FRAME_LR_GP_REGS,
2234                      AS_RAW, &store_null);
2235 
2236     __ load_klass(R11_klass, R10_oop); // Query the object klass.
2237 
2238     generate_type_check(R11_klass, R6_ckoff, R7_ckval, R12_tmp,
2239                         // Branch to this on success:
2240                         store_element);
2241     // ======== end loop ========
2242 
2243     // It was a real error; we must depend on the caller to finish the job.
2244     // Register R9_remain has number of *remaining* oops, R5_count number of *total* oops.
2245     // Emit GC store barriers for the oops we have copied (R5_count minus R9_remain),
2246     // and report their number to the caller.
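         // Note: nand(x, x) == ~x == -1 - x, which equals (-1 ^ x) for x >= 0,
         // so the nand below yields the (-1^K) encoding directly.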
2247     __ subf_(R5_count, R9_remain, R5_count);
2248     __ nand(R3_RET, R5_count, R5_count);   // report (-1^K) to caller
2249     __ bne(CCR0, do_epilogue);
2250     __ blr();
2251 
2252     __ bind(success);
2253     __ li(R3_RET, 0);
2254 
2255     __ bind(do_epilogue);
2256     bs->arraycopy_epilogue(_masm, decorators, T_OBJECT, R4_to, R5_count, /* preserve */ R3_RET);
2257 
2258     __ blr();
2259     return start;
2260   }
2261 
2262 
2263   //  Generate 'unsafe' array copy stub.
2264   //  Though just as safe as the other stubs, it takes an unscaled
2265   //  size_t argument instead of an element count.
2266   //
2267   // Arguments for generated stub:
2268   //      from:  R3
2269   //      to:    R4
2270   //      count: R5 byte count, treated as ssize_t, can be zero
2271   //
2272   // Examines the alignment of the operands and dispatches
2273   // to a long, int, short, or byte copy loop.
2274   //
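       // Roughly the following C logic (illustrative sketch only):
       //
       //   size_t bits = (size_t)from | (size_t)to | count;
       //   if      ((bits & (BytesPerLong  - 1)) == 0) long_copy (from, to, count >> LogBytesPerLong);
       //   else if ((bits & (BytesPerInt   - 1)) == 0) int_copy  (from, to, count >> LogBytesPerInt);
       //   else if ((bits & (BytesPerShort - 1)) == 0) short_copy(from, to, count >> LogBytesPerShort);
       //   else                                        byte_copy (from, to, count);
       //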
2275   address generate_unsafe_copy(const char* name,
2276                                address byte_copy_entry,
2277                                address short_copy_entry,
2278                                address int_copy_entry,
2279                                address long_copy_entry) {
2280 
2281     const Register R3_from   = R3_ARG1;      // source array address
2282     const Register R4_to     = R4_ARG2;      // destination array address
2283     const Register R5_count  = R5_ARG3;      // byte count (as long on PPC64), scaled to an element count below
2284 
2285     const Register R6_bits   = R6_ARG4;      // test copy of low bits
2286     const Register R7_tmp    = R7_ARG5;
2287 
2288     //__ align(CodeEntryAlignment);
2289     StubCodeMark mark(this, "StubRoutines", name);
2290     address start = __ function_entry();
2291 
2292     // Bump this on entry, not on exit:
2293     //inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr, R6_bits, R7_tmp);
2294 
2295     Label short_copy, int_copy, long_copy;
2296 
2297     __ orr(R6_bits, R3_from, R4_to);
2298     __ orr(R6_bits, R6_bits, R5_count);
2299     __ andi_(R0, R6_bits, (BytesPerLong-1));
2300     __ beq(CCR0, long_copy);
2301 
2302     __ andi_(R0, R6_bits, (BytesPerInt-1));
2303     __ beq(CCR0, int_copy);
2304 
2305     __ andi_(R0, R6_bits, (BytesPerShort-1));
2306     __ beq(CCR0, short_copy);
2307 
2308     // byte_copy:
2309     __ b(byte_copy_entry);
2310 
2311     __ bind(short_copy);
2312     __ srwi(R5_count, R5_count, LogBytesPerShort);
2313     __ b(short_copy_entry);
2314 
2315     __ bind(int_copy);
2316     __ srwi(R5_count, R5_count, LogBytesPerInt);
2317     __ b(int_copy_entry);
2318 
2319     __ bind(long_copy);
2320     __ srwi(R5_count, R5_count, LogBytesPerLong);
2321     __ b(long_copy_entry);
2322 
2323     return start;
2324   }
2325 
2326 
2327   // Perform range checks on the proposed arraycopy.
2328   // Kills the two temps, but nothing else.
2329   // src_pos and dst_pos must already be sign-extended by the caller.
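       //
       // Equivalent checks in pseudocode (sketch):
       //   if (src_pos + length > src.length) goto L_failed;
       //   if (dst_pos + length > dst.length) goto L_failed;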
2330   void arraycopy_range_checks(Register src,     // source array oop
2331                               Register src_pos, // source position
2332                               Register dst,     // destination array oop
2333                               Register dst_pos, // destination position
2334                               Register length,  // length of copy
2335                               Register temp1, Register temp2,
2336                               Label& L_failed) {
2337     BLOCK_COMMENT("arraycopy_range_checks:");
2338 
2339     const Register array_length = temp1;  // scratch
2340     const Register end_pos      = temp2;  // scratch
2341 
2342     //  if (src_pos + length > arrayOop(src)->length() ) FAIL;
2343     __ lwa(array_length, arrayOopDesc::length_offset_in_bytes(), src);
2344     __ add(end_pos, src_pos, length);  // src_pos + length
2345     __ cmpd(CCR0, end_pos, array_length);
2346     __ bgt(CCR0, L_failed);
2347 
2348     //  if (dst_pos + length > arrayOop(dst)->length() ) FAIL;
2349     __ lwa(array_length, arrayOopDesc::length_offset_in_bytes(), dst);
2350     __ add(end_pos, dst_pos, length);  // dst_pos + length
2351     __ cmpd(CCR0, end_pos, array_length);
2352     __ bgt(CCR0, L_failed);
2353 
2354     BLOCK_COMMENT("arraycopy_range_checks done");
2355   }
2356 
2357 
2358   //
2359   //  Generate generic array copy stubs
2360   //
2361   //  Input:
2362   //    R3    -  src oop
2363   //    R4    -  src_pos
2364   //    R5    -  dst oop
2365   //    R6    -  dst_pos
2366   //    R7    -  element count
2367   //
2368   //  Output:
2369   //    R3 ==  0  -  success
2370   //    R3 == -1  -  need to call System.arraycopy
2371   //
2372   address generate_generic_copy(const char *name,
2373                                 address entry_jbyte_arraycopy,
2374                                 address entry_jshort_arraycopy,
2375                                 address entry_jint_arraycopy,
2376                                 address entry_oop_arraycopy,
2377                                 address entry_disjoint_oop_arraycopy,
2378                                 address entry_jlong_arraycopy,
2379                                 address entry_checkcast_arraycopy) {
2380     Label L_failed, L_objArray;
2381 
2382     // Input registers
2383     const Register src       = R3_ARG1;  // source array oop
2384     const Register src_pos   = R4_ARG2;  // source position
2385     const Register dst       = R5_ARG3;  // destination array oop
2386     const Register dst_pos   = R6_ARG4;  // destination position
2387     const Register length    = R7_ARG5;  // elements count
2388 
2389     // registers used as temp
2390     const Register src_klass = R8_ARG6;  // source array klass
2391     const Register dst_klass = R9_ARG7;  // destination array klass
2392     const Register lh        = R10_ARG8; // layout handler
2393     const Register temp      = R2;
2394 
2395     //__ align(CodeEntryAlignment);
2396     StubCodeMark mark(this, "StubRoutines", name);
2397     address start = __ function_entry();
2398 
2399     // Bump this on entry, not on exit:
2400     //inc_counter_np(SharedRuntime::_generic_array_copy_ctr, lh, temp);
2401 
2402     // In principle, the int arguments could be dirty.
2403 
2404     //-----------------------------------------------------------------------
2405     // Assembler stubs will be used for this call to arraycopy
2406     // if the following conditions are met:
2407     //
2408     // (1) src and dst must not be null.
2409     // (2) src_pos must not be negative.
2410     // (3) dst_pos must not be negative.
2411     // (4) length  must not be negative.
2412     // (5) src klass and dst klass should be the same and not NULL.
2413     // (6) src and dst should be arrays.
2414     // (7) src_pos + length must not exceed length of src.
2415     // (8) dst_pos + length must not exceed length of dst.
2416     BLOCK_COMMENT("arraycopy initial argument checks");
2417 
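         // The null and negative-argument checks below are evaluated into separate
         // condition register fields (CCR0, CCR1, CCR5) and merged with cror, so a
         // single branch on CCR1 covers all of them.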
2418     __ cmpdi(CCR1, src, 0);      // if (src == NULL) return -1;
2419     __ extsw_(src_pos, src_pos); // if (src_pos < 0) return -1;
2420     __ cmpdi(CCR5, dst, 0);      // if (dst == NULL) return -1;
2421     __ cror(CCR1, Assembler::equal, CCR0, Assembler::less);
2422     __ extsw_(dst_pos, dst_pos); // if (dst_pos < 0) return -1;
2423     __ cror(CCR5, Assembler::equal, CCR0, Assembler::less);
2424     __ extsw_(length, length);   // if (length < 0) return -1;
2425     __ cror(CCR1, Assembler::equal, CCR5, Assembler::equal);
2426     __ cror(CCR1, Assembler::equal, CCR0, Assembler::less);
2427     __ beq(CCR1, L_failed);
2428 
2429     BLOCK_COMMENT("arraycopy argument klass checks");
2430     __ load_klass(src_klass, src);
2431     __ load_klass(dst_klass, dst);
2432 
2433     // Load layout helper
2434     //
2435     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2436     // 32        30    24            16              8     2                 0
2437     //
2438     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2439     //
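         //   Illustrative decoding (sketch; mirrors the rldicl/andi used below):
         //     header_size       = (lh >> _lh_header_size_shift) & _lh_header_size_mask;
         //     log2_element_size =  lh & _lh_log2_element_size_mask;
         //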
2440 
2441     int lh_offset = in_bytes(Klass::layout_helper_offset());
2442 
2443     // Load the 32-bit layout helper value.
2444     __ lwz(lh, lh_offset, src_klass);
2445 
2446     // Handle objArrays completely differently...
2447     jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2448     __ load_const_optimized(temp, objArray_lh, R0);
2449     __ cmpw(CCR0, lh, temp);
2450     __ beq(CCR0, L_objArray);
2451 
2452     __ cmpd(CCR5, src_klass, dst_klass);          // if (src->klass() != dst->klass()) return -1;
2453     __ cmpwi(CCR6, lh, Klass::_lh_neutral_value); // if (!src->is_Array()) return -1;
2454 
2455     __ crnand(CCR5, Assembler::equal, CCR6, Assembler::less);
2456     __ beq(CCR5, L_failed);
2457 
2458     // At this point, it is known to be a typeArray (array_tag 0x3).
2459 #ifdef ASSERT
2460     { Label L;
2461       jint lh_prim_tag_in_place = (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2462       __ load_const_optimized(temp, lh_prim_tag_in_place, R0);
2463       __ cmpw(CCR0, lh, temp);
2464       __ bge(CCR0, L);
2465       __ stop("must be a primitive array");
2466       __ bind(L);
2467     }
2468 #endif
2469 
2470     arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
2471                            temp, dst_klass, L_failed);
2472 
2473     // TypeArrayKlass
2474     //
2475     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2476     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2477     //
2478 
2479     const Register offset = dst_klass;    // array offset
2480     const Register elsize = src_klass;    // log2 element size
2481 
2482     __ rldicl(offset, lh, 64 - Klass::_lh_header_size_shift, 64 - exact_log2(Klass::_lh_header_size_mask + 1));
2483     __ andi(elsize, lh, Klass::_lh_log2_element_size_mask);
2484     __ add(src, offset, src);       // src array offset
2485     __ add(dst, offset, dst);       // dst array offset
2486 
2487     // Next registers should be set before the jump to corresponding stub.
2488     const Register from     = R3_ARG1;  // source array address
2489     const Register to       = R4_ARG2;  // destination array address
2490     const Register count    = R5_ARG3;  // elements count
2491 
2492     // 'from', 'to', 'count' registers should be set in this order
2493     // since they are the same as 'src', 'src_pos', 'dst'.
2494 
2495     BLOCK_COMMENT("scale indexes to element size");
2496     __ sld(src_pos, src_pos, elsize);
2497     __ sld(dst_pos, dst_pos, elsize);
2498     __ add(from, src_pos, src);  // src_addr
2499     __ add(to, dst_pos, dst);    // dst_addr
2500     __ mr(count, length);        // length
2501 
2502     BLOCK_COMMENT("choose copy loop based on element size");
2503     // Using conditional branches with range 32kB.
2504     const int bo = Assembler::bcondCRbiIs1, bi = Assembler::bi0(CCR0, Assembler::equal);
2505     __ cmpwi(CCR0, elsize, 0);
2506     __ bc(bo, bi, entry_jbyte_arraycopy);
2507     __ cmpwi(CCR0, elsize, LogBytesPerShort);
2508     __ bc(bo, bi, entry_jshort_arraycopy);
2509     __ cmpwi(CCR0, elsize, LogBytesPerInt);
2510     __ bc(bo, bi, entry_jint_arraycopy);
2511 #ifdef ASSERT
2512     { Label L;
2513       __ cmpwi(CCR0, elsize, LogBytesPerLong);
2514       __ beq(CCR0, L);
2515       __ stop("must be long copy, but elsize is wrong");
2516       __ bind(L);
2517     }
2518 #endif
2519     __ b(entry_jlong_arraycopy);
2520 
2521     // ObjArrayKlass
2522   __ bind(L_objArray);
2523     // live at this point:  src_klass, dst_klass, src[_pos], dst[_pos], length
2524 
2525     Label L_disjoint_plain_copy, L_checkcast_copy;
2526     //  test array classes for subtyping
2527     __ cmpd(CCR0, src_klass, dst_klass);         // usual case is exact equality
2528     __ bne(CCR0, L_checkcast_copy);
2529 
2530     // Identically typed arrays can be copied without element-wise checks.
2531     arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
2532                            temp, lh, L_failed);
2533 
2534     __ addi(src, src, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //src offset
2535     __ addi(dst, dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //dst offset
2536     __ sldi(src_pos, src_pos, LogBytesPerHeapOop);
2537     __ sldi(dst_pos, dst_pos, LogBytesPerHeapOop);
2538     __ add(from, src_pos, src);  // src_addr
2539     __ add(to, dst_pos, dst);    // dst_addr
2540     __ mr(count, length);        // length
2541     __ b(entry_oop_arraycopy);
2542 
2543   __ bind(L_checkcast_copy);
2544     // live at this point:  src_klass, dst_klass
2545     {
2546       // Before looking at dst.length, make sure dst is also an objArray.
2547       __ lwz(temp, lh_offset, dst_klass);
2548       __ cmpw(CCR0, lh, temp);
2549       __ bne(CCR0, L_failed);
2550 
2551       // It is safe to examine both src.length and dst.length.
2552       arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
2553                              temp, lh, L_failed);
2554 
2555       // Marshal the base address arguments now, freeing registers.
2556       __ addi(src, src, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //src offset
2557       __ addi(dst, dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //dst offset
2558       __ sldi(src_pos, src_pos, LogBytesPerHeapOop);
2559       __ sldi(dst_pos, dst_pos, LogBytesPerHeapOop);
2560       __ add(from, src_pos, src);  // src_addr
2561       __ add(to, dst_pos, dst);    // dst_addr
2562       __ mr(count, length);        // length
2563 
2564       Register sco_temp = R6_ARG4;             // This register is free now.
2565       assert_different_registers(from, to, count, sco_temp,
2566                                  dst_klass, src_klass);
2567 
2568       // Generate the type check.
2569       int sco_offset = in_bytes(Klass::super_check_offset_offset());
2570       __ lwz(sco_temp, sco_offset, dst_klass);
2571       generate_type_check(src_klass, sco_temp, dst_klass,
2572                           temp, L_disjoint_plain_copy);
2573 
2574       // Fetch destination element klass from the ObjArrayKlass header.
2575       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2576 
2577       // The checkcast_copy loop needs two extra arguments:
2578       __ ld(R7_ARG5, ek_offset, dst_klass);   // dest elem klass
2579       __ lwz(R6_ARG4, sco_offset, R7_ARG5);   // sco of elem klass
2580       __ b(entry_checkcast_arraycopy);
2581     }
2582 
2583     __ bind(L_disjoint_plain_copy);
2584     __ b(entry_disjoint_oop_arraycopy);
2585 
2586   __ bind(L_failed);
2587     __ li(R3_RET, -1); // return -1
2588     __ blr();
2589     return start;
2590   }
2591 
2592   // Arguments for generated stub:
2593   //   R3_ARG1   - source byte array address
2594   //   R4_ARG2   - destination byte array address
2595   //   R5_ARG3   - round key array
2596   address generate_aescrypt_encryptBlock() {
2597     assert(UseAES, "need AES instruction support");
2598     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2599 
2600     address start = __ function_entry();
2601 
2602     Label L_doLast, L_error;
2603 
2604     Register from           = R3_ARG1;  // source array address
2605     Register to             = R4_ARG2;  // destination array address
2606     Register key            = R5_ARG3;  // round key array
2607 
2608     Register keylen         = R8;
2609     Register temp           = R9;
2610     Register keypos         = R10;
2611     Register fifteen        = R12;
2612 
2613     VectorRegister vRet     = VR0;
2614 
2615     VectorRegister vKey1    = VR1;
2616     VectorRegister vKey2    = VR2;
2617     VectorRegister vKey3    = VR3;
2618     VectorRegister vKey4    = VR4;
2619 
2620     VectorRegister fromPerm = VR5;
2621     VectorRegister keyPerm  = VR6;
2622     VectorRegister toPerm   = VR7;
2623     VectorRegister fSplt    = VR8;
2624 
2625     VectorRegister vTmp1    = VR9;
2626     VectorRegister vTmp2    = VR10;
2627     VectorRegister vTmp3    = VR11;
2628     VectorRegister vTmp4    = VR12;
2629 
2630     __ li              (fifteen, 15);
2631 
2632     // load unaligned from[0-15] to vRet
2633     __ lvx             (vRet, from);
2634     __ lvx             (vTmp1, fifteen, from);
2635     __ lvsl            (fromPerm, from);
2636 #ifdef VM_LITTLE_ENDIAN
2637     __ vspltisb        (fSplt, 0x0f);
2638     __ vxor            (fromPerm, fromPerm, fSplt);
2639 #endif
2640     __ vperm           (vRet, vRet, vTmp1, fromPerm);
2641 
2642     // load keylen (44 or 52 or 60)
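         // (44/52/60 ints correspond to the AES-128/192/256 expanded key sizes,
         // i.e. 4*(rounds+1) words for 10/12/14 rounds.)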
2643     __ lwz             (keylen, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT), key);
2644 
2645     // Compute the permute vector used to align the round key loads.
2646     __ load_perm       (keyPerm, key);
2647 #ifdef VM_LITTLE_ENDIAN
2648     __ vspltisb        (vTmp2, -16);
2649     __ vrld            (keyPerm, keyPerm, vTmp2);
2650     __ vrld            (keyPerm, keyPerm, vTmp2);
2651     __ vsldoi          (keyPerm, keyPerm, keyPerm, 8);
2652 #endif
2653 
2654     // load the 1st round key to vTmp1
2655     __ lvx             (vTmp1, key);
2656     __ li              (keypos, 16);
2657     __ lvx             (vKey1, keypos, key);
2658     __ vec_perm        (vTmp1, vKey1, keyPerm);
2659 
2660     // 1st round
2661     __ vxor            (vRet, vRet, vTmp1);
2662 
2663     // load the 2nd round key to vKey1
2664     __ li              (keypos, 32);
2665     __ lvx             (vKey2, keypos, key);
2666     __ vec_perm        (vKey1, vKey2, keyPerm);
2667 
2668     // load the 3rd round key to vKey2
2669     __ li              (keypos, 48);
2670     __ lvx             (vKey3, keypos, key);
2671     __ vec_perm        (vKey2, vKey3, keyPerm);
2672 
2673     // load the 4th round key to vKey3
2674     __ li              (keypos, 64);
2675     __ lvx             (vKey4, keypos, key);
2676     __ vec_perm        (vKey3, vKey4, keyPerm);
2677 
2678     // load the 5th round key to vKey4
2679     __ li              (keypos, 80);
2680     __ lvx             (vTmp1, keypos, key);
2681     __ vec_perm        (vKey4, vTmp1, keyPerm);
2682 
2683     // 2nd - 5th rounds
2684     __ vcipher         (vRet, vRet, vKey1);
2685     __ vcipher         (vRet, vRet, vKey2);
2686     __ vcipher         (vRet, vRet, vKey3);
2687     __ vcipher         (vRet, vRet, vKey4);
2688 
2689     // load the 6th round key to vKey1
2690     __ li              (keypos, 96);
2691     __ lvx             (vKey2, keypos, key);
2692     __ vec_perm        (vKey1, vTmp1, vKey2, keyPerm);
2693 
2694     // load the 7th round key to vKey2
2695     __ li              (keypos, 112);
2696     __ lvx             (vKey3, keypos, key);
2697     __ vec_perm        (vKey2, vKey3, keyPerm);
2698 
2699     // load the 8th round key to vKey3
2700     __ li              (keypos, 128);
2701     __ lvx             (vKey4, keypos, key);
2702     __ vec_perm        (vKey3, vKey4, keyPerm);
2703 
2704     // load the 9th round key to vKey4
2705     __ li              (keypos, 144);
2706     __ lvx             (vTmp1, keypos, key);
2707     __ vec_perm        (vKey4, vTmp1, keyPerm);
2708 
2709     // 6th - 9th rounds
2710     __ vcipher         (vRet, vRet, vKey1);
2711     __ vcipher         (vRet, vRet, vKey2);
2712     __ vcipher         (vRet, vRet, vKey3);
2713     __ vcipher         (vRet, vRet, vKey4);
2714 
2715     // load the 10th round key to vKey1
2716     __ li              (keypos, 160);
2717     __ lvx             (vKey2, keypos, key);
2718     __ vec_perm        (vKey1, vTmp1, vKey2, keyPerm);
2719 
2720     // load the 11th round key to vKey2
2721     __ li              (keypos, 176);
2722     __ lvx             (vTmp1, keypos, key);
2723     __ vec_perm        (vKey2, vTmp1, keyPerm);
2724 
2725     // if all round keys are loaded, skip next 4 rounds
2726     __ cmpwi           (CCR0, keylen, 44);
2727     __ beq             (CCR0, L_doLast);
2728 
2729     // 10th - 11th rounds
2730     __ vcipher         (vRet, vRet, vKey1);
2731     __ vcipher         (vRet, vRet, vKey2);
2732 
2733     // load the 12th round key to vKey1
2734     __ li              (keypos, 192);
2735     __ lvx             (vKey2, keypos, key);
2736     __ vec_perm        (vKey1, vTmp1, vKey2, keyPerm);
2737 
2738     // load the 13th round key to vKey2
2739     __ li              (keypos, 208);
2740     __ lvx             (vTmp1, keypos, key);
2741     __ vec_perm        (vKey2, vTmp1, keyPerm);
2742 
2743     // if all round keys are loaded, skip next 2 rounds
2744     __ cmpwi           (CCR0, keylen, 52);
2745     __ beq             (CCR0, L_doLast);
2746 
2747 #ifdef ASSERT
2748     __ cmpwi           (CCR0, keylen, 60);
2749     __ bne             (CCR0, L_error);
2750 #endif
2751 
2752     // 12th - 13th rounds
2753     __ vcipher         (vRet, vRet, vKey1);
2754     __ vcipher         (vRet, vRet, vKey2);
2755 
2756     // load the 14th round key to vKey1
2757     __ li              (keypos, 224);
2758     __ lvx             (vKey2, keypos, key);
2759     __ vec_perm        (vKey1, vTmp1, vKey2, keyPerm);
2760 
2761     // load the 15th round key to vKey2
2762     __ li              (keypos, 240);
2763     __ lvx             (vTmp1, keypos, key);
2764     __ vec_perm        (vKey2, vTmp1, keyPerm);
2765 
2766     __ bind(L_doLast);
2767 
2768     // last two rounds
2769     __ vcipher         (vRet, vRet, vKey1);
2770     __ vcipherlast     (vRet, vRet, vKey2);
2771 
2772 #ifdef VM_LITTLE_ENDIAN
2773     // toPerm = 0x0F0E0D0C0B0A09080706050403020100
2774     __ lvsl            (toPerm, keypos); // keypos is a multiple of 16
2775     __ vxor            (toPerm, toPerm, fSplt);
2776 
2777     // Swap Bytes
2778     __ vperm           (vRet, vRet, vRet, toPerm);
2779 #endif
2780 
2781     // store result (unaligned)
2782     // Note: we can't use a read-modify-write sequence that would touch additional bytes.
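         // Instead, move the 128-bit result into two GPRs and store it with two
         // 8-byte std instructions, which write exactly the 16 destination bytes.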
2783     Register lo = temp, hi = fifteen; // Reuse
2784     __ vsldoi          (vTmp1, vRet, vRet, 8);
2785     __ mfvrd           (hi, vRet);
2786     __ mfvrd           (lo, vTmp1);
2787     __ std             (hi, 0 LITTLE_ENDIAN_ONLY(+ 8), to);
2788     __ std             (lo, 0 BIG_ENDIAN_ONLY(+ 8), to);
2789 
2790     __ blr();
2791 
2792 #ifdef ASSERT
2793     __ bind(L_error);
2794     __ stop("aescrypt_encryptBlock: invalid key length");
2795 #endif
2796      return start;
2797   }
2798 
2799   // Arguments for generated stub:
2800   //   R3_ARG1   - source byte array address
2801   //   R4_ARG2   - destination byte array address
2802   //   R5_ARG3   - K (key) in little endian int array
2803   address generate_aescrypt_decryptBlock() {
2804     assert(UseAES, "need AES instruction support");
2805     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2806 
2807     address start = __ function_entry();
2808 
2809     Label L_doLast, L_do44, L_do52, L_error;
2810 
2811     Register from           = R3_ARG1;  // source array address
2812     Register to             = R4_ARG2;  // destination array address
2813     Register key            = R5_ARG3;  // round key array
2814 
2815     Register keylen         = R8;
2816     Register temp           = R9;
2817     Register keypos         = R10;
2818     Register fifteen        = R12;
2819 
2820     VectorRegister vRet     = VR0;
2821 
2822     VectorRegister vKey1    = VR1;
2823     VectorRegister vKey2    = VR2;
2824     VectorRegister vKey3    = VR3;
2825     VectorRegister vKey4    = VR4;
2826     VectorRegister vKey5    = VR5;
2827 
2828     VectorRegister fromPerm = VR6;
2829     VectorRegister keyPerm  = VR7;
2830     VectorRegister toPerm   = VR8;
2831     VectorRegister fSplt    = VR9;
2832 
2833     VectorRegister vTmp1    = VR10;
2834     VectorRegister vTmp2    = VR11;
2835     VectorRegister vTmp3    = VR12;
2836     VectorRegister vTmp4    = VR13;
2837 
2838     __ li              (fifteen, 15);
2839 
2840     // load unaligned from[0-15] to vRet
2841     __ lvx             (vRet, from);
2842     __ lvx             (vTmp1, fifteen, from);
2843     __ lvsl            (fromPerm, from);
2844 #ifdef VM_LITTLE_ENDIAN
2845     __ vspltisb        (fSplt, 0x0f);
2846     __ vxor            (fromPerm, fromPerm, fSplt);
2847 #endif
2848     __ vperm           (vRet, vRet, vTmp1, fromPerm); // align [and byte swap in LE]
2849 
2850     // load keylen (44 or 52 or 60)
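         // (44/52/60 ints correspond to the AES-128/192/256 expanded key sizes,
         // i.e. 4*(rounds+1) words for 10/12/14 rounds.)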
2851     __ lwz             (keylen, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT), key);
2852 
2853     // Compute the permute vector used to align the round key loads.
2854     __ load_perm       (keyPerm, key);
2855 #ifdef VM_LITTLE_ENDIAN
2856     __ vxor            (vTmp2, vTmp2, vTmp2);
2857     __ vspltisb        (vTmp2, -16);
2858     __ vrld            (keyPerm, keyPerm, vTmp2);
2859     __ vrld            (keyPerm, keyPerm, vTmp2);
2860     __ vsldoi          (keyPerm, keyPerm, keyPerm, 8);
2861 #endif
2862 
2863     __ cmpwi           (CCR0, keylen, 44);
2864     __ beq             (CCR0, L_do44);
2865 
2866     __ cmpwi           (CCR0, keylen, 52);
2867     __ beq             (CCR0, L_do52);
2868 
2869 #ifdef ASSERT
2870     __ cmpwi           (CCR0, keylen, 60);
2871     __ bne             (CCR0, L_error);
2872 #endif
2873 
2874     // load the 15th round key to vKey1
2875     __ li              (keypos, 240);
2876     __ lvx             (vKey1, keypos, key);
2877     __ li              (keypos, 224);
2878     __ lvx             (vKey2, keypos, key);
2879     __ vec_perm        (vKey1, vKey2, vKey1, keyPerm);
2880 
2881     // load the 14th round key to vKey2
2882     __ li              (keypos, 208);
2883     __ lvx             (vKey3, keypos, key);
2884     __ vec_perm        (vKey2, vKey3, vKey2, keyPerm);
2885 
2886     // load the 13th round key to vKey3
2887     __ li              (keypos, 192);
2888     __ lvx             (vKey4, keypos, key);
2889     __ vec_perm        (vKey3, vKey4, vKey3, keyPerm);
2890 
2891     // load the 12th round key to vKey4
2892     __ li              (keypos, 176);
2893     __ lvx             (vKey5, keypos, key);
2894     __ vec_perm        (vKey4, vKey5, vKey4, keyPerm);
2895 
2896     // load the 11th round key to vKey5
2897     __ li              (keypos, 160);
2898     __ lvx             (vTmp1, keypos, key);
2899     __ vec_perm        (vKey5, vTmp1, vKey5, keyPerm);
2900 
2901     // 1st - 5th rounds
2902     __ vxor            (vRet, vRet, vKey1);
2903     __ vncipher        (vRet, vRet, vKey2);
2904     __ vncipher        (vRet, vRet, vKey3);
2905     __ vncipher        (vRet, vRet, vKey4);
2906     __ vncipher        (vRet, vRet, vKey5);
2907 
2908     __ b               (L_doLast);
2909 
2910     __ align(32);
2911     __ bind            (L_do52);
2912 
2913     // load the 13th round key to vKey1
2914     __ li              (keypos, 208);
2915     __ lvx             (vKey1, keypos, key);
2916     __ li              (keypos, 192);
2917     __ lvx             (vKey2, keypos, key);
2918     __ vec_perm        (vKey1, vKey2, vKey1, keyPerm);
2919 
2920     // load the 12th round key to vKey2
2921     __ li              (keypos, 176);
2922     __ lvx             (vKey3, keypos, key);
2923     __ vec_perm        (vKey2, vKey3, vKey2, keyPerm);
2924 
2925     // load the 11th round key to vKey3
2926     __ li              (keypos, 160);
2927     __ lvx             (vTmp1, keypos, key);
2928     __ vec_perm        (vKey3, vTmp1, vKey3, keyPerm);
2929 
2930     // 1st - 3rd rounds
2931     __ vxor            (vRet, vRet, vKey1);
2932     __ vncipher        (vRet, vRet, vKey2);
2933     __ vncipher        (vRet, vRet, vKey3);
2934 
2935     __ b               (L_doLast);
2936 
2937     __ align(32);
2938     __ bind            (L_do44);
2939 
2940     // load the 11th round key to vKey1
2941     __ li              (keypos, 176);
2942     __ lvx             (vKey1, keypos, key);
2943     __ li              (keypos, 160);
2944     __ lvx             (vTmp1, keypos, key);
2945     __ vec_perm        (vKey1, vTmp1, vKey1, keyPerm);
2946 
2947     // 1st round
2948     __ vxor            (vRet, vRet, vKey1);
2949 
2950     __ bind            (L_doLast);
2951 
2952     // load the 10th round key to vKey1
2953     __ li              (keypos, 144);
2954     __ lvx             (vKey2, keypos, key);
2955     __ vec_perm        (vKey1, vKey2, vTmp1, keyPerm);
2956 
2957     // load the 9th round key to vKey2
2958     __ li              (keypos, 128);
2959     __ lvx             (vKey3, keypos, key);
2960     __ vec_perm        (vKey2, vKey3, vKey2, keyPerm);
2961 
2962     // load the 8th round key to vKey3
2963     __ li              (keypos, 112);
2964     __ lvx             (vKey4, keypos, key);
2965     __ vec_perm        (vKey3, vKey4, vKey3, keyPerm);
2966 
2967     // load the 7th round key to vKey4
2968     __ li              (keypos, 96);
2969     __ lvx             (vKey5, keypos, key);
2970     __ vec_perm        (vKey4, vKey5, vKey4, keyPerm);
2971 
2972     // load the 6th round key to vKey5
2973     __ li              (keypos, 80);
2974     __ lvx             (vTmp1, keypos, key);
2975     __ vec_perm        (vKey5, vTmp1, vKey5, keyPerm);
2976 
2977     // last 10th - 6th rounds
2978     __ vncipher        (vRet, vRet, vKey1);
2979     __ vncipher        (vRet, vRet, vKey2);
2980     __ vncipher        (vRet, vRet, vKey3);
2981     __ vncipher        (vRet, vRet, vKey4);
2982     __ vncipher        (vRet, vRet, vKey5);
2983 
2984     // load the 5th round key to vKey1
2985     __ li              (keypos, 64);
2986     __ lvx             (vKey2, keypos, key);
2987     __ vec_perm        (vKey1, vKey2, vTmp1, keyPerm);
2988 
2989     // load the 4th round key to vKey2
2990     __ li              (keypos, 48);
2991     __ lvx             (vKey3, keypos, key);
2992     __ vec_perm        (vKey2, vKey3, vKey2, keyPerm);
2993 
2994     // load the 3rd round key to vKey3
2995     __ li              (keypos, 32);
2996     __ lvx             (vKey4, keypos, key);
2997     __ vec_perm        (vKey3, vKey4, vKey3, keyPerm);
2998 
2999     // load the 2nd round key to vKey4
3000     __ li              (keypos, 16);
3001     __ lvx             (vKey5, keypos, key);
3002     __ vec_perm        (vKey4, vKey5, vKey4, keyPerm);
3003 
3004     // load the 1st round key to vKey5
3005     __ lvx             (vTmp1, key);
3006     __ vec_perm        (vKey5, vTmp1, vKey5, keyPerm);
3007 
3008     // last 5th - 1st rounds
3009     __ vncipher        (vRet, vRet, vKey1);
3010     __ vncipher        (vRet, vRet, vKey2);
3011     __ vncipher        (vRet, vRet, vKey3);
3012     __ vncipher        (vRet, vRet, vKey4);
3013     __ vncipherlast    (vRet, vRet, vKey5);
3014 
3015 #ifdef VM_LITTLE_ENDIAN
3016     // toPerm = 0x0F0E0D0C0B0A09080706050403020100
3017     __ lvsl            (toPerm, keypos); // keypos is a multiple of 16
3018     __ vxor            (toPerm, toPerm, fSplt);
3019 
3020     // Swap Bytes
3021     __ vperm           (vRet, vRet, vRet, toPerm);
3022 #endif
3023 
3024     // store result (unaligned)
3025     // Note: We can't use a read-modify-write sequence which touches additional bytes.
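    // Instead, move the two 64-bit halves of vRet into GPRs (mfvrd) and store
    // them with two std instructions, writing exactly 16 bytes.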
3026     Register lo = temp, hi = fifteen; // Reuse
3027     __ vsldoi          (vTmp1, vRet, vRet, 8);
3028     __ mfvrd           (hi, vRet);
3029     __ mfvrd           (lo, vTmp1);
3030     __ std             (hi, 0 LITTLE_ENDIAN_ONLY(+ 8), to);
3031     __ std             (lo, 0 BIG_ENDIAN_ONLY(+ 8), to);
3032 
3033     __ blr();
3034 
3035 #ifdef ASSERT
3036     __ bind(L_error);
3037     __ stop("aescrypt_decryptBlock: invalid key length");
3038 #endif
3039      return start;
3040   }
3041 
3042   address generate_sha256_implCompress(bool multi_block, const char *name) {
3043     assert(UseSHA, "need SHA instructions");
3044     StubCodeMark mark(this, "StubRoutines", name);
3045     address start = __ function_entry();
3046 
3047     __ sha256 (multi_block);
3048     __ blr();
3049 
3050     return start;
3051   }
3052 
3053   address generate_sha512_implCompress(bool multi_block, const char *name) {
3054     assert(UseSHA, "need SHA instructions");
3055     StubCodeMark mark(this, "StubRoutines", name);
3056     address start = __ function_entry();
3057 
3058     __ sha512 (multi_block);
3059     __ blr();
3060 
3061     return start;
3062   }
3063 
3064   address generate_data_cache_writeback() {
3065     const Register cacheline = R3_ARG1;
3066     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback");
3067     address start = __ pc();
3068 
3069     __ cache_wb(Address(cacheline));
3070     __ blr();
3071 
3072     return start;
3073   }
3074 
3075   address generate_data_cache_writeback_sync() {
3076     const Register is_presync = R3_ARG1;
3077     Register temp = R4;
3078     Label SKIP;
3079 
3080     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync");
3081     address start = __ pc();
3082 
3083     __ andi_(temp, is_presync, 1);
3084     __ bne(CCR0, SKIP);
3085     __ cache_wbsync(false); // post sync => emit 'sync'
3086     __ bind(SKIP);          // pre sync => emit nothing
3087     __ blr();
3088 
3089     return start;
3090   }
3091 
3092   void generate_arraycopy_stubs() {
3093     // Note: the disjoint stubs must be generated first, some of
3094     // the conjoint stubs use them.
3095 
3096     address ucm_common_error_exit       =  generate_unsafecopy_common_error_exit();
3097     UnsafeCopyMemory::set_common_exit_stub_pc(ucm_common_error_exit);
3098 
3099     // non-aligned disjoint versions
3100     StubRoutines::_jbyte_disjoint_arraycopy       = generate_disjoint_byte_copy(false, "jbyte_disjoint_arraycopy");
3101     StubRoutines::_jshort_disjoint_arraycopy      = generate_disjoint_short_copy(false, "jshort_disjoint_arraycopy");
3102     StubRoutines::_jint_disjoint_arraycopy        = generate_disjoint_int_copy(false, "jint_disjoint_arraycopy");
3103     StubRoutines::_jlong_disjoint_arraycopy       = generate_disjoint_long_copy(false, "jlong_disjoint_arraycopy");
3104     StubRoutines::_oop_disjoint_arraycopy         = generate_disjoint_oop_copy(false, "oop_disjoint_arraycopy", false);
3105     StubRoutines::_oop_disjoint_arraycopy_uninit  = generate_disjoint_oop_copy(false, "oop_disjoint_arraycopy_uninit", true);
3106 
3107     // aligned disjoint versions
3108     StubRoutines::_arrayof_jbyte_disjoint_arraycopy      = generate_disjoint_byte_copy(true, "arrayof_jbyte_disjoint_arraycopy");
3109     StubRoutines::_arrayof_jshort_disjoint_arraycopy     = generate_disjoint_short_copy(true, "arrayof_jshort_disjoint_arraycopy");
3110     StubRoutines::_arrayof_jint_disjoint_arraycopy       = generate_disjoint_int_copy(true, "arrayof_jint_disjoint_arraycopy");
3111     StubRoutines::_arrayof_jlong_disjoint_arraycopy      = generate_disjoint_long_copy(true, "arrayof_jlong_disjoint_arraycopy");
3112     StubRoutines::_arrayof_oop_disjoint_arraycopy        = generate_disjoint_oop_copy(true, "arrayof_oop_disjoint_arraycopy", false);
3113     StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy(true, "arrayof_oop_disjoint_arraycopy_uninit", true);
3114 
3115     // non-aligned conjoint versions
3116     StubRoutines::_jbyte_arraycopy      = generate_conjoint_byte_copy(false, "jbyte_arraycopy");
3117     StubRoutines::_jshort_arraycopy     = generate_conjoint_short_copy(false, "jshort_arraycopy");
3118     StubRoutines::_jint_arraycopy       = generate_conjoint_int_copy(false, "jint_arraycopy");
3119     StubRoutines::_jlong_arraycopy      = generate_conjoint_long_copy(false, "jlong_arraycopy");
3120     StubRoutines::_oop_arraycopy        = generate_conjoint_oop_copy(false, "oop_arraycopy", false);
3121     StubRoutines::_oop_arraycopy_uninit = generate_conjoint_oop_copy(false, "oop_arraycopy_uninit", true);
3122 
3123     // aligned conjoint versions
3124     StubRoutines::_arrayof_jbyte_arraycopy      = generate_conjoint_byte_copy(true, "arrayof_jbyte_arraycopy");
3125     StubRoutines::_arrayof_jshort_arraycopy     = generate_conjoint_short_copy(true, "arrayof_jshort_arraycopy");
3126     StubRoutines::_arrayof_jint_arraycopy       = generate_conjoint_int_copy(true, "arrayof_jint_arraycopy");
3127     StubRoutines::_arrayof_jlong_arraycopy      = generate_conjoint_long_copy(true, "arrayof_jlong_arraycopy");
3128     StubRoutines::_arrayof_oop_arraycopy        = generate_conjoint_oop_copy(true, "arrayof_oop_arraycopy", false);
3129     StubRoutines::_arrayof_oop_arraycopy_uninit = generate_conjoint_oop_copy(true, "arrayof_oop_arraycopy_uninit", true);
3130 
3131     // special/generic versions
3132     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", false);
3133     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", true);
3134 
3135     StubRoutines::_unsafe_arraycopy  = generate_unsafe_copy("unsafe_arraycopy",
3136                                                             STUB_ENTRY(jbyte_arraycopy),
3137                                                             STUB_ENTRY(jshort_arraycopy),
3138                                                             STUB_ENTRY(jint_arraycopy),
3139                                                             STUB_ENTRY(jlong_arraycopy));
3140     StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy",
3141                                                              STUB_ENTRY(jbyte_arraycopy),
3142                                                              STUB_ENTRY(jshort_arraycopy),
3143                                                              STUB_ENTRY(jint_arraycopy),
3144                                                              STUB_ENTRY(oop_arraycopy),
3145                                                              STUB_ENTRY(oop_disjoint_arraycopy),
3146                                                              STUB_ENTRY(jlong_arraycopy),
3147                                                              STUB_ENTRY(checkcast_arraycopy));
3148 
3149     // fill routines
3150 #ifdef COMPILER2
3151     if (OptimizeFill) {
3152       StubRoutines::_jbyte_fill          = generate_fill(T_BYTE,  false, "jbyte_fill");
3153       StubRoutines::_jshort_fill         = generate_fill(T_SHORT, false, "jshort_fill");
3154       StubRoutines::_jint_fill           = generate_fill(T_INT,   false, "jint_fill");
3155       StubRoutines::_arrayof_jbyte_fill  = generate_fill(T_BYTE,  true, "arrayof_jbyte_fill");
3156       StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
3157       StubRoutines::_arrayof_jint_fill   = generate_fill(T_INT,   true, "arrayof_jint_fill");
3158     }
3159 #endif
3160   }
3161 
3162   // Safefetch stubs.
3163   void generate_safefetch(const char* name, int size, address* entry, address* fault_pc, address* continuation_pc) {
3164     // safefetch signatures:
3165     //   int      SafeFetch32(int*      adr, int      errValue);
3166     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3167     //
3168     // arguments:
3169     //   R3_ARG1 = adr
3170     //   R4_ARG2 = errValue
3171     //
3172     // result:
3173     //   R3_RET  = *adr or errValue
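    //
    // Illustrative (hedged) caller-side sketch of the contract:
    //
    //   int v = SafeFetch32((int*)adr, -1 /* errValue */);
    //
    // If the load at *fault_pc does not fault, v == *adr.  If it faults, the
    // VM's fault handler is expected to resume execution at *continuation_pc,
    // where errValue (still in R4_ARG2, since the faulting load never wrote
    // the register) is copied to R3_RET, so v == -1.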
3174 
3175     StubCodeMark mark(this, "StubRoutines", name);
3176 
3177     // Entry point, pc or function descriptor.
3178     *entry = __ function_entry();
3179 
3180     // Load *adr into R4_ARG2, may fault.
3181     *fault_pc = __ pc();
3182     switch (size) {
3183       case 4:
3184         // int32_t, sign extended
3185         __ lwa(R4_ARG2, 0, R3_ARG1);
3186         break;
3187       case 8:
3188         // int64_t
3189         __ ld(R4_ARG2, 0, R3_ARG1);
3190         break;
3191       default:
3192         ShouldNotReachHere();
3193     }
3194 
3195     // return errValue or *adr
3196     *continuation_pc = __ pc();
3197     __ mr(R3_RET, R4_ARG2);
3198     __ blr();
3199   }
3200 
3201   // Stub for BigInteger::multiplyToLen()
3202   //
3203   //  Arguments:
3204   //
3205   //  Input:
3206   //    R3 - x address
3207   //    R4 - x length
3208   //    R5 - y address
3209   //    R6 - y length
3210   //    R7 - z address
3211   //    R8 - z length
3212   //
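  //  Hedged note on the Java-level contract (BigInteger.multiplyToLen):
  //  x, y and z are int-array magnitudes with the most significant int
  //  first; z is assumed to provide room for xlen + ylen ints of product.
  //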
3213   address generate_multiplyToLen() {
3214 
3215     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
3216 
3217     address start = __ function_entry();
3218 
3219     const Register x     = R3;
3220     const Register xlen  = R4;
3221     const Register y     = R5;
3222     const Register ylen  = R6;
3223     const Register z     = R7;
3224     const Register zlen  = R8;
3225 
3226     const Register tmp1  = R2; // TOC not used.
3227     const Register tmp2  = R9;
3228     const Register tmp3  = R10;
3229     const Register tmp4  = R11;
3230     const Register tmp5  = R12;
3231 
3232     // non-volatile regs
3233     const Register tmp6  = R31;
3234     const Register tmp7  = R30;
3235     const Register tmp8  = R29;
3236     const Register tmp9  = R28;
3237     const Register tmp10 = R27;
3238     const Register tmp11 = R26;
3239     const Register tmp12 = R25;
3240     const Register tmp13 = R24;
3241 
3242     BLOCK_COMMENT("Entry:");
3243 
3244     // C2 does not respect int-to-long conversion for stub calls, so clear the upper 32 bits explicitly.
3245     __ clrldi(xlen, xlen, 32);
3246     __ clrldi(ylen, ylen, 32);
3247     __ clrldi(zlen, zlen, 32);
3248 
3249     // Save non-volatile regs (frameless).
3250     int current_offs = 8;
3251     __ std(R24, -current_offs, R1_SP); current_offs += 8;
3252     __ std(R25, -current_offs, R1_SP); current_offs += 8;
3253     __ std(R26, -current_offs, R1_SP); current_offs += 8;
3254     __ std(R27, -current_offs, R1_SP); current_offs += 8;
3255     __ std(R28, -current_offs, R1_SP); current_offs += 8;
3256     __ std(R29, -current_offs, R1_SP); current_offs += 8;
3257     __ std(R30, -current_offs, R1_SP); current_offs += 8;
3258     __ std(R31, -current_offs, R1_SP);
3259 
3260     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5,
3261                        tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13);
3262 
3263     // Restore non-volatile regs.
3264     current_offs = 8;
3265     __ ld(R24, -current_offs, R1_SP); current_offs += 8;
3266     __ ld(R25, -current_offs, R1_SP); current_offs += 8;
3267     __ ld(R26, -current_offs, R1_SP); current_offs += 8;
3268     __ ld(R27, -current_offs, R1_SP); current_offs += 8;
3269     __ ld(R28, -current_offs, R1_SP); current_offs += 8;
3270     __ ld(R29, -current_offs, R1_SP); current_offs += 8;
3271     __ ld(R30, -current_offs, R1_SP); current_offs += 8;
3272     __ ld(R31, -current_offs, R1_SP);
3273 
3274     __ blr();  // Return to caller.
3275 
3276     return start;
3277   }
3278 
3279   /**
3280   *  Arguments:
3281   *
3282   *  Input:
3283   *   R3_ARG1    - out address
3284   *   R4_ARG2    - in address
3285   *   R5_ARG3    - offset
3286   *   R6_ARG4    - len
3287   *   R7_ARG5    - k
3288   *  Output:
3289   *   R3_RET     - carry
3290   */
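  //
  // Hedged scalar sketch of what the muladd kernel computes (it mirrors
  // BigInteger.mulAdd); idx below is derived from the 'offset' argument:
  //
  //   unsigned long long carry = 0;
  //   for (int j = len - 1; j >= 0; j--) {
  //     unsigned long long p = (unsigned long long)in[j] * k + out[idx] + carry;
  //     out[idx--] = (unsigned int)p;
  //     carry      = p >> 32;
  //   }
  //   return (int)carry;   // delivered in R3_RET (via R10)
  //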
3291   address generate_mulAdd() {
3292     __ align(CodeEntryAlignment);
3293     StubCodeMark mark(this, "StubRoutines", "mulAdd");
3294 
3295     address start = __ function_entry();
3296 
3297     // C2 does not sign extend signed parameters to full 64-bit registers:
3298     __ rldic (R5_ARG3, R5_ARG3, 2, 32);  // zero-extend offset and convert it to a byte offset; always positive
3299     __ clrldi(R6_ARG4, R6_ARG4, 32);     // force zero bits on higher word
3300     __ clrldi(R7_ARG5, R7_ARG5, 32);     // force zero bits on higher word
3301 
3302     __ muladd(R3_ARG1, R4_ARG2, R5_ARG3, R6_ARG4, R7_ARG5, R8, R9, R10);
3303 
3304     // Moves output carry to return register
3305     __ mr    (R3_RET,  R10);
3306 
3307     __ blr();
3308 
3309     return start;
3310   }
3311 
3312   /**
3313   *  Arguments:
3314   *
3315   *  Input:
3316   *   R3_ARG1    - in address
3317   *   R4_ARG2    - in length
3318   *   R5_ARG3    - out address
3319   *   R6_ARG4    - out length
3320   */
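  //
  // The code below mirrors the classic BigInteger.squareToLen scheme in three
  // phases (see the block comments further down):
  //   1. store the square of each input int, shifted right by one bit,
  //   2. add the off-diagonal products via the muladd kernel,
  //   3. shift the result left by one bit and set the lowest bit from the input.
  //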
3321   address generate_squareToLen() {
3322     __ align(CodeEntryAlignment);
3323     StubCodeMark mark(this, "StubRoutines", "squareToLen");
3324 
3325     address start = __ function_entry();
3326 
3327     // args - the higher word is cleared (zero-extended) due to int-to-long casting
3328     const Register in        = R3_ARG1;
3329     const Register in_len    = R4_ARG2;
3330     __ clrldi(in_len, in_len, 32);
3331     const Register out       = R5_ARG3;
3332     const Register out_len   = R6_ARG4;
3333     __ clrldi(out_len, out_len, 32);
3334 
3335     // output
3336     const Register ret       = R3_RET;
3337 
3338     // temporaries
3339     const Register lplw_s    = R7;
3340     const Register in_aux    = R8;
3341     const Register out_aux   = R9;
3342     const Register piece     = R10;
3343     const Register product   = R14;
3344     const Register lplw      = R15;
3345     const Register i_minus1  = R16;
3346     const Register carry     = R17;
3347     const Register offset    = R18;
3348     const Register off_aux   = R19;
3349     const Register t         = R20;
3350     const Register mlen      = R21;
3351     const Register len       = R22;
3352     const Register a         = R23;
3353     const Register b         = R24;
3354     const Register i         = R25;
3355     const Register c         = R26;
3356     const Register cs        = R27;
3357 
3358     // Labels
3359     Label SKIP_LSHIFT, SKIP_DIAGONAL_SUM, SKIP_ADDONE, SKIP_LOOP_SQUARE;
3360     Label LOOP_LSHIFT, LOOP_DIAGONAL_SUM, LOOP_ADDONE, LOOP_SQUARE;
3361 
3362     // Save non-volatile regs (frameless).
3363     int current_offs = -8;
3364     __ std(R28, current_offs, R1_SP); current_offs -= 8;
3365     __ std(R27, current_offs, R1_SP); current_offs -= 8;
3366     __ std(R26, current_offs, R1_SP); current_offs -= 8;
3367     __ std(R25, current_offs, R1_SP); current_offs -= 8;
3368     __ std(R24, current_offs, R1_SP); current_offs -= 8;
3369     __ std(R23, current_offs, R1_SP); current_offs -= 8;
3370     __ std(R22, current_offs, R1_SP); current_offs -= 8;
3371     __ std(R21, current_offs, R1_SP); current_offs -= 8;
3372     __ std(R20, current_offs, R1_SP); current_offs -= 8;
3373     __ std(R19, current_offs, R1_SP); current_offs -= 8;
3374     __ std(R18, current_offs, R1_SP); current_offs -= 8;
3375     __ std(R17, current_offs, R1_SP); current_offs -= 8;
3376     __ std(R16, current_offs, R1_SP); current_offs -= 8;
3377     __ std(R15, current_offs, R1_SP); current_offs -= 8;
3378     __ std(R14, current_offs, R1_SP);
3379 
3380     // Store the squares, right shifted one bit (i.e., divided by 2)
3381     __ subi   (out_aux,   out,       8);
3382     __ subi   (in_aux,    in,        4);
3383     __ cmpwi  (CCR0,      in_len,    0);
3384     // Initialize lplw outside of the loop
3385     __ xorr   (lplw,      lplw,      lplw);
3386     __ ble    (CCR0,      SKIP_LOOP_SQUARE);    // in_len <= 0
3387     __ mtctr  (in_len);
3388 
3389     __ bind(LOOP_SQUARE);
3390     __ lwzu   (piece,     4,         in_aux);
3391     __ mulld  (product,   piece,     piece);
3392     // shift left 63 bits and only keep the MSB
3393     __ rldic  (lplw_s,    lplw,      63, 0);
3394     __ mr     (lplw,      product);
3395     // shift right 1 bit without sign extension
3396     __ srdi   (product,   product,   1);
3397     // join them to the same register and store it
3398     __ orr    (product,   lplw_s,    product);
3399 #ifdef VM_LITTLE_ENDIAN
3400     // Swap low and high words for little endian
3401     __ rldicl (product,   product,   32, 0);
3402 #endif
3403     __ stdu   (product,   8,         out_aux);
3404     __ bdnz   (LOOP_SQUARE);
3405 
3406     __ bind(SKIP_LOOP_SQUARE);
3407 
3408     // Add in off-diagonal sums
3409     __ cmpwi  (CCR0,      in_len,    0);
3410     __ ble    (CCR0,      SKIP_DIAGONAL_SUM);
3411     // Avoid CTR usage here in order to use it at mulAdd
3412     __ subi   (i_minus1,  in_len,    1);
3413     __ li     (offset,    4);
3414 
3415     __ bind(LOOP_DIAGONAL_SUM);
3416 
3417     __ sldi   (off_aux,   out_len,   2);
3418     __ sub    (off_aux,   off_aux,   offset);
3419 
3420     __ mr     (len,       i_minus1);
3421     __ sldi   (mlen,      i_minus1,  2);
3422     __ lwzx   (t,         in,        mlen);
3423 
3424     __ muladd (out, in, off_aux, len, t, a, b, carry);
3425 
3426     // begin<addOne>
3427     // off_aux = out_len*4 - 4 - mlen - offset*4 - 4;
3428     __ addi   (mlen,      mlen,      4);
3429     __ sldi   (a,         out_len,   2);
3430     __ subi   (a,         a,         4);
3431     __ sub    (a,         a,         mlen);
3432     __ subi   (off_aux,   offset,    4);
3433     __ sub    (off_aux,   a,         off_aux);
3434 
3435     __ lwzx   (b,         off_aux,   out);
3436     __ add    (b,         b,         carry);
3437     __ stwx   (b,         off_aux,   out);
3438 
3439     // if (((uint64_t)s >> 32) != 0) {
3440     __ srdi_  (a,         b,         32);
3441     __ beq    (CCR0,      SKIP_ADDONE);
3442 
3443     // while (--mlen >= 0) {
3444     __ bind(LOOP_ADDONE);
3445     __ subi   (mlen,      mlen,      4);
3446     __ cmpwi  (CCR0,      mlen,      0);
3447     __ beq    (CCR0,      SKIP_ADDONE);
3448 
3449     // if (--offset_aux < 0) { // Carry out of number
3450     __ subi   (off_aux,   off_aux,   4);
3451     __ cmpwi  (CCR0,      off_aux,   0);
3452     __ blt    (CCR0,      SKIP_ADDONE);
3453 
3454     // } else {
3455     __ lwzx   (b,         off_aux,   out);
3456     __ addi   (b,         b,         1);
3457     __ stwx   (b,         off_aux,   out);
3458     __ cmpwi  (CCR0,      b,         0);
3459     __ bne    (CCR0,      SKIP_ADDONE);
3460     __ b      (LOOP_ADDONE);
3461 
3462     __ bind(SKIP_ADDONE);
3463     // } } } end<addOne>
3464 
3465     __ addi   (offset,    offset,    8);
3466     __ subi   (i_minus1,  i_minus1,  1);
3467     __ cmpwi  (CCR0,      i_minus1,  0);
3468     __ bge    (CCR0,      LOOP_DIAGONAL_SUM);
3469 
3470     __ bind(SKIP_DIAGONAL_SUM);
3471 
3472     // Shift back up and set low bit
3473     // Shifts 1 bit left up to len positions. Assumes no leading zeros
3474     // begin<primitiveLeftShift>
3475     __ cmpwi  (CCR0,      out_len,   0);
3476     __ ble    (CCR0,      SKIP_LSHIFT);
3477     __ li     (i,         0);
3478     __ lwz    (c,         0,         out);
3479     __ subi   (b,         out_len,   1);
3480     __ mtctr  (b);
3481 
3482     __ bind(LOOP_LSHIFT);
3483     __ mr     (b,         c);
3484     __ addi   (cs,        i,         4);
3485     __ lwzx   (c,         out,       cs);
3486 
3487     __ sldi   (b,         b,         1);
3488     __ srwi   (cs,        c,         31);
3489     __ orr    (b,         b,         cs);
3490     __ stwx   (b,         i,         out);
3491 
3492     __ addi   (i,         i,         4);
3493     __ bdnz   (LOOP_LSHIFT);
3494 
3495     __ sldi   (c,         out_len,   2);
3496     __ subi   (c,         c,         4);
3497     __ lwzx   (b,         out,       c);
3498     __ sldi   (b,         b,         1);
3499     __ stwx   (b,         out,       c);
3500 
3501     __ bind(SKIP_LSHIFT);
3502     // end<primitiveLeftShift>
3503 
3504     // Set low bit
3505     __ sldi   (i,         in_len,    2);
3506     __ subi   (i,         i,         4);
3507     __ lwzx   (i,         in,        i);
3508     __ sldi   (c,         out_len,   2);
3509     __ subi   (c,         c,         4);
3510     __ lwzx   (b,         out,       c);
3511 
3512     __ andi   (i,         i,         1);
3513     __ orr    (i,         b,         i);
3514 
3515     __ stwx   (i,         out,       c);
3516 
3517     // Restore non-volatile regs.
3518     current_offs = -8;
3519     __ ld(R28, current_offs, R1_SP); current_offs -= 8;
3520     __ ld(R27, current_offs, R1_SP); current_offs -= 8;
3521     __ ld(R26, current_offs, R1_SP); current_offs -= 8;
3522     __ ld(R25, current_offs, R1_SP); current_offs -= 8;
3523     __ ld(R24, current_offs, R1_SP); current_offs -= 8;
3524     __ ld(R23, current_offs, R1_SP); current_offs -= 8;
3525     __ ld(R22, current_offs, R1_SP); current_offs -= 8;
3526     __ ld(R21, current_offs, R1_SP); current_offs -= 8;
3527     __ ld(R20, current_offs, R1_SP); current_offs -= 8;
3528     __ ld(R19, current_offs, R1_SP); current_offs -= 8;
3529     __ ld(R18, current_offs, R1_SP); current_offs -= 8;
3530     __ ld(R17, current_offs, R1_SP); current_offs -= 8;
3531     __ ld(R16, current_offs, R1_SP); current_offs -= 8;
3532     __ ld(R15, current_offs, R1_SP); current_offs -= 8;
3533     __ ld(R14, current_offs, R1_SP);
3534 
3535     __ mr(ret, out);
3536     __ blr();
3537 
3538     return start;
3539   }
3540 
3541   /**
3542    * Arguments:
3543    *
3544    * Inputs:
3545    *   R3_ARG1    - int   crc
3546    *   R4_ARG2    - byte* buf
3547    *   R5_ARG3    - int   length (of buffer)
3548    *
3549    * scratch:
3550    *   R2, R6-R12
3551    *
3552    * Output:
3553    *   R3_RET     - int   crc result
3554    */
3555   // Compute the CRC32 (or CRC32C, depending on is_crc32c) function.
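  // Hedged note: the same kernel serves both CRC-32 (the zlib/IEEE 802.3
  // polynomial 0x04C11DB7) and CRC-32C (the Castagnoli polynomial 0x1EDC6F41);
  // is_crc32c selects which variant MacroAssembler::crc32 emits.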
3556   address generate_CRC32_updateBytes(bool is_crc32c) {
3557     __ align(CodeEntryAlignment);
3558     StubCodeMark mark(this, "StubRoutines", is_crc32c ? "CRC32C_updateBytes" : "CRC32_updateBytes");
3559     address start = __ function_entry();  // Remember stub start address (is rtn value).
3560     __ crc32(R3_ARG1, R4_ARG2, R5_ARG3, R2, R6, R7, R8, R9, R10, R11, R12, is_crc32c);
3561     __ blr();
3562     return start;
3563   }
3564 
3565   address generate_nmethod_entry_barrier() {
3566     __ align(CodeEntryAlignment);
3567     StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");
3568 
3569     address stub_address = __ pc();
3570 
3571     int nbytes_save = MacroAssembler::num_volatile_regs * BytesPerWord;
3572     __ save_volatile_gprs(R1_SP, -nbytes_save, true);
3573 
3574     // Link register points to instruction in prologue of the guarded nmethod.
3575     // As the stub requires one layer of indirection (argument is of type address* and not address),
3576     // passing the link register's value directly doesn't work.
3577     // Since we have to save the link register on the stack anyway, we calculate the corresponding stack address
3578     // and pass that one instead.
3579     __ add(R3_ARG1, _abi0(lr), R1_SP);
3580 
3581     __ save_LR_CR(R0);
3582     __ push_frame_reg_args(nbytes_save, R0);
3583 
3584     __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSetNMethod::nmethod_stub_entry_barrier));
3585     __ mr(R0, R3_RET);
3586 
3587     __ pop_frame();
3588     __ restore_LR_CR(R3_RET /* used as tmp register */);
3589     __ restore_volatile_gprs(R1_SP, -nbytes_save, true);
3590 
3591     __ cmpdi(CCR0, R0, 0);
3592 
3593     // Return to prologue if no deoptimization is required (beqlr)
3594     __ bclr(Assembler::bcondCRbiIs1, Assembler::bi0(CCR0, Assembler::equal), Assembler::bhintIsTaken);
3595 
3596     // Deoptimization required.
3597     // For actually handling the deoptimization, the 'wrong method stub' is invoked.
3598     __ load_const_optimized(R0, SharedRuntime::get_handle_wrong_method_stub());
3599     __ mtctr(R0);
3600 
3601     // Pop the frame built in the prologue.
3602     __ pop_frame();
3603 
3604     // Restore link register.  Required as the 'wrong method stub' needs the caller's frame
3605     // to properly deoptimize this method (e.g. by re-resolving the call site for compiled methods).
3606     // This method's prologue is aborted.
3607     __ restore_LR_CR(R0);
3608 
3609     __ bctr();
3610     return stub_address;
3611   }
3612 
3613 #ifdef VM_LITTLE_ENDIAN
3614 // The following Base64 decode intrinsic is based on an algorithm outlined
3615 // in here:
3616 // http://0x80.pl/notesen/2016-01-17-sse-base64-decoding.html
3617 // in the section titled "Vector lookup (pshufb with bitmask)"
3618 //
3619 // This implementation differs in the following ways:
3620 //  * Power AltiVec VMX and VSX instructions are used instead of Intel SSE
3621 //    instructions.  It turns out that some of the vector operations
3622 //    needed in the algorithm require fewer AltiVec instructions.
3623 //  * The algorithm in the above mentioned paper doesn't handle the
3624 //    Base64-URL variant in RFC 4648.  Adjustments to both the code and to two
3625 //    lookup tables are needed for this.
3626 //  * The "Pack" section of the code is a complete rewrite for Power because we
3627 //    can utilize better instructions for this step.
3628 //
3629 
3630 // Offsets per group of Base64 characters
3631 // Uppercase
3632 #define UC  (signed char)((-'A' + 0) & 0xff)
3633 // Lowercase
3634 #define LC  (signed char)((-'a' + 26) & 0xff)
3635 // Digits
3636 #define DIG (signed char)((-'0' + 52) & 0xff)
3637 // Plus sign (URL = 0)
3638 #define PLS (signed char)((-'+' + 62) & 0xff)
3639 // Hyphen (URL = 1)
3640 #define HYP (signed char)((-'-' + 62) & 0xff)
3641 // Slash (URL = 0)
3642 #define SLS (signed char)((-'/' + 63) & 0xff)
3643 // Underscore (URL = 1)
3644 #define US  (signed char)((-'_' + 63) & 0xff)
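// In the P9 lookup path below, decoding amounts to "character + offset" in
// modulo-256 byte arithmetic, e.g. 'A' + UC == 0, 'a' + LC == 26 and
// '0' + DIG == 52.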
3645 
3646 // For P10 (or later) only
3647 #define VALID_B64 0x80
3648 #define VB64(x) (VALID_B64 | x)
3649 
3650 #define VEC_ALIGN __attribute__ ((aligned(16)))
3651 
3652 #define BLK_OFFSETOF(x) (offsetof(constant_block, x))
3653 
3654 // In little-endian mode, the lxv instruction loads the element at EA into
3655 // element 15 of the vector register, EA+1 goes into element 14, and so
3656 // on.
3657 //
3658 // To make a look-up table easier to read, ARRAY_TO_LXV_ORDER reverses the
3659 // order of the elements in a vector initialization.
3660 #define ARRAY_TO_LXV_ORDER(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15) e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0
3661 
3662   //
3663   // Base64 decodeBlock intrinsic
3664   address generate_base64_decodeBlock() {
3665     __ align(CodeEntryAlignment);
3666     StubCodeMark mark(this, "StubRoutines", "base64_decodeBlock");
3667     address start   = __ function_entry();
3668 
3669     typedef struct {
3670       signed char offsetLUT_val[16];
3671       signed char offsetLUT_URL_val[16];
3672       unsigned char maskLUT_val[16];
3673       unsigned char maskLUT_URL_val[16];
3674       unsigned char bitposLUT_val[16];
3675       unsigned char table_32_47_val[16];
3676       unsigned char table_32_47_URL_val[16];
3677       unsigned char table_48_63_val[16];
3678       unsigned char table_64_79_val[16];
3679       unsigned char table_80_95_val[16];
3680       unsigned char table_80_95_URL_val[16];
3681       unsigned char table_96_111_val[16];
3682       unsigned char table_112_127_val[16];
3683       unsigned char pack_lshift_val[16];
3684       unsigned char pack_rshift_val[16];
3685       unsigned char pack_permute_val[16];
3686     } constant_block;
3687 
3688     static const constant_block VEC_ALIGN const_block = {
3689 
3690       .offsetLUT_val = {
3691         ARRAY_TO_LXV_ORDER(
3692         0,   0, PLS, DIG,  UC,  UC,  LC,  LC,
3693         0,   0,   0,   0,   0,   0,   0,   0 ) },
3694 
3695       .offsetLUT_URL_val = {
3696         ARRAY_TO_LXV_ORDER(
3697         0,   0, HYP, DIG,  UC,  UC,  LC,  LC,
3698         0,   0,   0,   0,   0,   0,   0,   0 ) },
3699 
3700       .maskLUT_val = {
3701         ARRAY_TO_LXV_ORDER(
3702         /* 0        */ (unsigned char)0b10101000,
3703         /* 1 .. 9   */ (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000,
3704                        (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000,
3705                        (unsigned char)0b11111000,
3706         /* 10       */ (unsigned char)0b11110000,
3707         /* 11       */ (unsigned char)0b01010100,
3708         /* 12 .. 14 */ (unsigned char)0b01010000, (unsigned char)0b01010000, (unsigned char)0b01010000,
3709         /* 15       */ (unsigned char)0b01010100 ) },
3710 
3711       .maskLUT_URL_val = {
3712         ARRAY_TO_LXV_ORDER(
3713         /* 0        */ (unsigned char)0b10101000,
3714         /* 1 .. 9   */ (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000,
3715                        (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000,
3716                        (unsigned char)0b11111000,
3717         /* 10       */ (unsigned char)0b11110000,
3718         /* 11 .. 12 */ (unsigned char)0b01010000, (unsigned char)0b01010000,
3719         /* 13       */ (unsigned char)0b01010100,
3720         /* 14       */ (unsigned char)0b01010000,
3721         /* 15       */ (unsigned char)0b01110000 ) },
3722 
3723       .bitposLUT_val = {
3724         ARRAY_TO_LXV_ORDER(
3725         0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, (unsigned char)0x80,
3726         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 ) },
3727 
3728       // In the following table_*_val constants, a 0 value means the
3729       // character is not in the Base64 character set
3730       .table_32_47_val = {
3731         ARRAY_TO_LXV_ORDER (
3732          /* space .. '*' = 0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* '+' = 62 */ VB64(62), /* ',' .. '.' = 0 */ 0, 0, 0, /* '/' = 63 */ VB64(63) ) },
3733 
3734       .table_32_47_URL_val = {
3735         ARRAY_TO_LXV_ORDER(
3736          /* space .. ',' = 0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* '-' = 62 */ VB64(62), /* '.' .. '/' */ 0, 0 ) },
3737 
3738       .table_48_63_val = {
3739         ARRAY_TO_LXV_ORDER(
3740          /* '0' .. '9' = 52 .. 61 */ VB64(52), VB64(53), VB64(54), VB64(55), VB64(56), VB64(57), VB64(58), VB64(59), VB64(60), VB64(61),
3741          /* ':' .. '?' = 0 */ 0, 0, 0, 0, 0, 0 ) },
3742 
3743       .table_64_79_val = {
3744         ARRAY_TO_LXV_ORDER(
3745          /* '@' = 0 */ 0, /* 'A' .. 'O' = 0 .. 14 */ VB64(0), VB64(1), VB64(2), VB64(3), VB64(4), VB64(5), VB64(6), VB64(7), VB64(8),
3746          VB64(9), VB64(10), VB64(11), VB64(12), VB64(13), VB64(14) ) },
3747 
3748       .table_80_95_val = {
3749         ARRAY_TO_LXV_ORDER(/* 'P' .. 'Z' = 15 .. 25 */ VB64(15), VB64(16), VB64(17), VB64(18), VB64(19), VB64(20), VB64(21), VB64(22),
3750         VB64(23), VB64(24), VB64(25), /* '[' .. '_' = 0 */ 0, 0, 0, 0, 0 ) },
3751 
3752       .table_80_95_URL_val = {
3753         ARRAY_TO_LXV_ORDER(/* 'P' .. 'Z' = 15 .. 25 */ VB64(15), VB64(16), VB64(17), VB64(18), VB64(19), VB64(20), VB64(21), VB64(22),
3754         VB64(23), VB64(24), VB64(25), /* '[' .. '^' = 0 */ 0, 0, 0, 0, /* '_' = 63 */ VB64(63) ) },
3755 
3756       .table_96_111_val = {
3757         ARRAY_TO_LXV_ORDER(/* '`' = 0 */ 0, /* 'a' .. 'o' = 26 .. 40 */ VB64(26), VB64(27), VB64(28), VB64(29), VB64(30), VB64(31),
3758         VB64(32), VB64(33), VB64(34), VB64(35), VB64(36), VB64(37), VB64(38), VB64(39), VB64(40) ) },
3759 
3760       .table_112_127_val = {
3761         ARRAY_TO_LXV_ORDER(/* 'p' .. 'z' = 41 .. 51 */ VB64(41), VB64(42), VB64(43), VB64(44), VB64(45), VB64(46), VB64(47), VB64(48),
3762         VB64(49), VB64(50), VB64(51), /* '{' .. DEL = 0 */ 0, 0, 0, 0, 0 ) },
3763 
3764       .pack_lshift_val = {
3765         ARRAY_TO_LXV_ORDER(
3766         0, 6, 4, 2, 0, 6, 4, 2, 0, 6, 4, 2, 0, 6, 4, 2 ) },
3767 
3768       .pack_rshift_val = {
3769         ARRAY_TO_LXV_ORDER(
3770         0, 2, 4, 0, 0, 2, 4, 0, 0, 2, 4, 0, 0, 2, 4, 0 ) },
3771 
3772       // The first 4 index values are "don't care" because
3773       // we only use the first 12 bytes of the vector,
3774       // which are decoded from 16 bytes of Base64 characters.
3775       .pack_permute_val = {
3776         ARRAY_TO_LXV_ORDER(
3777          0, 0, 0, 0,
3778          0,  1,  2,
3779          4,  5,  6,
3780          8,  9, 10,
3781         12, 13, 14 ) }
3782     };
3783 
3784     const unsigned block_size = 16;  // number of bytes to process in each pass through the loop
3785     const unsigned block_size_shift = 4;
3786 
3787     // According to the ELF V2 ABI, registers r3-r12 are volatile and available for use without save/restore
3788     Register s      = R3_ARG1; // source starting address of Base64 characters
3789     Register sp     = R4_ARG2; // source offset
3790     Register sl     = R5_ARG3; // source length = # of Base64 characters to be processed
3791     Register d      = R6_ARG4; // destination address
3792     Register dp     = R7_ARG5; // destination offset
3793     Register isURL  = R8_ARG6; // boolean, if non-zero indicates use of RFC 4648 base64url encoding
3794     Register isMIME = R9_ARG7; // boolean, if non-zero indicates use of RFC 2045 MIME encoding - not used
3795 
3796     // Local variables
3797     Register const_ptr     = R9;  // used for loading constants
3798     Register tmp_reg       = R10; // used for speeding up load_const_optimized()
3799 
3800     // Re-use R9 and R10 to avoid using non-volatile registers (requires save/restore)
3801     Register out           = R9;  // moving out (destination) pointer
3802     Register in            = R10; // moving in (source) pointer
3803 
3804     // Volatile VSRs are 0..13, 32..51 (VR0..VR13)
3805     // VR Constants
3806     VectorRegister  vec_0s                  = VR0;
3807     VectorRegister  vec_4s                  = VR1;
3808     VectorRegister  vec_8s                  = VR2;
3809     VectorRegister  vec_special_case_char   = VR3;
3810     VectorRegister  pack_rshift             = VR4;
3811     VectorRegister  pack_lshift             = VR5;
3812 
3813     // VSR Constants
3814     VectorSRegister offsetLUT               = VSR0;
3815     VectorSRegister maskLUT                 = VSR1;
3816     VectorSRegister bitposLUT               = VSR2;
3817     VectorSRegister vec_0xfs                = VSR3;
3818     VectorSRegister vec_special_case_offset = VSR4;
3819     VectorSRegister pack_permute            = VSR5;
3820 
3821     // P10 (or later) VSR lookup constants
3822     VectorSRegister table_32_47             = VSR0;
3823     VectorSRegister table_48_63             = VSR1;
3824     VectorSRegister table_64_79             = VSR2;
3825     VectorSRegister table_80_95             = VSR3;
3826     VectorSRegister table_96_111            = VSR4;
3827     VectorSRegister table_112_127           = VSR6;
3828 
3829     // Data read in and later converted
3830     VectorRegister  input                   = VR6;
3831     // Variable for testing Base64 validity
3832     VectorRegister  non_match               = VR10;
3833 
3834     // P9 VR Variables for lookup
3835     VectorRegister  higher_nibble           = VR7;
3836     VectorRegister  eq_special_case_char    = VR8;
3837     VectorRegister  offsets                 = VR9;
3838 
3839     // P9 VSR lookup variables
3840     VectorSRegister bit                     = VSR6;
3841     VectorSRegister lower_nibble            = VSR7;
3842     VectorSRegister M                       = VSR8;
3843 
3844     // P10 (or later) VSR lookup variables
3845     VectorSRegister  xlate_a                = VSR7;
3846     VectorSRegister  xlate_b                = VSR8;
3847 
3848     // Variables for pack
3849     // VR
3850     VectorRegister  l                       = VR7;  // reuse higher_nibble's register
3851     VectorRegister  r                       = VR8;  // reuse eq_special_case_char's register
3852     VectorRegister  gathered                = VR10; // reuse non_match's register
3853 
3854     Label not_URL, calculate_size, loop_start, loop_exit, return_zero;
3855 
3856     // The upper 32 bits of the non-pointer parameter registers are not
3857     // guaranteed to be zero, so mask off those upper bits.
3858     __ clrldi(sp, sp, 32);
3859     __ clrldi(sl, sl, 32);
3860 
3861     // Don't handle the last 4 characters of the source, because this
3862     // VSX-based algorithm doesn't handle padding characters.  Also the
3863     // vector code will always write 16 bytes of decoded data on each pass,
3864     // but only the first 12 of those 16 bytes are valid data (16 base64
3865     // characters become 12 bytes of binary data), so for this reason we
3866     // need to subtract an additional 8 bytes from the source length, in
3867     // order not to write past the end of the destination buffer.  The
3868     // result of this subtraction implies that a Java function in the
3869     // Base64 class will be used to process the last 12 characters.
3870     __ sub(sl, sl, sp);
3871     __ subi(sl, sl, 12);
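    // Worked example: with sl - sp == 28 characters, 28 - 12 == 16 and the
    // shift below gives CTR == 1, so one vector pass decodes 16 characters
    // into 12 bytes and the trailing 12 characters (including any padding)
    // are left to the Java fallback.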
3872 
3873     // Load CTR with the number of passes through the loop
3874     // = sl >> block_size_shift.  After the shift, if sl <= 0, there's too
3875     // little data to be processed by this intrinsic.
3876     __ srawi_(sl, sl, block_size_shift);
3877     __ ble(CCR0, return_zero);
3878     __ mtctr(sl);
3879 
3880     // Clear the upper 32 bits of the other two parameter registers.
3881     __ clrldi(isURL, isURL, 32);
3882     __ clrldi(dp, dp, 32);
3883 
3884     // Load constant vec registers that need to be loaded from memory
3885     __ load_const_optimized(const_ptr, (address)&const_block, tmp_reg);
3886     __ lxv(bitposLUT, BLK_OFFSETOF(bitposLUT_val), const_ptr);
3887     __ lxv(pack_rshift->to_vsr(), BLK_OFFSETOF(pack_rshift_val), const_ptr);
3888     __ lxv(pack_lshift->to_vsr(), BLK_OFFSETOF(pack_lshift_val), const_ptr);
3889     __ lxv(pack_permute, BLK_OFFSETOF(pack_permute_val), const_ptr);
3890 
3891     // Splat the constants that can use xxspltib
3892     __ xxspltib(vec_0s->to_vsr(), 0);
3893     __ xxspltib(vec_8s->to_vsr(), 8);
3894     if (PowerArchitecturePPC64 >= 10) {
3895       // Using VALID_B64 for the offsets effectively strips the upper bit
3896       // of each byte that was selected from the table.  Setting the upper
3897       // bit gives us a way to distinguish between the 6-bit value of 0
3898       // from an error code of 0, which will happen if the character is
3899       // outside the range of the lookup, or is an illegal Base64
3900       // character, such as %.
3901       __ xxspltib(offsets->to_vsr(), VALID_B64);
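      // For example, 'A' is translated by the lookup to VB64(0) == 0x80; the
      // later vaddubm with these 0x80 offsets turns that into 0x00, the 6-bit
      // value, whereas a byte that is not a Base64 character comes out of the
      // lookup as 0x00 and is caught by the vcmpequb_ check before the add.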
3902 
3903       __ lxv(table_48_63, BLK_OFFSETOF(table_48_63_val), const_ptr);
3904       __ lxv(table_64_79, BLK_OFFSETOF(table_64_79_val), const_ptr);
3905       __ lxv(table_80_95, BLK_OFFSETOF(table_80_95_val), const_ptr);
3906       __ lxv(table_96_111, BLK_OFFSETOF(table_96_111_val), const_ptr);
3907       __ lxv(table_112_127, BLK_OFFSETOF(table_112_127_val), const_ptr);
3908     } else {
3909       __ xxspltib(vec_4s->to_vsr(), 4);
3910       __ xxspltib(vec_0xfs, 0xf);
3911       __ lxv(bitposLUT, BLK_OFFSETOF(bitposLUT_val), const_ptr);
3912     }
3913 
3914     // The rest of the constants use different values depending on the
3915     // setting of isURL
3916     __ cmpwi(CCR0, isURL, 0);
3917     __ beq(CCR0, not_URL);
3918 
3919     // isURL != 0 (true)
3920     if (PowerArchitecturePPC64 >= 10) {
3921       __ lxv(table_32_47, BLK_OFFSETOF(table_32_47_URL_val), const_ptr);
3922       __ lxv(table_80_95, BLK_OFFSETOF(table_80_95_URL_val), const_ptr);
3923     } else {
3924       __ lxv(offsetLUT, BLK_OFFSETOF(offsetLUT_URL_val), const_ptr);
3925       __ lxv(maskLUT, BLK_OFFSETOF(maskLUT_URL_val), const_ptr);
3926       __ xxspltib(vec_special_case_char->to_vsr(), '_');
3927       __ xxspltib(vec_special_case_offset, (unsigned char)US);
3928     }
3929     __ b(calculate_size);
3930 
3931     // isURL = 0 (false)
3932     __ bind(not_URL);
3933     if (PowerArchitecturePPC64 >= 10) {
3934       __ lxv(table_32_47, BLK_OFFSETOF(table_32_47_val), const_ptr);
3935       __ lxv(table_80_95, BLK_OFFSETOF(table_80_95_val), const_ptr);
3936     } else {
3937       __ lxv(offsetLUT, BLK_OFFSETOF(offsetLUT_val), const_ptr);
3938       __ lxv(maskLUT, BLK_OFFSETOF(maskLUT_val), const_ptr);
3939       __ xxspltib(vec_special_case_char->to_vsr(), '/');
3940       __ xxspltib(vec_special_case_offset, (unsigned char)SLS);
3941     }
3942 
3943     __ bind(calculate_size);
3944 
3945     // out starts at d + dp
3946     __ add(out, d, dp);
3947 
3948     // in starts at s + sp
3949     __ add(in, s, sp);
3950 
3951     __ align(32);
3952     __ bind(loop_start);
3953     __ lxv(input->to_vsr(), 0, in); // offset=0
3954 
3955     //
3956     // Lookup
3957     //
3958     if (PowerArchitecturePPC64 >= 10) {
3959       // Use xxpermx to do a lookup of each Base64 character in the
3960       // input vector and translate it to a 6-bit value + 0x80.
3961       // Characters which are not valid Base64 characters will result
3962       // in a zero in the corresponding byte.
3963       //
3964       // Note that due to align(32) call above, the xxpermx instructions do
3965       // not require align_prefix() calls, since the final xxpermx
3966       // prefix+opcode is at byte 24.
3967       __ xxpermx(xlate_a, table_32_47, table_48_63, input->to_vsr(), 1);    // offset=4
3968       __ xxpermx(xlate_b, table_64_79, table_80_95, input->to_vsr(), 2);    // offset=12
3969       __ xxlor(xlate_b, xlate_a, xlate_b);                                  // offset=20
3970       __ xxpermx(xlate_a, table_96_111, table_112_127, input->to_vsr(), 3); // offset=24
3971       __ xxlor(input->to_vsr(), xlate_a, xlate_b);
3972       // Check for non-Base64 characters by comparing each byte to zero.
3973       __ vcmpequb_(non_match, input, vec_0s);
3974     } else {
3975       // Isolate the upper 4 bits of each character by shifting it right 4 bits
3976       __ vsrb(higher_nibble, input, vec_4s);
3977       // Isolate the lower 4 bits by masking
3978       __ xxland(lower_nibble, input->to_vsr(), vec_0xfs);
3979 
3980       // Get the offset (the value to subtract from the byte) by using
3981       // a lookup table indexed by the upper 4 bits of the character
3982       __ xxperm(offsets->to_vsr(), offsetLUT, higher_nibble->to_vsr());
3983 
3984       // Find out which elements are the special case character (isURL ? '/' : '-')
3985       __ vcmpequb(eq_special_case_char, input, vec_special_case_char);
3986 
3987       // For each character in the input which is a special case
3988       // character, replace its offset with one that is special for that
3989       // character.
3990       __ xxsel(offsets->to_vsr(), offsets->to_vsr(), vec_special_case_offset, eq_special_case_char->to_vsr());
3991 
3992       // Use the lower_nibble to select a mask "M" from the lookup table.
3993       __ xxperm(M, maskLUT, lower_nibble);
3994 
3995       // "bit" is used to isolate which of the bits in M is relevant.
3996       __ xxperm(bit, bitposLUT, higher_nibble->to_vsr());
3997 
3998       // Each element of non_match corresponds to one of the 16 input
3999       // characters.  Those elements that become 0x00 after the xxland
4000       // instruction are invalid Base64 characters.
4001       __ xxland(non_match->to_vsr(), M, bit);
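      // Worked example: for '%' (0x25) the lookups give M == maskLUT[5] ==
      // 0b11111000 and bit == bitposLUT[2] == 0x04, whose AND is zero, so '%'
      // is flagged as invalid; for 'A' (0x41), M == maskLUT[1] == 0b11111000
      // and bit == bitposLUT[4] == 0x10 yield a non-zero result.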
4002 
4003       // Compare each element to zero
4004       //
4005       __ vcmpequb_(non_match, non_match, vec_0s);
4006     }
4007     // vcmpequb_ sets the EQ bit of CCR6 if no elements compare equal.
4008     // Any element comparing equal to zero means there is an error in
4009     // that element.  Note that the comparison result register
4010     // non_match is not referenced again.  Only CCR6-EQ matters.
4011     __ bne_predict_not_taken(CCR6, loop_exit);
4012 
4013     // The Base64 characters had no errors, so add the offsets, which in
4014     // the case of Power10 is a constant vector of all 0x80's (see earlier
4015     // comment where the offsets register is loaded).
4016     __ vaddubm(input, input, offsets);
4017 
4018     // Pack
4019     //
4020     // In the tables below, b0, b1, .. b15 are the bytes of decoded
4021     // binary data, the first line of each of the cells (except for
4022     // the constants) uses the bit-field nomenclature from the
4023     // above-linked paper, whereas the second line is more specific
4024     // about which exact bits are present, and is constructed using the
4025     // Power ISA 3.x document style, where:
4026     //
4027     // * The specifier after the colon depicts which bits are there.
4028     // * The bit numbering is big endian style (bit 0 is the most
4029     //   significant).
4030     // * || is a concatenate operator.
4031     // * Strings of 0's are a field of zeros with the shown length, and
4032     //   likewise for strings of 1's.
4033 
4034     // Note that only e12..e15 are shown here because the shifting
4035     // and OR'ing pattern replicates for e8..e11, e4..7, and
4036     // e0..e3.
4037     //
4038     // +======================+=================+======================+======================+=============+
4039     // |        Vector        |       e12       |         e13          |         e14          |     e15     |
4040     // |       Element        |                 |                      |                      |             |
4041     // +======================+=================+======================+======================+=============+
4042     // |    after vaddubm     |    00dddddd     |       00cccccc       |       00bbbbbb       |  00aaaaaa   |
4043     // |                      |   00||b2:2..7   | 00||b1:4..7||b2:0..1 | 00||b0:6..7||b1:0..3 | 00||b0:0..5 |
4044     // +----------------------+-----------------+----------------------+----------------------+-------------+
4045     // |     pack_lshift      |                 |         << 6         |         << 4         |    << 2     |
4046     // +----------------------+-----------------+----------------------+----------------------+-------------+
4047     // |     l after vslb     |    00dddddd     |       cc000000       |       bbbb0000       |  aaaaaa00   |
4048     // |                      |   00||b2:2..7   |   b2:0..1||000000    |    b1:0..3||0000     | b0:0..5||00 |
4049     // +----------------------+-----------------+----------------------+----------------------+-------------+
4050     // |     l after vslo     |    cc000000     |       bbbb0000       |       aaaaaa00       |  00000000   |
4051     // |                      | b2:0..1||000000 |    b1:0..3||0000     |     b0:0..5||00      |  00000000   |
4052     // +----------------------+-----------------+----------------------+----------------------+-------------+
4053     // |     pack_rshift      |                 |         >> 2         |         >> 4         |             |
4054     // +----------------------+-----------------+----------------------+----------------------+-------------+
4055     // |     r after vsrb     |    00dddddd     |       0000cccc       |       000000bb       |  00aaaaaa   |
4056     // |                      |   00||b2:2..7   |    0000||b1:4..7     |   000000||b0:6..7    | 00||b0:0..5 |
4057     // +----------------------+-----------------+----------------------+----------------------+-------------+
4058     // | gathered after xxlor |    ccdddddd     |       bbbbcccc       |       aaaaaabb       |  00aaaaaa   |
4059     // |                      |     b2:0..7     |       b1:0..7        |       b0:0..7        | 00||b0:0..5 |
4060     // +======================+=================+======================+======================+=============+
4061     //
4062     // Note: there is a typo in the above-linked paper that shows the result of the gathering process is:
4063     // [ddddddcc|bbbbcccc|aaaaaabb]
4064     // but should be:
4065     // [ccdddddd|bbbbcccc|aaaaaabb]
4066     //
4067     __ vslb(l, input, pack_lshift);
4068     // vslo of vec_8s shifts the vector by one octet toward lower
4069     // element numbers, discarding element 0.  This means it actually
4070     // shifts to the right (not left) according to the order of the
4071     // table above.
4072     __ vslo(l, l, vec_8s);
4073     __ vsrb(r, input, pack_rshift);
4074     __ xxlor(gathered->to_vsr(), l->to_vsr(), r->to_vsr());
4075 
4076     // Final rearrangement of bytes into their correct positions.
4077     // +==============+======+======+======+======+=====+=====+====+====+====+====+=====+=====+=====+=====+=====+=====+
4078     // |    Vector    |  e0  |  e1  |  e2  |  e3  | e4  | e5  | e6 | e7 | e8 | e9 | e10 | e11 | e12 | e13 | e14 | e15 |
4079     // |   Elements   |      |      |      |      |     |     |    |    |    |    |     |     |     |     |     |     |
4080     // +==============+======+======+======+======+=====+=====+====+====+====+====+=====+=====+=====+=====+=====+=====+
4081     // | after xxlor  | b11  | b10  |  b9  |  xx  | b8  | b7  | b6 | xx | b5 | b4 | b3  | xx  | b2  | b1  | b0  | xx  |
4082     // +--------------+------+------+------+------+-----+-----+----+----+----+----+-----+-----+-----+-----+-----+-----+
4083     // | pack_permute |  0   |  0   |  0   |  0   |  0  |  1  | 2  | 4  | 5  | 6  |  8  |  9  | 10  | 12  | 13  | 14  |
4084     // +--------------+------+------+------+------+-----+-----+----+----+----+----+-----+-----+-----+-----+-----+-----+
4085     // | after xxperm | b11* | b11* | b11* | b11* | b11 | b10 | b9 | b8 | b7 | b6 | b5  | b4  | b3  | b2  | b1  | b0  |
4086     // +==============+======+======+======+======+=====+=====+====+====+====+====+=====+=====+=====+=====+=====+=====+
4087     // xx bytes are not used to form the final data
4088     // b0..b15 are the decoded and reassembled 8-bit bytes of data
4089     // b11 with asterisk is a "don't care", because these bytes will be
4090     // overwritten on the next iteration.
4091     __ xxperm(gathered->to_vsr(), gathered->to_vsr(), pack_permute);
4092 
4093     // We cannot use a static displacement on the store, since it's a
4094     // multiple of 12, not 16.  Note that this stxv instruction actually
4095     // writes 16 bytes, even though only the first 12 are valid data.
4096     __ stxv(gathered->to_vsr(), 0, out);
4097     __ addi(out, out, 12);
4098     __ addi(in, in, 16);
4099     __ bdnz(loop_start);
4100 
4101     __ bind(loop_exit);
4102 
4103     // Return the number of out bytes produced, which is (out - (d + dp)) == out - d - dp;
4104     __ sub(R3_RET, out, d);
4105     __ sub(R3_RET, R3_RET, dp);
4106 
4107     __ blr();
4108 
4109     __ bind(return_zero);
4110     __ li(R3_RET, 0);
4111     __ blr();
4112 
4113     return start;
4114   }
4115 
4116 #undef UC
4117 #undef LC
4118 #undef DIG
4119 #undef PLS
4120 #undef HYP
4121 #undef SLS
4122 #undef US
4123 
4124 // This algorithm is based on the methods described in this paper:
4125 // http://0x80.pl/notesen/2016-01-12-sse-base64-encoding.html
4126 //
4127 // The details of this implementation vary from the paper due to the
4128 // difference in the ISA between SSE and AltiVec, especially in the
4129 // splitting bytes section where there is no need on Power to mask after
4130 // the shift because the shift is byte-wise rather than an entire an entire
4131 // 128-bit word.
4132 //
4133 // For the lookup part of the algorithm, different logic is used than
4134 // described in the paper because of the availability of vperm, which can
4135 // do a 64-byte table lookup in four instructions, while preserving the
4136 // branchless nature.
4137 //
4138 // Description of the ENCODE_CORE macro
4139 //
4140 // Expand first 12 x 8-bit data bytes into 16 x 6-bit bytes (upper 2
4141 // bits of each byte are zeros)
4142 //
4143 // (Note: e7..e0 are not shown because they follow the same pattern as
4144 // e8..e15)
4145 //
4146 // In the table below, b0, b1, .. b15 are the bytes of unencoded
4147 // binary data, the first line of each of the cells (except for
4148 // the constants) uses the bit-field nomenclature from the
4149 // above-linked paper, whereas the second line is more specific
4150 // about which exact bits are present, and is constructed using the
4151 // Power ISA 3.x document style, where:
4152 //
4153 // * The specifier after the colon depicts which bits are there.
4154 // * The bit numbering is big endian style (bit 0 is the most
4155 //   significant).
4156 // * || is a concatenate operator.
4157 // * Strings of 0's are a field of zeros with the shown length, and
4158 //   likewise for strings of 1's.
4159 //
4160 // +==========================+=============+======================+======================+=============+=============+======================+======================+=============+
4161 // |          Vector          |     e8      |          e9          |         e10          |     e11     |     e12     |         e13          |         e14          |     e15     |
4162 // |         Element          |             |                      |                      |             |             |                      |                      |             |
4163 // +==========================+=============+======================+======================+=============+=============+======================+======================+=============+
4164 // |        after lxv         |  jjjjkkkk   |       iiiiiijj       |       gghhhhhh       |  ffffgggg   |  eeeeeeff   |       ccdddddd       |       bbbbcccc       |  aaaaaabb   |
4165 // |                          |     b7      |          b6          |          b5          |     b4      |     b3      |          b2          |          b1          |     b0      |
4166 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
4167 // |      xxperm indexes      |      0      |          10          |          11          |     12      |      0      |          13          |          14          |     15      |
4168 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
4169 // |     (1) after xxperm     |             |       gghhhhhh       |       ffffgggg       |  eeeeeeff   |             |       ccdddddd       |       bbbbcccc       |  aaaaaabb   |
4170 // |                          |    (b15)    |          b5          |          b4          |     b3      |    (b15)    |          b2          |          b1          |     b0      |
4171 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
4172 // |      rshift_amount       |      0      |          6           |          4           |      2      |      0      |          6           |          4           |      2      |
4173 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
4174 // |        after vsrb        |             |       000000gg       |       0000ffff       |  00eeeeee   |             |       000000cc       |       0000bbbb       |  00aaaaaa   |
4175 // |                          |    (b15)    |   000000||b5:0..1    |    0000||b4:0..3     | 00||b3:0..5 |    (b15)    |   000000||b2:0..1    |    0000||b1:0..3     | 00||b0:0..5 |
4176 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
4177 // |       rshift_mask        |  00000000   |      000000||11      |      0000||1111      | 00||111111  |  00000000   |      000000||11      |      0000||1111      | 00||111111  |
4178 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
4179 // |    rshift after vand     |  00000000   |       000000gg       |       0000ffff       |  00eeeeee   |  00000000   |       000000cc       |       0000bbbb       |  00aaaaaa   |
4180 // |                          |  00000000   |   000000||b5:0..1    |    0000||b4:0..3     | 00||b3:0..5 |  00000000   |   000000||b2:0..1    |    0000||b1:0..3     | 00||b0:0..5 |
4181 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
4182 // |    1 octet lshift (1)    |  gghhhhhh   |       ffffgggg       |       eeeeeeff       |             |  ccdddddd   |       bbbbcccc       |       aaaaaabb       |  00000000   |
4183 // |                          |     b5      |          b4          |          b3          |    (b15)    |     b2      |          b1          |          b0          |  00000000   |
4184 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
4185 // |      lshift_amount       |      0      |          2           |          4           |      0      |      0      |          2           |          4           |      0      |
4186 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
4187 // |        after vslb        |  gghhhhhh   |       ffgggg00       |       eeff0000       |             |  ccdddddd   |       bbcccc00       |       aabb0000       |  00000000   |
4188 // |                          |     b5      |     b4:2..7||00      |    b3:4..7||0000     |    (b15)    |   b2:0..7   |     b1:2..7||00      |    b0:4..7||0000     |  00000000   |
4189 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
4190 // |       lshift_mask        | 00||111111  |     00||1111||00     |     00||11||0000     |  00000000   | 00||111111  |     00||1111||00     |     00||11||0000     |  00000000   |
4191 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
4192 // |    lshift after vand     |  00hhhhhh   |       00gggg00       |       00ff0000       |  00000000   |  00dddddd   |       00cccc00       |       00bb0000       |  00000000   |
4193 // |                          | 00||b5:2..7 |   00||b4:4..7||00    |  00||b3:6..7||0000   |  00000000   | 00||b2:2..7 |   00||b1:4..7||00    |  00||b0:6..7||0000   |  00000000   |
4194 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
4195 // | after vor lshift, rshift |  00hhhhhh   |       00gggggg       |       00ffffff       |  00eeeeee   |  00dddddd   |       00cccccc       |       00bbbbbb       |  00aaaaaa   |
4196 // |                          | 00||b5:2..7 | 00||b4:4..7||b5:0..1 | 00||b3:6..7||b4:0..3 | 00||b3:0..5 | 00||b2:2..7 | 00||b1:4..7||b2:0..1 | 00||b0:6..7||b1:0..3 | 00||b0:0..5 |
4197 // +==========================+=============+======================+======================+=============+=============+======================+======================+=============+
4198 //
4199 // Expand the first 12 bytes into 16 bytes, leaving every 4th byte
4200 // blank for now.
4201 // __ xxperm(input->to_vsr(), input->to_vsr(), expand_permute);
4202 //
4203 // Generate two bit-shifted pieces - rshift and lshift - that will
4204 // later be OR'd together.
4205 //
4206 // First the right-shifted piece
4207 // __ vsrb(rshift, input, expand_rshift);
4208 // __ vand(rshift, rshift, expand_rshift_mask);
4209 //
4210 // Now the left-shifted piece, which is done by octet shifting
4211 // the input one byte to the left, then doing a variable shift,
4212 // followed by a mask operation.
4213 //
4214 // __ vslo(lshift, input, vec_8s);
4215 // __ vslb(lshift, lshift, expand_lshift);
4216 // __ vand(lshift, lshift, expand_lshift_mask);
4217 //
4218 // Combine the two pieces by OR'ing
4219 // __ vor(expanded, rshift, lshift);
4220 //
4221 // At this point, expanded is a vector containing a 6-bit value in each
4222 // byte.  These values are used as indexes into a 64-byte lookup table that
4223 // is contained in four vector registers.  The lookup operation is done
4224 // using vperm instructions with the same indexes for the lower 32 and
4225 // upper 32 bytes.  To figure out which of the two looked-up bytes to use
4226 // at each location, all values in expanded are compared to 31.  Using
4227 // vsel, values higher than 31 use the results from the upper 32 bytes of
4228 // the lookup operation, while values less than or equal to 31 use the
4229 // lower 32 bytes of the lookup operation.
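//
// For example, a 6-bit value of 37 conceptually selects byte 37 mod 32 == 5
// from each 32-byte table half: 'F' from the 00..31 half and 'l' from the
// 32..63 half.  Because 37 > 31, the vsel keeps the 'l' from encoded_32_63,
// which is the correct Base64 character for index 37.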
4230 //
4231 // Note: it's tempting to use a xxpermx,xxpermx,vor sequence here on
4232 // Power10 (or later), but experiments doing so on Power10 yielded a slight
4233 // performance drop, perhaps due to the need for xxpermx instruction
4234 // prefixes.
4235 
4236 #define ENCODE_CORE                                                        \
4237     __ xxperm(input->to_vsr(), input->to_vsr(), expand_permute);           \
4238     __ vsrb(rshift, input, expand_rshift);                                 \
4239     __ vand(rshift, rshift, expand_rshift_mask);                           \
4240     __ vslo(lshift, input, vec_8s);                                        \
4241     __ vslb(lshift, lshift, expand_lshift);                                \
4242     __ vand(lshift, lshift, expand_lshift_mask);                           \
4243     __ vor(expanded, rshift, lshift);                                      \
4244     __ vperm(encoded_00_31, vec_base64_00_15, vec_base64_16_31, expanded); \
4245     __ vperm(encoded_32_63, vec_base64_32_47, vec_base64_48_63, expanded); \
4246     __ vcmpgtub(gt_31, expanded, vec_31s);                                 \
4247     __ vsel(expanded, encoded_00_31, encoded_32_63, gt_31);
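
// After one use of ENCODE_CORE, 'expanded' holds 16 Base64 characters built
// from 12 of the 16 data bytes that were loaded into 'input'.  rshift,
// lshift, encoded_00_31, encoded_32_63 and gt_31 serve as scratch, and
// 'input' itself is clobbered (encoded_00_31, encoded_32_63 and gt_31 alias
// input, rshift and lshift respectively).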
4248 
4249 // Intrinsic function prototype in Base64.java:
4250 // private void encodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL) {
4251 
4252   address generate_base64_encodeBlock() {
4253     __ align(CodeEntryAlignment);
4254     StubCodeMark mark(this, "StubRoutines", "base64_encodeBlock");
4255     address start   = __ function_entry();
4256 
4257     typedef struct {
4258       unsigned char expand_permute_val[16];
4259       unsigned char expand_rshift_val[16];
4260       unsigned char expand_rshift_mask_val[16];
4261       unsigned char expand_lshift_val[16];
4262       unsigned char expand_lshift_mask_val[16];
4263       unsigned char base64_00_15_val[16];
4264       unsigned char base64_16_31_val[16];
4265       unsigned char base64_32_47_val[16];
4266       unsigned char base64_48_63_val[16];
4267       unsigned char base64_48_63_URL_val[16];
4268     } constant_block;
4269 
4270     static const constant_block VEC_ALIGN const_block = {
4271       .expand_permute_val = {
4272         ARRAY_TO_LXV_ORDER(
4273         0,  4,  5,  6,
4274         0,  7,  8,  9,
4275         0, 10, 11, 12,
4276         0, 13, 14, 15 ) },
4277 
4278       .expand_rshift_val = {
4279         ARRAY_TO_LXV_ORDER(
4280         0, 6, 4, 2,
4281         0, 6, 4, 2,
4282         0, 6, 4, 2,
4283         0, 6, 4, 2 ) },
4284 
4285       .expand_rshift_mask_val = {
4286         ARRAY_TO_LXV_ORDER(
4287         0b00000000, 0b00000011, 0b00001111, 0b00111111,
4288         0b00000000, 0b00000011, 0b00001111, 0b00111111,
4289         0b00000000, 0b00000011, 0b00001111, 0b00111111,
4290         0b00000000, 0b00000011, 0b00001111, 0b00111111 ) },
4291 
4292       .expand_lshift_val = {
4293         ARRAY_TO_LXV_ORDER(
4294         0, 2, 4, 0,
4295         0, 2, 4, 0,
4296         0, 2, 4, 0,
4297         0, 2, 4, 0 ) },
4298 
4299       .expand_lshift_mask_val = {
4300         ARRAY_TO_LXV_ORDER(
4301         0b00111111, 0b00111100, 0b00110000, 0b00000000,
4302         0b00111111, 0b00111100, 0b00110000, 0b00000000,
4303         0b00111111, 0b00111100, 0b00110000, 0b00000000,
4304         0b00111111, 0b00111100, 0b00110000, 0b00000000 ) },
4305 
4306       .base64_00_15_val = {
4307         ARRAY_TO_LXV_ORDER(
4308         'A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P' ) },
4309 
4310       .base64_16_31_val = {
4311         ARRAY_TO_LXV_ORDER(
4312         'Q','R','S','T','U','V','W','X','Y','Z','a','b','c','d','e','f' ) },
4313 
4314       .base64_32_47_val = {
4315         ARRAY_TO_LXV_ORDER(
4316         'g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v' ) },
4317 
4318       .base64_48_63_val = {
4319         ARRAY_TO_LXV_ORDER(
4320         'w','x','y','z','0','1','2','3','4','5','6','7','8','9','+','/' ) },
4321 
4322       .base64_48_63_URL_val = {
4323         ARRAY_TO_LXV_ORDER(
4324         'w','x','y','z','0','1','2','3','4','5','6','7','8','9','-','_' ) }
4325     };
4326 
4327     // Number of bytes to process in each pass through the main loop.
4328     // 12 of the 16 bytes from each lxv are encoded to 16 Base64 bytes.
4329     const unsigned block_size = 12;
4330 
4331     // According to the ELF V2 ABI, registers r3-r12 are volatile and available for use without save/restore
4332     Register src       = R3_ARG1; // source starting address of Base64 characters
4333     Register sp        = R4_ARG2; // source starting position
4334     Register sl        = R5_ARG3; // total source length of the Base64 characters to be processed
4335     Register dst       = R6_ARG4; // destination address
4336     Register dp        = R7_ARG5; // destination starting position
4337     Register isURL     = R8_ARG6; // boolean, if non-zero indicates use of RFC 4648 base64url encoding
4338 
4339     // Local variables
4340     Register const_ptr     = R12; // used for loading constants (reuses isURL's register)
    Register tmp_reg       = R9;  // used for speeding up load_const_optimized()
4342 
4343     Register size           = R9;  // number of bytes to process (reuses tmp_reg's register)
4344     Register blocked_size   = R10; // number of bytes to process a block at a time
4345     Register block_modulo   = R12; // == block_size (reuse const_ptr)
4346     Register remaining      = R12; // bytes remaining to process after the blocks are completed (reuse block_modulo's reg)
4347     Register in             = R4;  // current input (source) pointer (reuse sp's register)
4348     Register num_blocks     = R11; // number of blocks to be processed by the loop
4349     Register out            = R8;  // current output (destination) pointer (reuse const_ptr's register)
4350     Register three          = R9;  // constant divisor (reuse size's register)
    Register bytes_to_write = R10; // number of bytes to write with the stxvl instr (reuse blocked_size's register)
4352     Register tmp1           = R7;  // temp register for lxvl length (reuse dp's register)
4353     Register modulo_chars   = R7;  // number of bytes written during the final write % 4 (reuse tmp1's register)
4354     Register pad_char       = R6;  // literal '=' (reuse dst's register)
4355 
    // Volatile VSRs are 0..13 and 32..51 (VR0..VR19)
4357     // VR Constants
4358     VectorRegister  vec_8s             = VR0;
4359     VectorRegister  vec_31s            = VR1;
4360     VectorRegister  vec_base64_00_15   = VR2;
4361     VectorRegister  vec_base64_16_31   = VR3;
4362     VectorRegister  vec_base64_32_47   = VR4;
4363     VectorRegister  vec_base64_48_63   = VR5;
4364     VectorRegister  expand_rshift      = VR6;
4365     VectorRegister  expand_rshift_mask = VR7;
4366     VectorRegister  expand_lshift      = VR8;
4367     VectorRegister  expand_lshift_mask = VR9;
4368 
4369     // VR variables for expand
4370     VectorRegister  input              = VR10;
4371     VectorRegister  rshift             = VR11;
4372     VectorRegister  lshift             = VR12;
4373     VectorRegister  expanded           = VR13;
4374 
4375     // VR variables for lookup
4376     VectorRegister  encoded_00_31      = VR10; // (reuse input)
4377     VectorRegister  encoded_32_63      = VR11; // (reuse rshift)
4378     VectorRegister  gt_31              = VR12; // (reuse lshift)
4379 
4380     // VSR Constants
4381     VectorSRegister expand_permute     = VSR0;
4382 
4383     Label not_URL, calculate_size, calculate_blocked_size, skip_loop;
4384     Label loop_start, le_16_to_write, no_pad, one_pad_char;
4385 
4386     // The upper 32 bits of the non-pointer parameter registers are not
4387     // guaranteed to be zero, so mask off those upper bits.
4388     __ clrldi(sp, sp, 32);
4389     __ clrldi(sl, sl, 32);
4390     __ clrldi(dp, dp, 32);
4391     __ clrldi(isURL, isURL, 32);
4392 
4393     // load up the constants
4394     __ load_const_optimized(const_ptr, (address)&const_block, tmp_reg);
4395     __ lxv(expand_permute,               BLK_OFFSETOF(expand_permute_val),     const_ptr);
4396     __ lxv(expand_rshift->to_vsr(),      BLK_OFFSETOF(expand_rshift_val),      const_ptr);
4397     __ lxv(expand_rshift_mask->to_vsr(), BLK_OFFSETOF(expand_rshift_mask_val), const_ptr);
4398     __ lxv(expand_lshift->to_vsr(),      BLK_OFFSETOF(expand_lshift_val),      const_ptr);
4399     __ lxv(expand_lshift_mask->to_vsr(), BLK_OFFSETOF(expand_lshift_mask_val), const_ptr);
4400     __ lxv(vec_base64_00_15->to_vsr(),   BLK_OFFSETOF(base64_00_15_val),       const_ptr);
4401     __ lxv(vec_base64_16_31->to_vsr(),   BLK_OFFSETOF(base64_16_31_val),       const_ptr);
4402     __ lxv(vec_base64_32_47->to_vsr(),   BLK_OFFSETOF(base64_32_47_val),       const_ptr);
4403 
4404     // Splat the constants that can use xxspltib
4405     __ xxspltib(vec_8s->to_vsr(), 8);
4406     __ xxspltib(vec_31s->to_vsr(), 31);
4407 
4408 
4409     // Use a different translation lookup table depending on the
4410     // setting of isURL
4411     __ cmpdi(CCR0, isURL, 0);
4412     __ beq(CCR0, not_URL);
4413     __ lxv(vec_base64_48_63->to_vsr(), BLK_OFFSETOF(base64_48_63_URL_val), const_ptr);
4414     __ b(calculate_size);
4415 
4416     __ bind(not_URL);
4417     __ lxv(vec_base64_48_63->to_vsr(), BLK_OFFSETOF(base64_48_63_val), const_ptr);
4418 
4419     __ bind(calculate_size);
4420 
    // size = sl - sp - 4 (*)
    // (*) Don't process the last four bytes of src in the main loop.
    // Each lxv in the loop reads 16 bytes but only 12 of them are consumed,
    // so without this adjustment the final iteration could read up to 4
    // bytes past the end of the src data, which might land on an unmapped
    // or otherwise inaccessible page.
    //
4427     __ sub(size, sl, sp);
4428     __ subi(size, size, 4);
4429     __ cmpdi(CCR7, size, block_size);
4430     __ bgt(CCR7, calculate_blocked_size);
4431     __ mr(remaining, size);
4432     // Add the 4 back into remaining again
4433     __ addi(remaining, remaining, 4);
4434     // make "in" point to the beginning of the source data: in = src + sp
4435     __ add(in, src, sp);
4436     // out = dst + dp
4437     __ add(out, dst, dp);
4438     __ b(skip_loop);
4439 
4440     __ bind(calculate_blocked_size);
4441     __ li(block_modulo, block_size);
4442     // num_blocks = size / block_modulo
4443     __ divwu(num_blocks, size, block_modulo);
    // blocked_size = num_blocks * block_modulo
4445     __ mullw(blocked_size, num_blocks, block_modulo);
4446     // remaining = size - blocked_size
4447     __ sub(remaining, size, blocked_size);
4448     __ mtctr(num_blocks);
4449 
4450     // Add the 4 back in to remaining again
4451     __ addi(remaining, remaining, 4);
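    // Worked example (hypothetical sizes): if sl - sp == 100, then size == 96,
    // num_blocks == 8, blocked_size == 96 and remaining == 0 + 4 == 4; the
    // loop below encodes 96 source bytes into 128 characters, and the code
    // after skip_loop handles the final 4 bytes.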
4452 
4453     // make "in" point to the beginning of the source data: in = src + sp
4454     __ add(in, src, sp);
4455 
4456     // out = dst + dp
4457     __ add(out, dst, dp);
4458 
4459     __ align(32);
4460     __ bind(loop_start);
4461 
4462     __ lxv(input->to_vsr(), 0, in);
4463 
4464     ENCODE_CORE
4465 
4466     __ stxv(expanded->to_vsr(), 0, out);
4467     __ addi(in, in, 12);
4468     __ addi(out, out, 16);
4469     __ bdnz(loop_start);
4470 
4471     __ bind(skip_loop);
4472 
    // When there are fewer than 16 bytes left, we need to be careful not to
    // read beyond the end of the src buffer, which might be in an unmapped
    // page.
    // Load the remaining bytes using lxvl.
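    // lxvl takes its byte count from the most significant byte (bits 0:7)
    // of the length register, hence the shift of "remaining" left by 56.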
4477     __ rldicr(tmp1, remaining, 56, 7);
4478     __ lxvl(input->to_vsr(), in, tmp1);
4479 
4480     ENCODE_CORE
4481 
4482     // bytes_to_write = ((remaining * 4) + 2) / 3
4483     __ li(three, 3);
4484     __ rlwinm(bytes_to_write, remaining, 2, 0, 29); // remaining * 4
4485     __ addi(bytes_to_write, bytes_to_write, 2);
4486     __ divwu(bytes_to_write, bytes_to_write, three);
4487 
4488     __ cmpwi(CCR7, bytes_to_write, 16);
4489     __ ble_predict_taken(CCR7, le_16_to_write);
4490     __ stxv(expanded->to_vsr(), 0, out);
4491 
4492     // We've processed 12 of the 13-15 data bytes, so advance the pointers,
4493     // and do one final pass for the remaining 1-3 bytes.
4494     __ addi(in, in, 12);
4495     __ addi(out, out, 16);
4496     __ subi(remaining, remaining, 12);
4497     __ subi(bytes_to_write, bytes_to_write, 16);
4498     __ rldicr(tmp1, bytes_to_write, 56, 7);
4499     __ lxvl(input->to_vsr(), in, tmp1);
4500 
4501     ENCODE_CORE
4502 
4503     __ bind(le_16_to_write);
    // Shift bytes_to_write into the upper 8 bits of tmp1 for use by stxvl.
4505     __ rldicr(tmp1, bytes_to_write, 56, 7);
4506     __ stxvl(expanded->to_vsr(), out, tmp1);
4507     __ add(out, out, bytes_to_write);
4508 
4509     __ li(pad_char, '=');
4510     __ rlwinm_(modulo_chars, bytes_to_write, 0, 30, 31); // bytes_to_write % 4, set CCR0
4511     // Examples:
4512     //    remaining  bytes_to_write  modulo_chars  num pad chars
4513     //        0            0               0            0
4514     //        1            2               2            2
4515     //        2            3               3            1
4516     //        3            4               0            0
4517     //        4            6               2            2
4518     //        5            7               3            1
4519     //        ...
4520     //       12           16               0            0
4521     //       13           18               2            2
4522     //       14           19               3            1
4523     //       15           20               0            0
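    // For instance, a 2-byte tail 0x41 0x42 ("AB") gives bytes_to_write == 3
    // and modulo_chars == 3, so 'Q', 'U' and 'I' are written followed by a
    // single '=' pad: "QUI=".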
4524     __ beq(CCR0, no_pad);
4525     __ cmpwi(CCR7, modulo_chars, 3);
4526     __ beq(CCR7, one_pad_char);
4527 
4528     // two pad chars
4529     __ stb(pad_char, out);
4530     __ addi(out, out, 1);
4531 
4532     __ bind(one_pad_char);
4533     __ stb(pad_char, out);
4534 
4535     __ bind(no_pad);
4536 
4537     __ blr();
4538     return start;
4539   }
4540 
4541 #endif // VM_LITTLE_ENDIAN
4542 
4543   // Initialization
4544   void generate_initial() {
4545     // Generates all stubs and initializes the entry points
4546 
4547     // Entry points that exist in all platforms.
    // Note: This is code that could be shared among different platforms;
    // however, the benefit seems smaller than the disadvantage of having a
    // much more complicated generator structure. See also the comment in
    // stubRoutines.hpp.
4552 
4553     StubRoutines::_forward_exception_entry          = generate_forward_exception();
4554     StubRoutines::_call_stub_entry                  = generate_call_stub(StubRoutines::_call_stub_return_address);
4555     StubRoutines::_catch_exception_entry            = generate_catch_exception();
4556 
4557     // Build this early so it's available for the interpreter.
4558     StubRoutines::_throw_StackOverflowError_entry   =
4559       generate_throw_exception("StackOverflowError throw_exception",
4560                                CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError), false);
4561     StubRoutines::_throw_delayed_StackOverflowError_entry =
4562       generate_throw_exception("delayed StackOverflowError throw_exception",
4563                                CAST_FROM_FN_PTR(address, SharedRuntime::throw_delayed_StackOverflowError), false);
4564 
4565     // CRC32 Intrinsics.
4566     if (UseCRC32Intrinsics) {
4567       StubRoutines::_crc_table_adr = StubRoutines::ppc::generate_crc_constants(REVERSE_CRC32_POLY);
4568       StubRoutines::_updateBytesCRC32 = generate_CRC32_updateBytes(false);
4569     }
4570 
4571     // CRC32C Intrinsics.
4572     if (UseCRC32CIntrinsics) {
4573       StubRoutines::_crc32c_table_addr = StubRoutines::ppc::generate_crc_constants(REVERSE_CRC32C_POLY);
4574       StubRoutines::_updateBytesCRC32C = generate_CRC32_updateBytes(true);
4575     }
4576 
4577     // Safefetch stubs.
4578     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
4579                                                        &StubRoutines::_safefetch32_fault_pc,
4580                                                        &StubRoutines::_safefetch32_continuation_pc);
4581     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
4582                                                        &StubRoutines::_safefetchN_fault_pc,
4583                                                        &StubRoutines::_safefetchN_continuation_pc);
4584   }
4585 
4586   void generate_all() {
4587     // Generates all stubs and initializes the entry points
4588 
4589     // These entry points require SharedInfo::stack0 to be set up in
4590     // non-core builds
4591     StubRoutines::_throw_AbstractMethodError_entry         = generate_throw_exception("AbstractMethodError throw_exception",          CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError),  false);
4592     // Handle IncompatibleClassChangeError in itable stubs.
4593     StubRoutines::_throw_IncompatibleClassChangeError_entry= generate_throw_exception("IncompatibleClassChangeError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_IncompatibleClassChangeError),  false);
4594     StubRoutines::_throw_NullPointerException_at_call_entry= generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call), false);
4595 
4596     // support for verify_oop (must happen after universe_init)
4597     StubRoutines::_verify_oop_subroutine_entry             = generate_verify_oop();
4598 
4599     // nmethod entry barriers for concurrent class unloading
4600     BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
4601     if (bs_nm != NULL) {
4602       StubRoutines::ppc::_nmethod_entry_barrier            = generate_nmethod_entry_barrier();
4603     }
4604 
4605     // arraycopy stubs used by compilers
4606     generate_arraycopy_stubs();
4607 
4608 #ifdef COMPILER2
4609     if (UseMultiplyToLenIntrinsic) {
4610       StubRoutines::_multiplyToLen = generate_multiplyToLen();
4611     }
4612     if (UseSquareToLenIntrinsic) {
4613       StubRoutines::_squareToLen = generate_squareToLen();
4614     }
4615     if (UseMulAddIntrinsic) {
4616       StubRoutines::_mulAdd = generate_mulAdd();
4617     }
4618     if (UseMontgomeryMultiplyIntrinsic) {
4619       StubRoutines::_montgomeryMultiply
4620         = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_multiply);
4621     }
4622     if (UseMontgomerySquareIntrinsic) {
4623       StubRoutines::_montgomerySquare
4624         = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square);
4625     }
4626 #endif
4627 
4628     // data cache line writeback
4629     if (VM_Version::supports_data_cache_line_flush()) {
4630       StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
4631       StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
4632     }
4633 
4634     if (UseAESIntrinsics) {
4635       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
4636       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
4637     }
4638 
4639     if (UseSHA256Intrinsics) {
4640       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
4641       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
4642     }
4643     if (UseSHA512Intrinsics) {
4644       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(false, "sha512_implCompress");
4645       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true, "sha512_implCompressMB");
4646     }
4647 
4648 #ifdef VM_LITTLE_ENDIAN
4649     // Currently supported on PPC64LE only
4650     if (UseBASE64Intrinsics) {
4651       StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
4652       StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
4653     }
4654 #endif
4655   }
4656 
4657  public:
4658   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
4659     // replace the standard masm with a special one:
4660     _masm = new MacroAssembler(code);
4661     if (all) {
4662       generate_all();
4663     } else {
4664       generate_initial();
4665     }
4666   }
4667 };
4668 
4669 #define UCM_TABLE_MAX_ENTRIES 8
4670 void StubGenerator_generate(CodeBuffer* code, bool all) {
4671   if (UnsafeCopyMemory::_table == NULL) {
4672     UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES);
4673   }
4674   StubGenerator g(code, all);
4675 }