/*
 * Copyright (c) 1997, 2025, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2025 SAP SE. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "asm/macroAssembler.inline.hpp"
#include "compiler/oopMap.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "gc/shared/barrierSetNMethod.hpp"
#include "interpreter/interpreter.hpp"
#include "nativeInst_ppc.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "prims/upcallLinker.hpp"
#include "runtime/continuation.hpp"
#include "runtime/continuationEntry.inline.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/javaThread.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/vm_version.hpp"
#include "utilities/align.hpp"
#include "utilities/powerOfTwo.hpp"
#if INCLUDE_ZGC
#include "gc/z/zBarrierSetAssembler.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp.

#define __ _masm->

#ifdef PRODUCT
#define BLOCK_COMMENT(str) // nothing
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#if defined(ABI_ELFv2)
#define STUB_ENTRY(name) StubRoutines::name
#else
#define STUB_ENTRY(name) ((FunctionDescriptor*)StubRoutines::name)->entry()
#endif

class StubGenerator: public StubCodeGenerator {
 private:

  // Call stubs are used to call Java from C
  //
  // Arguments:
  //
  //   R3  - call wrapper address     : address
  //   R4  - result                   : intptr_t*
  //   R5  - result type              : BasicType
  //   R6  - method                   : Method
  //   R7  - frame mgr entry point    : address
  //   R8  - parameter block          : intptr_t*
  //   R9  - parameter count in words : int
  //   R10 - thread                   : Thread*
  //
  address generate_call_stub(address& return_address) {
    // Setup a new c frame, copy java arguments, call frame manager or
    // native_entry, and process result.
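    // Illustrative sketch only (assumption, not part of this file): seen from the
    // C++ caller (roughly JavaCalls::call_helper() via StubRoutines::call_stub()),
    // the generated entry behaves like a function whose parameters mirror the
    // register list documented above. The hypothetical typedef below only restates
    // that R3..R10 argument mapping; the authoritative typedef is in stubRoutines.hpp.
    //
    //   typedef void (*call_stub_sketch_t)(address   call_wrapper,    // R3
    //                                      intptr_t* result,          // R4
    //                                      int       result_type,     // R5 (BasicType)
    //                                      Method*   method,          // R6
    //                                      address   entry_point,     // R7
    //                                      intptr_t* parameters,      // R8
    //                                      int       parameter_words, // R9
    //                                      Thread*   thread);         // R10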

    StubGenStubId stub_id = StubGenStubId::call_stub_id;
    StubCodeMark mark(this, stub_id);

    address start = __ function_entry();

    int save_nonvolatile_registers_size = __ save_nonvolatile_registers_size(true, SuperwordUseVSX);

    // some sanity checks
    STATIC_ASSERT(StackAlignmentInBytes == 16);
    assert((sizeof(frame::native_abi_minframe) % 16) == 0,    "unaligned");
    assert((sizeof(frame::native_abi_reg_args) % 16) == 0,    "unaligned");
    assert((save_nonvolatile_registers_size % 16) == 0,       "unaligned");
    assert((sizeof(frame::parent_ijava_frame_abi) % 16) == 0, "unaligned");
    assert((sizeof(frame::entry_frame_locals) % 16) == 0,     "unaligned");

    Register r_arg_call_wrapper_addr = R3;
    Register r_arg_result_addr       = R4;
    Register r_arg_result_type       = R5;
    Register r_arg_method            = R6;
    Register r_arg_entry             = R7;
    Register r_arg_argument_addr     = R8;
    Register r_arg_argument_count    = R9;
    Register r_arg_thread            = R10;

    Register r_entryframe_fp         = R2;           // volatile
    Register r_argument_size         = R11_scratch1; // volatile
    Register r_top_of_arguments_addr = R21_tmp1;

    {
      // Stack on entry to call_stub:
      //
      //      F1      [C_FRAME]
      //              ...
      Register r_frame_size = R12_scratch2; // volatile
      Label arguments_copied;

      // Save LR/CR to caller's C_FRAME.
      __ save_LR_CR(R0);

      // Keep copy of our frame pointer (caller's SP).
      __ mr(r_entryframe_fp, R1_SP);

      // calculate frame size
      STATIC_ASSERT(Interpreter::logStackElementSize == 3);

      // space for arguments aligned up: ((arg_count + 1) * 8) &~ 15
      __ addi(r_frame_size, r_arg_argument_count, 1);
      __ rldicr(r_frame_size, r_frame_size, 3, 63 - 4);

      // this is the pure space for arguments (excluding alignment padding)
      __ sldi(r_argument_size, r_arg_argument_count, 3);

      __ addi(r_frame_size, r_frame_size,
              save_nonvolatile_registers_size + frame::entry_frame_locals_size + frame::top_ijava_frame_abi_size);

      // push ENTRY_FRAME
      __ push_frame(r_frame_size, R0);

      // Save non-volatile registers to ENTRY_FRAME.
      __ save_nonvolatile_registers(r_entryframe_fp, -(frame::entry_frame_locals_size + save_nonvolatile_registers_size),
                                    true, SuperwordUseVSX);

      BLOCK_COMMENT("Push ENTRY_FRAME including arguments");
      // Push ENTRY_FRAME including arguments:
      //
      //      F0      [TOP_IJAVA_FRAME_ABI]
      //              alignment (optional)
      //              [outgoing Java arguments]
      //              [non-volatiles]
      //              [ENTRY_FRAME_LOCALS]
      //      F1      [C_FRAME]
      //              ...

      // initialize call_stub locals (step 1)
      __ std(r_arg_call_wrapper_addr, _entry_frame_locals_neg(call_wrapper_address), r_entryframe_fp);
      __ std(r_arg_result_addr, _entry_frame_locals_neg(result_address), r_entryframe_fp);
      __ std(r_arg_result_type, _entry_frame_locals_neg(result_type), r_entryframe_fp);
      // we will save arguments_tos_address later

      BLOCK_COMMENT("Copy Java arguments");
      // copy Java arguments

      // Calculate top_of_arguments_addr which will be R17_tos (not prepushed) later.
      __ addi(r_top_of_arguments_addr, r_entryframe_fp,
              -(save_nonvolatile_registers_size + frame::entry_frame_locals_size));
      __ sub(r_top_of_arguments_addr, r_top_of_arguments_addr, r_argument_size);

      // any arguments to copy?
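      // Illustrative sketch only (hypothetical C names, not emitted code): the
      // copy sequence below walks the incoming parameter block backwards and the
      // outgoing Java argument area forwards, one 8-byte slot per iteration:
      //
      //   intptr_t* src = parameter_block + argument_count - 1; // last incoming argument
      //   intptr_t* dst = top_of_arguments;                     // first outgoing slot
      //   for (int i = 0; i < argument_count; i++) {
      //     *dst++ = *src--;                                    // copy in reverse order
      //   }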
      __ cmpdi(CR0, r_arg_argument_count, 0);
      __ beq(CR0, arguments_copied);

      // prepare loop and copy arguments in reverse order
      {
        Register r_argument_addr     = R22_tmp2;
        Register r_argumentcopy_addr = R23_tmp3;
        // init CTR with arg_argument_count
        __ mtctr(r_arg_argument_count);

        // let r_argumentcopy_addr point to last outgoing Java arguments
        __ mr(r_argumentcopy_addr, r_top_of_arguments_addr);

        // let r_argument_addr point to last incoming java argument
        __ add(r_argument_addr, r_arg_argument_addr, r_argument_size);
        __ addi(r_argument_addr, r_argument_addr, -BytesPerWord);

        // now loop while CTR > 0 and copy arguments
        {
          Label next_argument;
          __ bind(next_argument);

          __ ld(R0, 0, r_argument_addr);
          // argument_addr--;
          __ addi(r_argument_addr, r_argument_addr, -BytesPerWord);
          __ std(R0, 0, r_argumentcopy_addr);
          // argumentcopy_addr++;
          __ addi(r_argumentcopy_addr, r_argumentcopy_addr, BytesPerWord);

          __ bdnz(next_argument);
        }
      }

      // Arguments copied, continue.
      __ bind(arguments_copied);
    }

    {
      BLOCK_COMMENT("Call frame manager or native entry.");
      // Call frame manager or native entry.
      assert_different_registers(r_arg_entry, r_top_of_arguments_addr, r_arg_method, r_arg_thread);

      // Register state on entry to frame manager / native entry:
      //
      //   tos         - intptr_t*   sender tos (prepushed) Lesp = (SP) + copied_arguments_offset - 8
      //   R19_method  - Method
      //   R16_thread  - JavaThread*

      // Tos must point to last argument - element_size.
      const Register tos = R15_esp;

      __ addi(tos, r_top_of_arguments_addr, -Interpreter::stackElementSize);

      // initialize call_stub locals (step 2)
      // now save tos as arguments_tos_address
      __ std(tos, _entry_frame_locals_neg(arguments_tos_address), r_entryframe_fp);

      // load argument registers for call
      __ mr(R19_method, r_arg_method);
      __ mr(R16_thread, r_arg_thread);
      assert(tos != r_arg_method, "trashed r_arg_method");
      assert(tos != r_arg_thread && R19_method != r_arg_thread, "trashed r_arg_thread");

      // Set R15_prev_state to 0 for simplifying checks in callee.
      __ load_const_optimized(R25_templateTableBase, (address)Interpreter::dispatch_table((TosState)0), R0);
      // Stack on entry to frame manager / native entry:
      //
      //      F0      [TOP_IJAVA_FRAME_ABI]
      //              alignment (optional)
      //              [outgoing Java arguments]
      //              [non-volatiles]
      //              [ENTRY_FRAME_LOCALS]
      //      F1      [C_FRAME]
      //              ...
      //

      // global toc register
      __ load_const_optimized(R29_TOC, MacroAssembler::global_toc(), R0);
      // Remember the senderSP so the interpreter can pop c2i arguments off of the stack
      // when called via a c2i.

      // Pass initial_caller_sp to framemanager.
      __ mr(R21_sender_SP, R1_SP);

      // Do a light-weight C-call here, r_arg_entry holds the address
      // of the interpreter entry point (frame manager or native entry)
      // and save runtime-value of LR in return_address.
      assert(r_arg_entry != tos && r_arg_entry != R19_method && r_arg_entry != R16_thread,
             "trashed r_arg_entry");
      return_address = __ call_stub(r_arg_entry);
    }

    {
      BLOCK_COMMENT("Returned from frame manager or native entry.");
      // Returned from frame manager or native entry.
      // Now pop frame, process result, and return to caller.

      // Stack on exit from frame manager / native entry:
      //
      //      F0      [ABI]
      //              ...
      //              [non-volatiles]
      //              [ENTRY_FRAME_LOCALS]
      //      F1      [C_FRAME]
      //              ...
      //
      //      Just pop the topmost frame ...
      //

      Label ret_is_object;
      Label ret_is_long;
      Label ret_is_float;
      Label ret_is_double;

      Register r_lr = R11_scratch1;
      Register r_cr = R12_scratch2;

      // Reload some volatile registers which we've spilled before the call
      // to frame manager / native entry.
      // Access all locals via frame pointer, because we know nothing about
      // the topmost frame's size.
      __ ld(r_entryframe_fp, _abi0(callers_sp), R1_SP); // restore after call
      assert_different_registers(r_entryframe_fp, R3_RET, r_arg_result_addr, r_arg_result_type, r_cr, r_lr);
      __ ld(r_arg_result_addr, _entry_frame_locals_neg(result_address), r_entryframe_fp);
      __ ld(r_arg_result_type, _entry_frame_locals_neg(result_type), r_entryframe_fp);
      __ ld(r_cr, _abi0(cr), r_entryframe_fp);
      __ ld(r_lr, _abi0(lr), r_entryframe_fp);
      __ mtcr(r_cr); // restore CR
      __ mtlr(r_lr); // restore LR

      // Store result depending on type. Everything that is not
      // T_OBJECT, T_LONG, T_FLOAT, or T_DOUBLE is treated as T_INT.
      // Using volatile CRs.
      __ cmpwi(CR1, r_arg_result_type, T_OBJECT);
      __ cmpwi(CR5, r_arg_result_type, T_LONG);
      __ cmpwi(CR6, r_arg_result_type, T_FLOAT);
      __ cmpwi(CR7, r_arg_result_type, T_DOUBLE);

      __ pop_cont_fastpath(); // kills CR0, uses R16_thread

      // restore non-volatile registers
      __ restore_nonvolatile_registers(r_entryframe_fp, -(frame::entry_frame_locals_size + save_nonvolatile_registers_size),
                                       true, SuperwordUseVSX);

      // pop frame
      __ mr(R1_SP, r_entryframe_fp);

      // Stack on exit from call_stub:
      //
      //      0       [C_FRAME]
      //              ...
      //
      //  no call_stub frames left.

      __ beq(CR1, ret_is_object);
      __ beq(CR5, ret_is_long);
      __ beq(CR6, ret_is_float);
      __ beq(CR7, ret_is_double);

      // default:
      __ stw(R3_RET, 0, r_arg_result_addr);
      __ blr(); // return to caller

      // case T_OBJECT:
      // case T_LONG:
      __ bind(ret_is_object);
      __ bind(ret_is_long);
      __ std(R3_RET, 0, r_arg_result_addr);
      __ blr(); // return to caller

      // case T_FLOAT:
      __ bind(ret_is_float);
      __ stfs(F1_RET, 0, r_arg_result_addr);
      __ blr(); // return to caller

      // case T_DOUBLE:
      __ bind(ret_is_double);
      __ stfd(F1_RET, 0, r_arg_result_addr);
      __ blr(); // return to caller
    }

    return start;
  }

  // Return point for a Java call if there's an exception thrown in
  // Java code. The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  address generate_catch_exception() {
    StubGenStubId stub_id = StubGenStubId::catch_exception_id;
    StubCodeMark mark(this, stub_id);

    address start = __ pc();

    // Registers alive
    //
    //   R16_thread
    //   R3_ARG1 - address of pending exception
    //   R4_ARG2 - return address in call stub

    const Register exception_file = R21_tmp1;
    const Register exception_line = R22_tmp2;

    __ load_const(exception_file, (void*)__FILE__);
    __ load_const(exception_line, (void*)__LINE__);

    __ std(R3_ARG1, in_bytes(JavaThread::pending_exception_offset()), R16_thread);
    // store into `char *'
    __ std(exception_file, in_bytes(JavaThread::exception_file_offset()), R16_thread);
    // store into `int'
    __ stw(exception_line, in_bytes(JavaThread::exception_line_offset()), R16_thread);

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != nullptr, "must have been generated before");

    __ mtlr(R4_ARG2);
    // continue in call stub
    __ blr();

    return start;
  }

  // Continuation point for runtime calls returning with a pending
  // exception. The pending exception check happened in the runtime
  // or native call stub. The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Read:
  //
  //   LR:     The pc the runtime library callee wants to return to.
  //           Since the exception occurred in the callee, the return pc
  //           from the point of view of Java is the exception pc.
  //   thread: Needed for method handles.
  //
  // Invalidate:
  //
  //   volatile registers (except below).
  //
  // Update:
  //
  //   R4_ARG2: exception
  //
  // (LR is unchanged and is live out).
  //
  address generate_forward_exception() {
    StubGenStubId stub_id = StubGenStubId::forward_exception_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    if (VerifyOops) {
      // Get pending exception oop.
      __ ld(R3_ARG1,
            in_bytes(Thread::pending_exception_offset()),
            R16_thread);
      // Make sure that this code is only executed if there is a pending exception.
      {
        Label L;
        __ cmpdi(CR0, R3_ARG1, 0);
        __ bne(CR0, L);
        __ stop("StubRoutines::forward exception: no pending exception (1)");
        __ bind(L);
      }
      __ verify_oop(R3_ARG1, "StubRoutines::forward exception: not an oop");
    }

    // Save LR/CR and copy exception pc (LR) into R4_ARG2.
    __ save_LR(R4_ARG2);
    __ push_frame_reg_args(0, R0);
    // Find exception handler.
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                    SharedRuntime::exception_handler_for_return_address),
                    R16_thread,
                    R4_ARG2);
    // Copy handler's address.
    __ mtctr(R3_RET);
    __ pop_frame();
    __ restore_LR(R0);

    // Set up the arguments for the exception handler:
    //  - R3_ARG1: exception oop
    //  - R4_ARG2: exception pc.

    // Load pending exception oop.
    __ ld(R3_ARG1,
          in_bytes(Thread::pending_exception_offset()),
          R16_thread);

    // The exception pc is the return address in the caller.
    // Must load it into R4_ARG2.
    __ mflr(R4_ARG2);

#ifdef ASSERT
    // Make sure exception is set.
    {
      Label L;
      __ cmpdi(CR0, R3_ARG1, 0);
      __ bne(CR0, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // Clear the pending exception.
    __ li(R0, 0);
    __ std(R0,
           in_bytes(Thread::pending_exception_offset()),
           R16_thread);
    // Jump to exception handler.
    __ bctr();

    return start;
  }

#undef __
#define __ _masm->

#if !defined(PRODUCT)
  // Wrapper which calls oopDesc::is_oop_or_null()
  // Only called by MacroAssembler::verify_oop
  static void verify_oop_helper(const char* message, oopDesc* o) {
    if (!oopDesc::is_oop_or_null(o)) {
      fatal("%s. oop: " PTR_FORMAT, message, p2i(o));
    }
    ++ StubRoutines::_verify_oop_count;
  }
#endif

  // Return address of code to be called from code generated by
  // MacroAssembler::verify_oop.
  //
  // Don't generate, rather use C++ code.
  address generate_verify_oop() {
    // this is actually a `FunctionDescriptor*'.
    address start = nullptr;

#if !defined(PRODUCT)
    start = CAST_FROM_FN_PTR(address, verify_oop_helper);
#endif

    return start;
  }

  // Computes the Galois/Counter Mode (GCM) product and reduction.
  //
  // This function performs polynomial multiplication of the subkey H with
  // the current GHASH state using vectorized polynomial multiplication (`vpmsumd`).
  // The subkey H is divided into lower, middle, and higher halves.
  // The multiplication results are reduced using `vConstC2` to stay within GF(2^128).
  // The final computed value is stored back into `vState`.
  static void computeGCMProduct(MacroAssembler* _masm,
                                VectorRegister vLowerH, VectorRegister vH, VectorRegister vHigherH,
                                VectorRegister vConstC2, VectorRegister vZero, VectorRegister vState,
                                VectorRegister vLowProduct, VectorRegister vMidProduct, VectorRegister vHighProduct,
                                VectorRegister vReducedLow, VectorRegister vTmp8, VectorRegister vTmp9,
                                VectorRegister vCombinedResult, VectorRegister vSwappedH) {
    __ vxor(vH, vH, vState);
    __ vpmsumd(vLowProduct, vLowerH, vH);                     // L : Lower Half of subkey H
    __ vpmsumd(vMidProduct, vSwappedH, vH);                   // M : Combined halves of subkey H
    __ vpmsumd(vHighProduct, vHigherH, vH);                   // H : Higher Half of subkey H
    __ vpmsumd(vReducedLow, vLowProduct, vConstC2);           // Reduction
    __ vsldoi(vTmp8, vMidProduct, vZero, 8);                  // mL : Extract the lower 64 bits of M
    __ vsldoi(vTmp9, vZero, vMidProduct, 8);                  // mH : Extract the higher 64 bits of M
    __ vxor(vLowProduct, vLowProduct, vTmp8);                 // LL + mL : Partial result for lower half
    __ vxor(vHighProduct, vHighProduct, vTmp9);               // HH + mH : Partial result for upper half
    __ vsldoi(vLowProduct, vLowProduct, vLowProduct, 8);      // Swap
    __ vxor(vLowProduct, vLowProduct, vReducedLow);
    __ vsldoi(vCombinedResult, vLowProduct, vLowProduct, 8);  // Swap
    __ vpmsumd(vLowProduct, vLowProduct, vConstC2);           // Reduction using constant
    __ vxor(vCombinedResult, vCombinedResult, vHighProduct);  // Combine reduced Low & High products
    __ vxor(vState, vLowProduct, vCombinedResult);
  }

  // Generate stub for ghash process blocks.
  //
  // Arguments for generated stub:
  //   state:   R3_ARG1 (long[] state)
  //   subkeyH: R4_ARG2 (long[] subH)
  //   data:    R5_ARG3 (byte[] data)
  //   blocks:  R6_ARG4 (number of 16-byte blocks to process)
  //
  // The polynomials are processed in bit-reflected order for efficiency reasons.
  // This optimization leverages the structure of the Galois field arithmetic
  // to minimize the number of bit manipulations required during multiplication.
  // For an explanation of how this works, refer:
  // Vinodh Gopal, Erdinc Ozturk, Wajdi Feghali, Jim Guilford, Gil Wolrich,
  // Martin Dixon. "Optimized Galois-Counter-Mode Implementation on Intel®
  // Architecture Processor"
  // http://web.archive.org/web/20130609111954/http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/communications-ia-galois-counter-mode-paper.pdf
  //
  //
  address generate_ghash_processBlocks() {
    StubCodeMark mark(this, "StubRoutines", "ghash");
    address start = __ function_entry();

    // Registers for parameters
    Register state   = R3_ARG1;  // long[] state
    Register subkeyH = R4_ARG2;  // long[] subH
    Register data    = R5_ARG3;  // byte[] data
    Register blocks  = R6_ARG4;
    Register temp1   = R8;
    // Vector Registers
    VectorRegister vZero           = VR0;
    VectorRegister vH              = VR1;
    VectorRegister vLowerH         = VR2;
    VectorRegister vHigherH        = VR3;
    VectorRegister vLowProduct     = VR4;
    VectorRegister vMidProduct     = VR5;
    VectorRegister vHighProduct    = VR6;
    VectorRegister vReducedLow     = VR7;
    VectorRegister vTmp8           = VR8;
    VectorRegister vTmp9           = VR9;
    VectorRegister vTmp10          = VR10;
    VectorRegister vSwappedH       = VR11;
    VectorRegister vTmp12          = VR12;
    VectorRegister loadOrder       = VR13;
    VectorRegister vHigh           = VR14;
    VectorRegister vLow            = VR15;
    VectorRegister vState          = VR16;
    VectorRegister vPerm           = VR17;
    VectorRegister vCombinedResult = VR18;
    VectorRegister vConstC2        = VR19;

    __ li(temp1, 0xc2);
    __ sldi(temp1, temp1, 56);
    __ vspltisb(vZero, 0);
    __ mtvrd(vConstC2, temp1);
    __ lxvd2x(vH->to_vsr(), subkeyH);
    __ lxvd2x(vState->to_vsr(), state);
    // Operations to obtain lower and higher bytes of subkey H.
    __ vspltisb(vReducedLow, 1);
    __ vspltisb(vTmp10, 7);
    __ vsldoi(vTmp8, vZero, vReducedLow, 1);   // 0x1
    __ vor(vTmp8, vConstC2, vTmp8);            // 0xC2...1
    __ vsplt(vTmp9, 0, vH);                    // MSB of H
    __ vsl(vH, vH, vReducedLow);               // Carry = H<<7
    __ vsrab(vTmp9, vTmp9, vTmp10);
    __ vand(vTmp9, vTmp9, vTmp8);              // Carry
    __ vxor(vTmp10, vH, vTmp9);
    __ vsldoi(vConstC2, vZero, vConstC2, 8);
    __ vsldoi(vSwappedH, vTmp10, vTmp10, 8);   // swap Lower and Higher Halves of subkey H
    __ vsldoi(vLowerH, vZero, vSwappedH, 8);   // H.L
    __ vsldoi(vHigherH, vSwappedH, vZero, 8);  // H.H
#ifdef ASSERT
    __ cmpwi(CR0, blocks, 0);                  // Compare 'blocks' (R6_ARG4) with zero
    __ asm_assert_ne("blocks should NOT be zero");
#endif
    __ clrldi(blocks, blocks, 32);
    __ mtctr(blocks);
    __ lvsl(loadOrder, temp1);
#ifdef VM_LITTLE_ENDIAN
    __ vspltisb(vTmp12, 0xf);
    __ vxor(loadOrder, loadOrder, vTmp12);
#define LE_swap_bytes(x) __ vec_perm(x, x, x, loadOrder)
#else
#define LE_swap_bytes(x)
#endif

    // This code performs Karatsuba multiplication in Galois fields to compute the GHASH operation.
    //
    // The Karatsuba method breaks the multiplication of two 128-bit numbers into smaller parts,
    // performing three 64x64-bit carry-less multiplications and combining the results efficiently.
    //
    // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
    // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
    //
    // Inputs:
    // - vH: The data vector (state), containing both B0 (lower half) and B1 (higher half).
    // - vLowerH: Lower half of the subkey H (A0).
    // - vHigherH: Higher half of the subkey H (A1).
    // - vConstC2: Constant used for reduction (for final processing).
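    //
    // Illustrative sketch only (not emitted code): with a hypothetical 64x64->128 bit
    // carry-less multiply clmul64(a, b) returning a {hi, lo} pair, the formula above
    // combines as
    //
    //   u128 C = clmul64(A1, B1);            // C1:C0
    //   u128 D = clmul64(A0, B0);            // D1:D0
    //   u128 E = clmul64(A0 ^ A1, B0 ^ B1);  // E1:E0
    //   w3 = C.hi;
    //   w2 = C.lo ^ C.hi ^ D.hi ^ E.hi;
    //   w1 = D.hi ^ C.lo ^ D.lo ^ E.lo;
    //   w0 = D.lo;                           // 256-bit product = w3:w2:w1:w0
    //
    // The 256-bit product is then reduced modulo the GHASH polynomial
    // x^128 + x^7 + x^2 + x + 1; vConstC2 (built from 0xc2 << 56 above) encodes this
    // polynomial in the bit-reflected form used by the vpmsumd-based reduction.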
    //
    // References:
    //  Shay Gueron, Michael E. Kounavis.
    //  "Intel® Carry-Less Multiplication Instruction and its Usage for Computing the GCM Mode"
    //  https://web.archive.org/web/20110609115824/https://software.intel.com/file/24918
    //
    Label L_aligned_loop, L_store, L_unaligned_loop, L_initialize_unaligned_loop;
    __ andi(temp1, data, 15);
    __ cmpwi(CR0, temp1, 0);
    __ bne(CR0, L_initialize_unaligned_loop);

    __ bind(L_aligned_loop);
    __ lvx(vH, temp1, data);
    LE_swap_bytes(vH);
    computeGCMProduct(_masm, vLowerH, vH, vHigherH, vConstC2, vZero, vState,
                      vLowProduct, vMidProduct, vHighProduct, vReducedLow, vTmp8, vTmp9, vCombinedResult, vSwappedH);
    __ addi(data, data, 16);
    __ bdnz(L_aligned_loop);
    __ b(L_store);

    __ bind(L_initialize_unaligned_loop);
    __ li(temp1, 0);
    __ lvsl(vPerm, temp1, data);
    __ lvx(vHigh, temp1, data);
#ifdef VM_LITTLE_ENDIAN
    __ vspltisb(vTmp12, -1);
    __ vxor(vPerm, vPerm, vTmp12);
#endif
    __ bind(L_unaligned_loop);
    __ addi(data, data, 16);
    __ lvx(vLow, temp1, data);
    __ vec_perm(vH, vHigh, vLow, vPerm);
    computeGCMProduct(_masm, vLowerH, vH, vHigherH, vConstC2, vZero, vState,
                      vLowProduct, vMidProduct, vHighProduct, vReducedLow, vTmp8, vTmp9, vCombinedResult, vSwappedH);
    __ vmr(vHigh, vLow);
    __ bdnz(L_unaligned_loop);

    __ bind(L_store);
    __ stxvd2x(vState->to_vsr(), state);
    __ blr();

    return start;
  }

  // -XX:+OptimizeFill : convert fill/copy loops into intrinsic
  //
  // The code is implemented (ported from SPARC) as we believe it benefits JVM98.
  // However, tracing (-XX:+TraceOptimizeFill) shows the intrinsic replacement doesn't happen at all!
  //
  // Source code in function is_range_check_if() shows that OptimizeFill relaxed the condition
  // for turning on loop predication optimization, and hence the behavior of "array range check"
  // and "loop invariant check" could be influenced, which potentially boosted JVM98.
  //
  // Generate stub for disjoint short fill. If "aligned" is true, the
  // "to" address is assumed to be heapword aligned.
  //
  // Arguments for generated stub:
  //   to:    R3_ARG1
  //   value: R4_ARG2
  //   count: R5_ARG3 treated as signed
  //
  address generate_fill(StubGenStubId stub_id) {
    BasicType t;
    bool aligned;

    switch (stub_id) {
    case jbyte_fill_id:
      t = T_BYTE;
      aligned = false;
      break;
    case jshort_fill_id:
      t = T_SHORT;
      aligned = false;
      break;
    case jint_fill_id:
      t = T_INT;
      aligned = false;
      break;
    case arrayof_jbyte_fill_id:
      t = T_BYTE;
      aligned = true;
      break;
    case arrayof_jshort_fill_id:
      t = T_SHORT;
      aligned = true;
      break;
    case arrayof_jint_fill_id:
      t = T_INT;
      aligned = true;
      break;
    default:
      ShouldNotReachHere();
    }

    StubCodeMark mark(this, stub_id);
    address start = __ function_entry();

    const Register to    = R3_ARG1;  // destination array address
    const Register value = R4_ARG2;  // fill value
    const Register count = R5_ARG3;  // elements count
    const Register temp  = R6_ARG4;  // temp register

    //assert_clean_int(count, O3);   // Make sure 'count' is clean int.
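    // Illustrative sketch only (hypothetical C names, not emitted code): the fast
    // path below widens the fill value to a 64-bit pattern (the rldimi "clone
    // bytes" steps) and then stores 32 bytes per loop iteration:
    //
    //   uint64_t pattern = replicate(value, t);   // 8 -> 16 -> 32 -> 64 bit
    //   while (bytes_left >= 32) {
    //     ((uint64_t*)to)[0] = pattern;
    //     ((uint64_t*)to)[1] = pattern;
    //     ((uint64_t*)to)[2] = pattern;
    //     ((uint64_t*)to)[3] = pattern;
    //     to += 32; bytes_left -= 32;
    //   }
    //   // 8/4/2/1-byte tails are handled by the code after the main loop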

    Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte;
    Label L_fill_2_bytes, L_fill_4_bytes, L_fill_elements, L_fill_32_bytes;

    int shift = -1;
    switch (t) {
    case T_BYTE:
      shift = 2;
      // Clone bytes (zero extend not needed because store instructions below ignore high order bytes).
      __ rldimi(value, value, 8, 48);   // 8 bit -> 16 bit
      __ cmpdi(CR0, count, 2<<shift);   // Short arrays (< 8 bytes) fill by element.
      __ blt(CR0, L_fill_elements);
      __ rldimi(value, value, 16, 32);  // 16 bit -> 32 bit
      break;
    case T_SHORT:
      shift = 1;
      // Clone bytes (zero extend not needed because store instructions below ignore high order bytes).
      __ rldimi(value, value, 16, 32);  // 16 bit -> 32 bit
      __ cmpdi(CR0, count, 2<<shift);   // Short arrays (< 8 bytes) fill by element.
      __ blt(CR0, L_fill_elements);
      break;
    case T_INT:
      shift = 0;
      __ cmpdi(CR0, count, 2<<shift);   // Short arrays (< 8 bytes) fill by element.
      __ blt(CR0, L_fill_4_bytes);
      break;
    default: ShouldNotReachHere();
    }

    if (!aligned && (t == T_BYTE || t == T_SHORT)) {
      // Align source address at 4 bytes address boundary.
      if (t == T_BYTE) {
        // One byte misalignment happens only for byte arrays.
        __ andi_(temp, to, 1);
        __ beq(CR0, L_skip_align1);
        __ stb(value, 0, to);
        __ addi(to, to, 1);
        __ addi(count, count, -1);
        __ bind(L_skip_align1);
      }
      // Two bytes misalignment happens only for byte and short (char) arrays.
      __ andi_(temp, to, 2);
      __ beq(CR0, L_skip_align2);
      __ sth(value, 0, to);
      __ addi(to, to, 2);
      __ addi(count, count, -(1 << (shift - 1)));
      __ bind(L_skip_align2);
    }

    if (!aligned) {
      // Align to 8 bytes, we know we are 4 byte aligned to start.
      __ andi_(temp, to, 7);
      __ beq(CR0, L_fill_32_bytes);
      __ stw(value, 0, to);
      __ addi(to, to, 4);
      __ addi(count, count, -(1 << shift));
      __ bind(L_fill_32_bytes);
    }

    __ li(temp, 8<<shift);              // Prepare for 32 byte loop.
    // Clone bytes int->long as above.
    __ rldimi(value, value, 32, 0);     // 32 bit -> 64 bit

    Label L_check_fill_8_bytes;
    // Fill 32-byte chunks.
    __ subf_(count, temp, count);
    __ blt(CR0, L_check_fill_8_bytes);

    Label L_fill_32_bytes_loop;
    __ align(32);
    __ bind(L_fill_32_bytes_loop);

    __ std(value, 0, to);
    __ std(value, 8, to);
    __ subf_(count, temp, count);       // Update count.
    __ std(value, 16, to);
    __ std(value, 24, to);

    __ addi(to, to, 32);
    __ bge(CR0, L_fill_32_bytes_loop);

    __ bind(L_check_fill_8_bytes);
    __ add_(count, temp, count);
    __ beq(CR0, L_exit);
    __ addic_(count, count, -(2 << shift));
    __ blt(CR0, L_fill_4_bytes);

    //
    // Length is too short, just fill 8 bytes at a time.
    //
    Label L_fill_8_bytes_loop;
    __ bind(L_fill_8_bytes_loop);
    __ std(value, 0, to);
    __ addic_(count, count, -(2 << shift));
    __ addi(to, to, 8);
    __ bge(CR0, L_fill_8_bytes_loop);

    // Fill trailing 4 bytes.
    __ bind(L_fill_4_bytes);
    __ andi_(temp, count, 1<<shift);
    __ beq(CR0, L_fill_2_bytes);

    __ stw(value, 0, to);
    if (t == T_BYTE || t == T_SHORT) {
      __ addi(to, to, 4);
      // Fill trailing 2 bytes.
      __ bind(L_fill_2_bytes);
      __ andi_(temp, count, 1<<(shift-1));
      __ beq(CR0, L_fill_byte);
      __ sth(value, 0, to);
      if (t == T_BYTE) {
        __ addi(to, to, 2);
        // Fill trailing byte.
        __ bind(L_fill_byte);
        __ andi_(count, count, 1);
        __ beq(CR0, L_exit);
        __ stb(value, 0, to);
      } else {
        __ bind(L_fill_byte);
      }
    } else {
      __ bind(L_fill_2_bytes);
    }
    __ bind(L_exit);
    __ blr();

    // Handle copies less than 8 bytes. Int is handled elsewhere.
    if (t == T_BYTE) {
      __ bind(L_fill_elements);
      Label L_fill_2, L_fill_4;
      __ andi_(temp, count, 1);
      __ beq(CR0, L_fill_2);
      __ stb(value, 0, to);
      __ addi(to, to, 1);
      __ bind(L_fill_2);
      __ andi_(temp, count, 2);
      __ beq(CR0, L_fill_4);
      __ stb(value, 0, to);
      __ stb(value, 0, to);
      __ addi(to, to, 2);
      __ bind(L_fill_4);
      __ andi_(temp, count, 4);
      __ beq(CR0, L_exit);
      __ stb(value, 0, to);
      __ stb(value, 1, to);
      __ stb(value, 2, to);
      __ stb(value, 3, to);
      __ blr();
    }

    if (t == T_SHORT) {
      Label L_fill_2;
      __ bind(L_fill_elements);
      __ andi_(temp, count, 1);
      __ beq(CR0, L_fill_2);
      __ sth(value, 0, to);
      __ addi(to, to, 2);
      __ bind(L_fill_2);
      __ andi_(temp, count, 2);
      __ beq(CR0, L_exit);
      __ sth(value, 0, to);
      __ sth(value, 2, to);
      __ blr();
    }
    return start;
  }

  inline void assert_positive_int(Register count) {
#ifdef ASSERT
    __ srdi_(R0, count, 31);
    __ asm_assert_eq("missing zero extend");
#endif
  }

  // Generate overlap test for array copy stubs.
  //
  // Input:
  //   R3_ARG1 - from
  //   R4_ARG2 - to
  //   R5_ARG3 - element count
  //
  void array_overlap_test(address no_overlap_target, int log2_elem_size) {
    Register tmp1 = R6_ARG4;
    Register tmp2 = R7_ARG5;

    assert_positive_int(R5_ARG3);

    __ subf(tmp1, R3_ARG1, R4_ARG2);        // distance in bytes
    __ sldi(tmp2, R5_ARG3, log2_elem_size); // size in bytes
    __ cmpld(CR0, R3_ARG1, R4_ARG2);        // Use unsigned comparison!
    __ cmpld(CR1, tmp1, tmp2);
    __ crnand(CR0, Assembler::less, CR1, Assembler::less);
    // Overlaps if Src before dst and distance smaller than size.
    // Branch to forward copy routine otherwise (within range of 32kB).
    __ bc(Assembler::bcondCRbiIs1, Assembler::bi0(CR0, Assembler::less), no_overlap_target);

    // need to copy backwards
  }

  // This is the common error exit stub for UnsafeMemoryAccess.
  address generate_unsafecopy_common_error_exit() {
    address start_pc = __ pc();
    Register tmp1 = R6_ARG4;
    // The copy stub may have changed the DSCR; reset it to the default value.
    __ load_const_optimized(tmp1, VM_Version::_dscr_val);
    __ mtdscr(tmp1);
    __ li(R3_RET, 0); // return 0
    __ blr();
    return start_pc;
  }

  // The guideline in the implementations of generate_disjoint_xxx_copy
  // (xxx=byte,short,int,long,oop) is to copy as many elements as possible with
  // single instructions, but to avoid alignment interrupts (see subsequent
  // comment). Furthermore, we try to minimize misaligned access, even
  // though they cause no alignment interrupt.
  //
  // In Big-Endian mode, the PowerPC architecture requires implementations to
  // handle automatically misaligned integer halfword and word accesses,
  // word-aligned integer doubleword accesses, and word-aligned floating-point
  // accesses. Other accesses may or may not generate an Alignment interrupt
  // depending on the implementation.
  // Alignment interrupt handling may require on the order of hundreds of cycles,
  // so every effort should be made to avoid misaligned memory values.
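  // For reference, array_overlap_test() above falls through to the backward copy
  // only when the regions overlap in the copy direction. Illustrative C sketch
  // only (hypothetical name; the generated code uses unsigned compares on CR0/CR1):
  //
  //   bool needs_backward_copy(char* from, char* to, size_t count, int log2_elem_size) {
  //     size_t byte_count = count << log2_elem_size;
  //     // overlaps if src is before dst and the distance is smaller than the size
  //     return (from < to) && ((size_t)(to - from) < byte_count);
  //   }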
  //
  //
  // Generate stub for disjoint byte copy. If "aligned" is true, the
  // "from" and "to" addresses are assumed to be heapword aligned.
  //
  // Arguments for generated stub:
  //   from:  R3_ARG1
  //   to:    R4_ARG2
  //   count: R5_ARG3 treated as signed
  //
  address generate_disjoint_byte_copy(StubGenStubId stub_id) {
    bool aligned;
    switch (stub_id) {
    case jbyte_disjoint_arraycopy_id:
      aligned = false;
      break;
    case arrayof_jbyte_disjoint_arraycopy_id:
      aligned = true;
      break;
    default:
      ShouldNotReachHere();
    }

    StubCodeMark mark(this, stub_id);
    address start = __ function_entry();
    assert_positive_int(R5_ARG3);

    Register tmp1 = R6_ARG4;
    Register tmp2 = R7_ARG5;
    Register tmp3 = R8_ARG6;
    Register tmp4 = R9_ARG7;

    VectorSRegister tmp_vsr1 = VSR1;
    VectorSRegister tmp_vsr2 = VSR2;

    Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9, l_10;
    {
      // UnsafeMemoryAccess page error: continue at UnsafeMemoryAccess common_error_exit
      UnsafeMemoryAccessMark umam(this, !aligned, false);

      // Don't try anything fancy if arrays don't have many elements.
      __ li(tmp3, 0);
      __ cmpwi(CR0, R5_ARG3, 17);
      __ ble(CR0, l_6); // copy 4 at a time

      if (!aligned) {
        __ xorr(tmp1, R3_ARG1, R4_ARG2);
        __ andi_(tmp1, tmp1, 3);
        __ bne(CR0, l_6); // If arrays don't have the same alignment mod 4, do 4 element copy.

        // Copy elements if necessary to align to 4 bytes.
        __ neg(tmp1, R3_ARG1); // Compute distance to alignment boundary.
        __ andi_(tmp1, tmp1, 3);
        __ beq(CR0, l_2);

        __ subf(R5_ARG3, tmp1, R5_ARG3);
        __ bind(l_9);
        __ lbz(tmp2, 0, R3_ARG1);
        __ addic_(tmp1, tmp1, -1);
        __ stb(tmp2, 0, R4_ARG2);
        __ addi(R3_ARG1, R3_ARG1, 1);
        __ addi(R4_ARG2, R4_ARG2, 1);
        __ bne(CR0, l_9);

        __ bind(l_2);
      }

      // copy 8 elements at a time
      __ xorr(tmp2, R3_ARG1, R4_ARG2); // skip if src & dest have differing alignment mod 8
      __ andi_(tmp1, tmp2, 7);
      __ bne(CR0, l_7); // not same alignment -> to or from is aligned -> copy 8

      // copy a 2-element word if necessary to align to 8 bytes
      __ andi_(R0, R3_ARG1, 7);
      __ beq(CR0, l_7);

      __ lwzx(tmp2, R3_ARG1, tmp3);
      __ addi(R5_ARG3, R5_ARG3, -4);
      __ stwx(tmp2, R4_ARG2, tmp3);
      { // FasterArrayCopy
        __ addi(R3_ARG1, R3_ARG1, 4);
        __ addi(R4_ARG2, R4_ARG2, 4);
      }
      __ bind(l_7);

      { // FasterArrayCopy
        __ cmpwi(CR0, R5_ARG3, 31);
        __ ble(CR0, l_6); // copy 2 at a time if less than 32 elements remain

        __ srdi(tmp1, R5_ARG3, 5);
        __ andi_(R5_ARG3, R5_ARG3, 31);
        __ mtctr(tmp1);


        // Prefetch the data into the L2 cache.
        __ dcbt(R3_ARG1, 0);

        // If supported set DSCR pre-fetch to deepest.
        __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
        __ mtdscr(tmp2);

        __ li(tmp1, 16);

        // Backbranch target aligned to 32-byte. Not 16-byte align as
        // loop contains < 8 instructions that fit inside a single
        // i-cache sector.
        __ align(32);

        __ bind(l_10);
        // Use loop with VSX load/store instructions to
        // copy 32 elements a time.
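        // Illustrative sketch (not emitted code): each iteration moves 32 bytes as
        // two 16-byte VSX transfers, roughly
        //   memcpy(dst, src, 16); memcpy(dst + 16, src + 16, 16);
        //   src += 32; dst += 32;   // CTR counts the 32-byte chunks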
        __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load src
        __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst
        __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src + 16
        __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
        __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32
        __ addi(R4_ARG2, R4_ARG2, 32);       // Update dsc+=32
        __ bdnz(l_10);                       // Dec CTR and loop if not zero.

        // Restore DSCR pre-fetch value.
        __ load_const_optimized(tmp2, VM_Version::_dscr_val);
        __ mtdscr(tmp2);

      } // FasterArrayCopy

      __ bind(l_6);

      // copy 4 elements at a time
      __ cmpwi(CR0, R5_ARG3, 4);
      __ blt(CR0, l_1);
      __ srdi(tmp1, R5_ARG3, 2);
      __ mtctr(tmp1); // is > 0
      __ andi_(R5_ARG3, R5_ARG3, 3);

      { // FasterArrayCopy
        __ addi(R3_ARG1, R3_ARG1, -4);
        __ addi(R4_ARG2, R4_ARG2, -4);
        __ bind(l_3);
        __ lwzu(tmp2, 4, R3_ARG1);
        __ stwu(tmp2, 4, R4_ARG2);
        __ bdnz(l_3);
        __ addi(R3_ARG1, R3_ARG1, 4);
        __ addi(R4_ARG2, R4_ARG2, 4);
      }

      // do single element copy
      __ bind(l_1);
      __ cmpwi(CR0, R5_ARG3, 0);
      __ beq(CR0, l_4);

      { // FasterArrayCopy
        __ mtctr(R5_ARG3);
        __ addi(R3_ARG1, R3_ARG1, -1);
        __ addi(R4_ARG2, R4_ARG2, -1);

        __ bind(l_5);
        __ lbzu(tmp2, 1, R3_ARG1);
        __ stbu(tmp2, 1, R4_ARG2);
        __ bdnz(l_5);
      }
    }

    __ bind(l_4);
    __ li(R3_RET, 0); // return 0
    __ blr();

    return start;
  }

  // Generate stub for conjoint byte copy. If "aligned" is true, the
  // "from" and "to" addresses are assumed to be heapword aligned.
  //
  // Arguments for generated stub:
  //   from:  R3_ARG1
  //   to:    R4_ARG2
  //   count: R5_ARG3 treated as signed
  //
  address generate_conjoint_byte_copy(StubGenStubId stub_id) {
    bool aligned;
    switch (stub_id) {
    case jbyte_arraycopy_id:
      aligned = false;
      break;
    case arrayof_jbyte_arraycopy_id:
      aligned = true;
      break;
    default:
      ShouldNotReachHere();
    }

    StubCodeMark mark(this, stub_id);
    address start = __ function_entry();
    assert_positive_int(R5_ARG3);

    Register tmp1 = R6_ARG4;
    Register tmp2 = R7_ARG5;
    Register tmp3 = R8_ARG6;

    address nooverlap_target = aligned ?
      STUB_ENTRY(arrayof_jbyte_disjoint_arraycopy()) :
      STUB_ENTRY(jbyte_disjoint_arraycopy());

    array_overlap_test(nooverlap_target, 0);
    // Do reverse copy. We assume the case of actual overlap is rare enough
    // that we don't have to optimize it.
    Label l_1, l_2;
    {
      // UnsafeMemoryAccess page error: continue at UnsafeMemoryAccess common_error_exit
      UnsafeMemoryAccessMark umam(this, !aligned, false);
      __ b(l_2);
      __ bind(l_1);
      __ stbx(tmp1, R4_ARG2, R5_ARG3);
      __ bind(l_2);
      __ addic_(R5_ARG3, R5_ARG3, -1);
      __ lbzx(tmp1, R3_ARG1, R5_ARG3);
      __ bge(CR0, l_1);
    }
    __ li(R3_RET, 0); // return 0
    __ blr();

    return start;
  }

  // Generate stub for disjoint short copy. If "aligned" is true, the
  // "from" and "to" addresses are assumed to be heapword aligned.
  //
  // Arguments for generated stub:
  //   from:      R3_ARG1
  //   to:        R4_ARG2
  //   elm.count: R5_ARG3 treated as signed
  //
  // Strategy for aligned==true:
  //
  //  If length <= 9:
  //     1. copy 2 elements at a time (l_6)
  //     2. copy last element if original element count was odd (l_1)
  //
  //  If length > 9:
  //     1. copy 4 elements at a time until less than 4 elements are left (l_7)
  //     2. copy 2 elements at a time until less than 2 elements are left (l_6)
  //     3. copy last element if one was left in step 2. (l_1)
  //
  //
  // Strategy for aligned==false:
  //
  //  If length <= 9: same as aligned==true case, but NOTE: load/stores
  //  can be unaligned (see comment below)
  //
  //  If length > 9:
  //     1. continue with step 6. if the alignment of from and to mod 4
  //        is different.
  //     2. align from and to to 4 bytes by copying 1 element if necessary
  //     3. at l_2 from and to are 4 byte aligned; continue with
  //        5. if they cannot be aligned to 8 bytes because they have
  //        got different alignment mod 8.
  //     4. at this point we know that both, from and to, have the same
  //        alignment mod 8, now copy one element if necessary to get
  //        8 byte alignment of from and to.
  //     5. copy 4 elements at a time until less than 4 elements are
  //        left; depending on step 3. all load/stores are aligned or
  //        either all loads or all stores are unaligned.
  //     6. copy 2 elements at a time until less than 2 elements are
  //        left (l_6); arriving here from step 1., there is a chance
  //        that all accesses are unaligned.
  //     7. copy last element if one was left in step 6. (l_1)
  //
  //  There are unaligned data accesses using integer load/store
  //  instructions in this stub. POWER allows such accesses.
  //
  //  According to the manuals (PowerISA_V2.06_PUBLIC, Book II,
  //  Chapter 2: Effect of Operand Placement on Performance) unaligned
  //  integer load/stores have good performance. Only unaligned
  //  floating point load/stores can have poor performance.
  //
  //  TODO:
  //
  //  1. check if aligning the backbranch target of loops is beneficial
  //
  address generate_disjoint_short_copy(StubGenStubId stub_id) {
    bool aligned;
    switch (stub_id) {
    case jshort_disjoint_arraycopy_id:
      aligned = false;
      break;
    case arrayof_jshort_disjoint_arraycopy_id:
      aligned = true;
      break;
    default:
      ShouldNotReachHere();
    }

    StubCodeMark mark(this, stub_id);

    Register tmp1 = R6_ARG4;
    Register tmp2 = R7_ARG5;
    Register tmp3 = R8_ARG6;
    Register tmp4 = R9_ARG7;

    VectorSRegister tmp_vsr1 = VSR1;
    VectorSRegister tmp_vsr2 = VSR2;

    address start = __ function_entry();
    assert_positive_int(R5_ARG3);

    Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9;
    {
      // UnsafeMemoryAccess page error: continue at UnsafeMemoryAccess common_error_exit
      UnsafeMemoryAccessMark umam(this, !aligned, false);
      // don't try anything fancy if arrays don't have many elements
      __ li(tmp3, 0);
      __ cmpwi(CR0, R5_ARG3, 9);
      __ ble(CR0, l_6); // copy 2 at a time

      if (!aligned) {
        __ xorr(tmp1, R3_ARG1, R4_ARG2);
        __ andi_(tmp1, tmp1, 3);
        __ bne(CR0, l_6); // if arrays don't have the same alignment mod 4, do 2 element copy

        // At this point it is guaranteed that both, from and to have the same alignment mod 4.

        // Copy 1 element if necessary to align to 4 bytes.
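        // Illustrative sketch only (hypothetical C names, not emitted code) of this
        // peeling step and the 8-byte alignment step that follows:
        //
        //   if ((uintptr_t)from & 3) {                       // copy 1 element -> 4-byte aligned
        //     *(jshort*)to = *(jshort*)from; from += 2; to += 2; count -= 1;
        //   }
        //   if (((uintptr_t)from ^ (uintptr_t)to) & 7) goto copy_4_at_a_time; // l_7
        //   if ((uintptr_t)from & 7) {                       // copy 2 elements -> 8-byte aligned
        //     *(jint*)to = *(jint*)from; from += 4; to += 4; count -= 2;
        //   }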
        __ andi_(tmp1, R3_ARG1, 3);
        __ beq(CR0, l_2);

        __ lhz(tmp2, 0, R3_ARG1);
        __ addi(R3_ARG1, R3_ARG1, 2);
        __ sth(tmp2, 0, R4_ARG2);
        __ addi(R4_ARG2, R4_ARG2, 2);
        __ addi(R5_ARG3, R5_ARG3, -1);
        __ bind(l_2);

        // At this point the positions of both, from and to, are at least 4 byte aligned.

        // Copy 4 elements at a time.
        // Align to 8 bytes, but only if both, from and to, have same alignment mod 8.
        __ xorr(tmp2, R3_ARG1, R4_ARG2);
        __ andi_(tmp1, tmp2, 7);
        __ bne(CR0, l_7); // not same alignment mod 8 -> copy 4, either from or to will be unaligned

        // Copy a 2-element word if necessary to align to 8 bytes.
        __ andi_(R0, R3_ARG1, 7);
        __ beq(CR0, l_7);

        __ lwzx(tmp2, R3_ARG1, tmp3);
        __ addi(R5_ARG3, R5_ARG3, -2);
        __ stwx(tmp2, R4_ARG2, tmp3);
        { // FasterArrayCopy
          __ addi(R3_ARG1, R3_ARG1, 4);
          __ addi(R4_ARG2, R4_ARG2, 4);
        }
      }

      __ bind(l_7);

      // Copy 4 elements at a time; either the loads or the stores can
      // be unaligned if aligned == false.

      { // FasterArrayCopy
        __ cmpwi(CR0, R5_ARG3, 15);
        __ ble(CR0, l_6); // copy 2 at a time if less than 16 elements remain

        __ srdi(tmp1, R5_ARG3, 4);
        __ andi_(R5_ARG3, R5_ARG3, 15);
        __ mtctr(tmp1);


        // Processor supports VSX, so use it to mass copy.

        // Prefetch src data into L2 cache.
        __ dcbt(R3_ARG1, 0);

        // If supported set DSCR pre-fetch to deepest.
        __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
        __ mtdscr(tmp2);
        __ li(tmp1, 16);

        // Backbranch target aligned to 32-byte. It's not aligned 16-byte
        // as loop contains < 8 instructions that fit inside a single
        // i-cache sector.
        __ align(32);

        __ bind(l_9);
        // Use loop with VSX load/store instructions to
        // copy 16 elements a time.
        __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load from src.
        __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst.
        __ lxvd2x(tmp_vsr2, R3_ARG1, tmp1);  // Load from src + 16.
        __ stxvd2x(tmp_vsr2, R4_ARG2, tmp1); // Store to dst + 16.
        __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32.
        __ addi(R4_ARG2, R4_ARG2, 32);       // Update dsc+=32.
        __ bdnz(l_9);                        // Dec CTR and loop if not zero.

        // Restore DSCR pre-fetch value.
        __ load_const_optimized(tmp2, VM_Version::_dscr_val);
        __ mtdscr(tmp2);
      } // FasterArrayCopy
      __ bind(l_6);

      // copy 2 elements at a time
      { // FasterArrayCopy
        __ cmpwi(CR0, R5_ARG3, 2);
        __ blt(CR0, l_1);
        __ srdi(tmp1, R5_ARG3, 1);
        __ andi_(R5_ARG3, R5_ARG3, 1);

        __ addi(R3_ARG1, R3_ARG1, -4);
        __ addi(R4_ARG2, R4_ARG2, -4);
        __ mtctr(tmp1);

        __ bind(l_3);
        __ lwzu(tmp2, 4, R3_ARG1);
        __ stwu(tmp2, 4, R4_ARG2);
        __ bdnz(l_3);

        __ addi(R3_ARG1, R3_ARG1, 4);
        __ addi(R4_ARG2, R4_ARG2, 4);
      }

      // do single element copy
      __ bind(l_1);
      __ cmpwi(CR0, R5_ARG3, 0);
      __ beq(CR0, l_4);

      { // FasterArrayCopy
        __ mtctr(R5_ARG3);
        __ addi(R3_ARG1, R3_ARG1, -2);
        __ addi(R4_ARG2, R4_ARG2, -2);

        __ bind(l_5);
        __ lhzu(tmp2, 2, R3_ARG1);
        __ sthu(tmp2, 2, R4_ARG2);
        __ bdnz(l_5);
      }
    }

    __ bind(l_4);
    __ li(R3_RET, 0); // return 0
    __ blr();

    return start;
  }

  // Generate stub for conjoint short copy. If "aligned" is true, the
  // "from" and "to" addresses are assumed to be heapword aligned.
  //
  // Arguments for generated stub:
  //   from:  R3_ARG1
  //   to:    R4_ARG2
  //   count: R5_ARG3 treated as signed
  //
  address generate_conjoint_short_copy(StubGenStubId stub_id) {
    bool aligned;
    switch (stub_id) {
    case jshort_arraycopy_id:
      aligned = false;
      break;
    case arrayof_jshort_arraycopy_id:
      aligned = true;
      break;
    default:
      ShouldNotReachHere();
    }

    StubCodeMark mark(this, stub_id);
    address start = __ function_entry();
    assert_positive_int(R5_ARG3);

    Register tmp1 = R6_ARG4;
    Register tmp2 = R7_ARG5;
    Register tmp3 = R8_ARG6;

    address nooverlap_target = aligned ?
      STUB_ENTRY(arrayof_jshort_disjoint_arraycopy()) :
      STUB_ENTRY(jshort_disjoint_arraycopy());

    array_overlap_test(nooverlap_target, 1);

    Label l_1, l_2;
    {
      // UnsafeMemoryAccess page error: continue at UnsafeMemoryAccess common_error_exit
      UnsafeMemoryAccessMark umam(this, !aligned, false);
      __ sldi(tmp1, R5_ARG3, 1);
      __ b(l_2);
      __ bind(l_1);
      __ sthx(tmp2, R4_ARG2, tmp1);
      __ bind(l_2);
      __ addic_(tmp1, tmp1, -2);
      __ lhzx(tmp2, R3_ARG1, tmp1);
      __ bge(CR0, l_1);
    }
    __ li(R3_RET, 0); // return 0
    __ blr();

    return start;
  }

  // Generate core code for disjoint int copy (and oop copy on 32-bit). If "aligned"
  // is true, the "from" and "to" addresses are assumed to be heapword aligned.
  //
  // Arguments:
  //   from:  R3_ARG1
  //   to:    R4_ARG2
  //   count: R5_ARG3 treated as signed
  //
  void generate_disjoint_int_copy_core(bool aligned) {
    Register tmp1 = R6_ARG4;
    Register tmp2 = R7_ARG5;
    Register tmp3 = R8_ARG6;
    Register tmp4 = R0;

    VectorSRegister tmp_vsr1 = VSR1;
    VectorSRegister tmp_vsr2 = VSR2;

    Label l_1, l_2, l_3, l_4, l_5, l_6, l_7;

    // for short arrays, just do single element copy
    __ li(tmp3, 0);
    __ cmpwi(CR0, R5_ARG3, 5);
    __ ble(CR0, l_2);

    if (!aligned) {
      // check if arrays have same alignment mod 8.
      __ xorr(tmp1, R3_ARG1, R4_ARG2);
      __ andi_(R0, tmp1, 7);
      // Not the same alignment, but ld and std just need to be 4 byte aligned.
      __ bne(CR0, l_4); // to OR from is 8 byte aligned -> copy 2 at a time

      // copy 1 element to align to and from on an 8 byte boundary
      __ andi_(R0, R3_ARG1, 7);
      __ beq(CR0, l_4);

      __ lwzx(tmp2, R3_ARG1, tmp3);
      __ addi(R5_ARG3, R5_ARG3, -1);
      __ stwx(tmp2, R4_ARG2, tmp3);
      { // FasterArrayCopy
        __ addi(R3_ARG1, R3_ARG1, 4);
        __ addi(R4_ARG2, R4_ARG2, 4);
      }
      __ bind(l_4);
    }

    { // FasterArrayCopy
      __ cmpwi(CR0, R5_ARG3, 7);
      __ ble(CR0, l_2); // copy 1 at a time if less than 8 elements remain

      __ srdi(tmp1, R5_ARG3, 3);
      __ andi_(R5_ARG3, R5_ARG3, 7);
      __ mtctr(tmp1);

      // Processor supports VSX, so use it to mass copy.

      // Prefetch the data into the L2 cache.
      __ dcbt(R3_ARG1, 0);

      // Set DSCR pre-fetch to deepest.
      __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
      __ mtdscr(tmp2);

      __ li(tmp1, 16);

      // Backbranch target aligned to 32-byte. Not 16-byte align as
      // loop contains < 8 instructions that fit inside a single
      // i-cache sector.
      __ align(32);

      __ bind(l_7);
      // Use loop with VSX load/store instructions to
      // copy 8 elements a time.
      __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load src
      __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst
      __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src + 16
      __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
      __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32
      __ addi(R4_ARG2, R4_ARG2, 32);       // Update dsc+=32
      __ bdnz(l_7);                        // Dec CTR and loop if not zero.

      // Restore DSCR pre-fetch value.
      __ load_const_optimized(tmp2, VM_Version::_dscr_val);
      __ mtdscr(tmp2);


    } // FasterArrayCopy

    // copy 1 element at a time
    __ bind(l_2);
    __ cmpwi(CR0, R5_ARG3, 0);
    __ beq(CR0, l_1);

    { // FasterArrayCopy
      __ mtctr(R5_ARG3);
      __ addi(R3_ARG1, R3_ARG1, -4);
      __ addi(R4_ARG2, R4_ARG2, -4);

      __ bind(l_3);
      __ lwzu(tmp2, 4, R3_ARG1);
      __ stwu(tmp2, 4, R4_ARG2);
      __ bdnz(l_3);
    }

    __ bind(l_1);
    return;
  }

  // Generate stub for disjoint int copy. If "aligned" is true, the
  // "from" and "to" addresses are assumed to be heapword aligned.
  //
  // Arguments for generated stub:
  //   from:  R3_ARG1
  //   to:    R4_ARG2
  //   count: R5_ARG3 treated as signed
  //
  address generate_disjoint_int_copy(StubGenStubId stub_id) {
    bool aligned;
    switch (stub_id) {
    case jint_disjoint_arraycopy_id:
      aligned = false;
      break;
    case arrayof_jint_disjoint_arraycopy_id:
      aligned = true;
      break;
    default:
      ShouldNotReachHere();
    }

    StubCodeMark mark(this, stub_id);
    address start = __ function_entry();
    assert_positive_int(R5_ARG3);
    {
      // UnsafeMemoryAccess page error: continue at UnsafeMemoryAccess common_error_exit
      UnsafeMemoryAccessMark umam(this, !aligned, false);
      generate_disjoint_int_copy_core(aligned);
    }
    __ li(R3_RET, 0); // return 0
    __ blr();
    return start;
  }

  // Generate core code for conjoint int copy (and oop copy on
  // 32-bit). If "aligned" is true, the "from" and "to" addresses
  // are assumed to be heapword aligned.
  //
  // Arguments:
  //   from:  R3_ARG1
  //   to:    R4_ARG2
  //   count: R5_ARG3 treated as signed
  //
  void generate_conjoint_int_copy_core(bool aligned) {
    // Do reverse copy. We assume the case of actual overlap is rare enough
    // that we don't have to optimize it.

    Label l_1, l_2, l_3, l_4, l_5, l_6, l_7;

    Register tmp1 = R6_ARG4;
    Register tmp2 = R7_ARG5;
    Register tmp3 = R8_ARG6;
    Register tmp4 = R0;

    VectorSRegister tmp_vsr1 = VSR1;
    VectorSRegister tmp_vsr2 = VSR2;

    { // FasterArrayCopy
      __ cmpwi(CR0, R5_ARG3, 0);
      __ beq(CR0, l_6);

      __ sldi(R5_ARG3, R5_ARG3, 2);
      __ add(R3_ARG1, R3_ARG1, R5_ARG3);
      __ add(R4_ARG2, R4_ARG2, R5_ARG3);
      __ srdi(R5_ARG3, R5_ARG3, 2);

      if (!aligned) {
        // check if arrays have same alignment mod 8.
        __ xorr(tmp1, R3_ARG1, R4_ARG2);
        __ andi_(R0, tmp1, 7);
        // Not the same alignment, but ld and std just need to be 4 byte aligned.
        __ bne(CR0, l_7); // to OR from is 8 byte aligned -> copy 2 at a time

        // copy 1 element to align to and from on an 8 byte boundary
        __ andi_(R0, R3_ARG1, 7);
        __ beq(CR0, l_7);

        __ addi(R3_ARG1, R3_ARG1, -4);
        __ addi(R4_ARG2, R4_ARG2, -4);
        __ addi(R5_ARG3, R5_ARG3, -1);
        __ lwzx(tmp2, R3_ARG1);
        __ stwx(tmp2, R4_ARG2);
        __ bind(l_7);
      }

      __ cmpwi(CR0, R5_ARG3, 7);
      __ ble(CR0, l_5); // copy 1 at a time if less than 8 elements remain

      __ srdi(tmp1, R5_ARG3, 3);
      __ andi(R5_ARG3, R5_ARG3, 7);
      __ mtctr(tmp1);

      // Processor supports VSX, so use it to mass copy.
      // Prefetch the data into the L2 cache.
      __ dcbt(R3_ARG1, 0);

      // Set DSCR pre-fetch to deepest.
      __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
      __ mtdscr(tmp2);

      __ li(tmp1, 16);

      // Backbranch target aligned to 32-byte. Not 16-byte align as
      // loop contains < 8 instructions that fit inside a single
      // i-cache sector.
      __ align(32);

      __ bind(l_4);
      // Use loop with VSX load/store instructions to
      // copy 8 elements a time.
      __ addi(R3_ARG1, R3_ARG1, -32);      // Update src-=32
      __ addi(R4_ARG2, R4_ARG2, -32);      // Update dsc-=32
      __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src+16
      __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load src
      __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst+16
      __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst
      __ bdnz(l_4);

      // Restore DSCR pre-fetch value.
      __ load_const_optimized(tmp2, VM_Version::_dscr_val);
      __ mtdscr(tmp2);

      __ cmpwi(CR0, R5_ARG3, 0);
      __ beq(CR0, l_6);

      __ bind(l_5);
      __ mtctr(R5_ARG3);
      __ bind(l_3);
      __ lwz(R0, -4, R3_ARG1);
      __ stw(R0, -4, R4_ARG2);
      __ addi(R3_ARG1, R3_ARG1, -4);
      __ addi(R4_ARG2, R4_ARG2, -4);
      __ bdnz(l_3);

      __ bind(l_6);
    }
  }

  // Generate stub for conjoint int copy. If "aligned" is true, the
  // "from" and "to" addresses are assumed to be heapword aligned.
  //
  // Arguments for generated stub:
  //   from:  R3_ARG1
  //   to:    R4_ARG2
  //   count: R5_ARG3 treated as signed
  //
  address generate_conjoint_int_copy(StubGenStubId stub_id) {
    bool aligned;
    switch (stub_id) {
    case jint_arraycopy_id:
      aligned = false;
      break;
    case arrayof_jint_arraycopy_id:
      aligned = true;
      break;
    default:
      ShouldNotReachHere();
    }

    StubCodeMark mark(this, stub_id);
    address start = __ function_entry();
    assert_positive_int(R5_ARG3);
    address nooverlap_target = aligned ?
      STUB_ENTRY(arrayof_jint_disjoint_arraycopy()) :
      STUB_ENTRY(jint_disjoint_arraycopy());

    array_overlap_test(nooverlap_target, 2);
    {
      // UnsafeMemoryAccess page error: continue at UnsafeMemoryAccess common_error_exit
      UnsafeMemoryAccessMark umam(this, !aligned, false);
      generate_conjoint_int_copy_core(aligned);
    }

    __ li(R3_RET, 0); // return 0
    __ blr();

    return start;
  }

  // Generate core code for disjoint long copy (and oop copy on
  // 64-bit). If "aligned" is true, the "from" and "to" addresses
  // are assumed to be heapword aligned.
1762 // 1763 // Arguments: 1764 // from: R3_ARG1 1765 // to: R4_ARG2 1766 // count: R5_ARG3 treated as signed 1767 // 1768 void generate_disjoint_long_copy_core(bool aligned) { 1769 Register tmp1 = R6_ARG4; 1770 Register tmp2 = R7_ARG5; 1771 Register tmp3 = R8_ARG6; 1772 Register tmp4 = R0; 1773 1774 Label l_1, l_2, l_3, l_4, l_5; 1775 1776 VectorSRegister tmp_vsr1 = VSR1; 1777 VectorSRegister tmp_vsr2 = VSR2; 1778 1779 { // FasterArrayCopy 1780 __ cmpwi(CR0, R5_ARG3, 3); 1781 __ ble(CR0, l_3); // copy 1 at a time if less than 4 elements remain 1782 1783 __ srdi(tmp1, R5_ARG3, 2); 1784 __ andi_(R5_ARG3, R5_ARG3, 3); 1785 __ mtctr(tmp1); 1786 1787 // Processor supports VSX, so use it to mass copy. 1788 1789 // Prefetch the data into the L2 cache. 1790 __ dcbt(R3_ARG1, 0); 1791 1792 // Set DSCR pre-fetch to deepest. 1793 __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7); 1794 __ mtdscr(tmp2); 1795 1796 __ li(tmp1, 16); 1797 1798 // Backbranch target aligned to 32-byte. Not 16-byte align as 1799 // loop contains < 8 instructions that fit inside a single 1800 // i-cache sector. 1801 __ align(32); 1802 1803 __ bind(l_5); 1804 // Use loop with VSX load/store instructions to 1805 // copy 4 elements a time. 1806 __ lxvd2x(tmp_vsr1, R3_ARG1); // Load src 1807 __ stxvd2x(tmp_vsr1, R4_ARG2); // Store to dst 1808 __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1); // Load src + 16 1809 __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16 1810 __ addi(R3_ARG1, R3_ARG1, 32); // Update src+=32 1811 __ addi(R4_ARG2, R4_ARG2, 32); // Update dsc+=32 1812 __ bdnz(l_5); // Dec CTR and loop if not zero. 1813 1814 // Restore DSCR pre-fetch value. 1815 __ load_const_optimized(tmp2, VM_Version::_dscr_val); 1816 __ mtdscr(tmp2); 1817 1818 } // FasterArrayCopy 1819 1820 // copy 1 element at a time 1821 __ bind(l_3); 1822 __ cmpwi(CR0, R5_ARG3, 0); 1823 __ beq(CR0, l_1); 1824 1825 { // FasterArrayCopy 1826 __ mtctr(R5_ARG3); 1827 __ addi(R3_ARG1, R3_ARG1, -8); 1828 __ addi(R4_ARG2, R4_ARG2, -8); 1829 1830 __ bind(l_2); 1831 __ ldu(R0, 8, R3_ARG1); 1832 __ stdu(R0, 8, R4_ARG2); 1833 __ bdnz(l_2); 1834 1835 } 1836 __ bind(l_1); 1837 } 1838 1839 // Generate stub for disjoint long copy. If "aligned" is true, the 1840 // "from" and "to" addresses are assumed to be heapword aligned. 1841 // 1842 // Arguments for generated stub: 1843 // from: R3_ARG1 1844 // to: R4_ARG2 1845 // count: R5_ARG3 treated as signed 1846 // 1847 address generate_disjoint_long_copy(StubGenStubId stub_id) { 1848 bool aligned; 1849 switch (stub_id) { 1850 case jlong_disjoint_arraycopy_id: 1851 aligned = false; 1852 break; 1853 case arrayof_jlong_disjoint_arraycopy_id: 1854 aligned = true; 1855 break; 1856 default: 1857 ShouldNotReachHere(); 1858 } 1859 1860 StubCodeMark mark(this, stub_id); 1861 address start = __ function_entry(); 1862 assert_positive_int(R5_ARG3); 1863 { 1864 // UnsafeMemoryAccess page error: continue at UnsafeMemoryAccess common_error_exit 1865 UnsafeMemoryAccessMark umam(this, !aligned, false); 1866 generate_disjoint_long_copy_core(aligned); 1867 } 1868 __ li(R3_RET, 0); // return 0 1869 __ blr(); 1870 1871 return start; 1872 } 1873 1874 // Generate core code for conjoint long copy (and oop copy on 1875 // 64-bit). If "aligned" is true, the "from" and "to" addresses 1876 // are assumed to be heapword aligned. 
1877 // 1878 // Arguments: 1879 // from: R3_ARG1 1880 // to: R4_ARG2 1881 // count: R5_ARG3 treated as signed 1882 // 1883 void generate_conjoint_long_copy_core(bool aligned) { 1884 Register tmp1 = R6_ARG4; 1885 Register tmp2 = R7_ARG5; 1886 Register tmp3 = R8_ARG6; 1887 Register tmp4 = R0; 1888 1889 VectorSRegister tmp_vsr1 = VSR1; 1890 VectorSRegister tmp_vsr2 = VSR2; 1891 1892 Label l_1, l_2, l_3, l_4, l_5; 1893 1894 __ cmpwi(CR0, R5_ARG3, 0); 1895 __ beq(CR0, l_1); 1896 1897 { // FasterArrayCopy 1898 __ sldi(R5_ARG3, R5_ARG3, 3); 1899 __ add(R3_ARG1, R3_ARG1, R5_ARG3); 1900 __ add(R4_ARG2, R4_ARG2, R5_ARG3); 1901 __ srdi(R5_ARG3, R5_ARG3, 3); 1902 1903 __ cmpwi(CR0, R5_ARG3, 3); 1904 __ ble(CR0, l_5); // copy 1 at a time if less than 4 elements remain 1905 1906 __ srdi(tmp1, R5_ARG3, 2); 1907 __ andi(R5_ARG3, R5_ARG3, 3); 1908 __ mtctr(tmp1); 1909 1910 // Processor supports VSX, so use it to mass copy. 1911 // Prefetch the data into the L2 cache. 1912 __ dcbt(R3_ARG1, 0); 1913 1914 // Set DSCR pre-fetch to deepest. 1915 __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7); 1916 __ mtdscr(tmp2); 1917 1918 __ li(tmp1, 16); 1919 1920 // Backbranch target aligned to 32-byte. Not 16-byte align as 1921 // loop contains < 8 instructions that fit inside a single 1922 // i-cache sector. 1923 __ align(32); 1924 1925 __ bind(l_4); 1926 // Use loop with VSX load/store instructions to 1927 // copy 4 elements a time. 1928 __ addi(R3_ARG1, R3_ARG1, -32); // Update src-=32 1929 __ addi(R4_ARG2, R4_ARG2, -32); // Update dsc-=32 1930 __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1); // Load src+16 1931 __ lxvd2x(tmp_vsr1, R3_ARG1); // Load src 1932 __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst+16 1933 __ stxvd2x(tmp_vsr1, R4_ARG2); // Store to dst 1934 __ bdnz(l_4); 1935 1936 // Restore DSCR pre-fetch value. 1937 __ load_const_optimized(tmp2, VM_Version::_dscr_val); 1938 __ mtdscr(tmp2); 1939 1940 __ cmpwi(CR0, R5_ARG3, 0); 1941 __ beq(CR0, l_1); 1942 1943 __ bind(l_5); 1944 __ mtctr(R5_ARG3); 1945 __ bind(l_3); 1946 __ ld(R0, -8, R3_ARG1); 1947 __ std(R0, -8, R4_ARG2); 1948 __ addi(R3_ARG1, R3_ARG1, -8); 1949 __ addi(R4_ARG2, R4_ARG2, -8); 1950 __ bdnz(l_3); 1951 1952 } 1953 __ bind(l_1); 1954 } 1955 1956 // Generate stub for conjoint long copy. If "aligned" is true, the 1957 // "from" and "to" addresses are assumed to be heapword aligned. 1958 // 1959 // Arguments for generated stub: 1960 // from: R3_ARG1 1961 // to: R4_ARG2 1962 // count: R5_ARG3 treated as signed 1963 // 1964 address generate_conjoint_long_copy(StubGenStubId stub_id) { 1965 bool aligned; 1966 switch (stub_id) { 1967 case jlong_arraycopy_id: 1968 aligned = false; 1969 break; 1970 case arrayof_jlong_arraycopy_id: 1971 aligned = true; 1972 break; 1973 default: 1974 ShouldNotReachHere(); 1975 } 1976 1977 StubCodeMark mark(this, stub_id); 1978 address start = __ function_entry(); 1979 assert_positive_int(R5_ARG3); 1980 address nooverlap_target = aligned ? 1981 STUB_ENTRY(arrayof_jlong_disjoint_arraycopy()) : 1982 STUB_ENTRY(jlong_disjoint_arraycopy()); 1983 1984 array_overlap_test(nooverlap_target, 3); 1985 { 1986 // UnsafeMemoryAccess page error: continue at UnsafeMemoryAccess common_error_exit 1987 UnsafeMemoryAccessMark umam(this, !aligned, false); 1988 generate_conjoint_long_copy_core(aligned); 1989 } 1990 __ li(R3_RET, 0); // return 0 1991 __ blr(); 1992 1993 return start; 1994 } 1995 1996 // Generate stub for conjoint oop copy. If "aligned" is true, the 1997 // "from" and "to" addresses are assumed to be heapword aligned. 
1998 // 1999 // Arguments for generated stub: 2000 // from: R3_ARG1 2001 // to: R4_ARG2 2002 // count: R5_ARG3 treated as signed 2003 // dest_uninitialized: G1 support 2004 // 2005 address generate_conjoint_oop_copy(StubGenStubId stub_id) { 2006 bool aligned; 2007 bool dest_uninitialized; 2008 switch (stub_id) { 2009 case oop_arraycopy_id: 2010 aligned = false; 2011 dest_uninitialized = false; 2012 break; 2013 case arrayof_oop_arraycopy_id: 2014 aligned = true; 2015 dest_uninitialized = false; 2016 break; 2017 case oop_arraycopy_uninit_id: 2018 aligned = false; 2019 dest_uninitialized = true; 2020 break; 2021 case arrayof_oop_arraycopy_uninit_id: 2022 aligned = true; 2023 dest_uninitialized = true; 2024 break; 2025 default: 2026 ShouldNotReachHere(); 2027 } 2028 2029 StubCodeMark mark(this, stub_id); 2030 address start = __ function_entry(); 2031 assert_positive_int(R5_ARG3); 2032 address nooverlap_target = aligned ? 2033 STUB_ENTRY(arrayof_oop_disjoint_arraycopy(dest_uninitialized)) : 2034 STUB_ENTRY(oop_disjoint_arraycopy(dest_uninitialized)); 2035 2036 array_overlap_test(nooverlap_target, UseCompressedOops ? 2 : 3); 2037 2038 DecoratorSet decorators = IN_HEAP | IS_ARRAY; 2039 if (dest_uninitialized) { 2040 decorators |= IS_DEST_UNINITIALIZED; 2041 } 2042 if (aligned) { 2043 decorators |= ARRAYCOPY_ALIGNED; 2044 } 2045 2046 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 2047 bs->arraycopy_prologue(_masm, decorators, T_OBJECT, R3_ARG1, R4_ARG2, R5_ARG3, noreg, noreg); 2048 2049 if (UseCompressedOops) { 2050 generate_conjoint_int_copy_core(aligned); 2051 } else { 2052 #if INCLUDE_ZGC 2053 if (UseZGC) { 2054 ZBarrierSetAssembler *zbs = (ZBarrierSetAssembler*)bs; 2055 zbs->generate_conjoint_oop_copy(_masm, dest_uninitialized); 2056 } else 2057 #endif 2058 generate_conjoint_long_copy_core(aligned); 2059 } 2060 2061 bs->arraycopy_epilogue(_masm, decorators, T_OBJECT, R4_ARG2, R5_ARG3, noreg); 2062 __ li(R3_RET, 0); // return 0 2063 __ blr(); 2064 return start; 2065 } 2066 2067 // Generate stub for disjoint oop copy. If "aligned" is true, the 2068 // "from" and "to" addresses are assumed to be heapword aligned. 
2069 // 2070 // Arguments for generated stub: 2071 // from: R3_ARG1 2072 // to: R4_ARG2 2073 // count: R5_ARG3 treated as signed 2074 // dest_uninitialized: G1 support 2075 // 2076 address generate_disjoint_oop_copy(StubGenStubId stub_id) { 2077 bool aligned; 2078 bool dest_uninitialized; 2079 switch (stub_id) { 2080 case oop_disjoint_arraycopy_id: 2081 aligned = false; 2082 dest_uninitialized = false; 2083 break; 2084 case arrayof_oop_disjoint_arraycopy_id: 2085 aligned = true; 2086 dest_uninitialized = false; 2087 break; 2088 case oop_disjoint_arraycopy_uninit_id: 2089 aligned = false; 2090 dest_uninitialized = true; 2091 break; 2092 case arrayof_oop_disjoint_arraycopy_uninit_id: 2093 aligned = true; 2094 dest_uninitialized = true; 2095 break; 2096 default: 2097 ShouldNotReachHere(); 2098 } 2099 2100 StubCodeMark mark(this, stub_id); 2101 address start = __ function_entry(); 2102 assert_positive_int(R5_ARG3); 2103 2104 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT; 2105 if (dest_uninitialized) { 2106 decorators |= IS_DEST_UNINITIALIZED; 2107 } 2108 if (aligned) { 2109 decorators |= ARRAYCOPY_ALIGNED; 2110 } 2111 2112 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 2113 bs->arraycopy_prologue(_masm, decorators, T_OBJECT, R3_ARG1, R4_ARG2, R5_ARG3, noreg, noreg); 2114 2115 if (UseCompressedOops) { 2116 generate_disjoint_int_copy_core(aligned); 2117 } else { 2118 #if INCLUDE_ZGC 2119 if (UseZGC) { 2120 ZBarrierSetAssembler *zbs = (ZBarrierSetAssembler*)bs; 2121 zbs->generate_disjoint_oop_copy(_masm, dest_uninitialized); 2122 } else 2123 #endif 2124 generate_disjoint_long_copy_core(aligned); 2125 } 2126 2127 bs->arraycopy_epilogue(_masm, decorators, T_OBJECT, R4_ARG2, R5_ARG3, noreg); 2128 __ li(R3_RET, 0); // return 0 2129 __ blr(); 2130 2131 return start; 2132 } 2133 2134 2135 // Helper for generating a dynamic type check. 2136 // Smashes only the given temp registers. 2137 void generate_type_check(Register sub_klass, 2138 Register super_check_offset, 2139 Register super_klass, 2140 Register temp1, 2141 Register temp2, 2142 Label& L_success) { 2143 assert_different_registers(sub_klass, super_check_offset, super_klass); 2144 2145 BLOCK_COMMENT("type_check:"); 2146 2147 Label L_miss; 2148 2149 __ check_klass_subtype_fast_path(sub_klass, super_klass, temp1, temp2, &L_success, &L_miss, nullptr, 2150 super_check_offset); 2151 __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success); 2152 2153 // Fall through on failure! 2154 __ bind(L_miss); 2155 } 2156 2157 2158 // Generate stub for checked oop copy. 
2159 // 2160 // Arguments for generated stub: 2161 // from: R3 2162 // to: R4 2163 // count: R5 treated as signed 2164 // ckoff: R6 (super_check_offset) 2165 // ckval: R7 (super_klass) 2166 // ret: R3 zero for success; (-1^K) where K is partial transfer count 2167 // 2168 address generate_checkcast_copy(StubGenStubId stub_id) { 2169 const Register R3_from = R3_ARG1; // source array address 2170 const Register R4_to = R4_ARG2; // destination array address 2171 const Register R5_count = R5_ARG3; // elements count 2172 const Register R6_ckoff = R6_ARG4; // super_check_offset 2173 const Register R7_ckval = R7_ARG5; // super_klass 2174 2175 const Register R8_offset = R8_ARG6; // loop var, with stride wordSize 2176 const Register R9_remain = R9_ARG7; // loop var, with stride -1 2177 const Register R10_oop = R10_ARG8; // actual oop copied 2178 const Register R11_klass = R11_scratch1; // oop._klass 2179 const Register R12_tmp = R12_scratch2; 2180 const Register R2_tmp = R2; 2181 2182 bool dest_uninitialized; 2183 switch (stub_id) { 2184 case checkcast_arraycopy_id: 2185 dest_uninitialized = false; 2186 break; 2187 case checkcast_arraycopy_uninit_id: 2188 dest_uninitialized = true; 2189 break; 2190 default: 2191 ShouldNotReachHere(); 2192 } 2193 //__ align(CodeEntryAlignment); 2194 StubCodeMark mark(this, stub_id); 2195 address start = __ function_entry(); 2196 2197 // Assert that int is 64 bit sign extended and arrays are not conjoint. 2198 #ifdef ASSERT 2199 { 2200 assert_positive_int(R5_ARG3); 2201 const Register tmp1 = R11_scratch1, tmp2 = R12_scratch2; 2202 Label no_overlap; 2203 __ subf(tmp1, R3_ARG1, R4_ARG2); // distance in bytes 2204 __ sldi(tmp2, R5_ARG3, LogBytesPerHeapOop); // size in bytes 2205 __ cmpld(CR0, R3_ARG1, R4_ARG2); // Use unsigned comparison! 2206 __ cmpld(CR1, tmp1, tmp2); 2207 __ crnand(CR0, Assembler::less, CR1, Assembler::less); 2208 // Overlaps if Src before dst and distance smaller than size. 2209 // Branch to forward copy routine otherwise. 2210 __ blt(CR0, no_overlap); 2211 __ stop("overlap in checkcast_copy"); 2212 __ bind(no_overlap); 2213 } 2214 #endif 2215 2216 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST; 2217 if (dest_uninitialized) { 2218 decorators |= IS_DEST_UNINITIALIZED; 2219 } 2220 2221 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 2222 bs->arraycopy_prologue(_masm, decorators, T_OBJECT, R3_from, R4_to, R5_count, /* preserve: */ R6_ckoff, R7_ckval); 2223 2224 //inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr, R12_tmp, R3_RET); 2225 2226 Label load_element, store_element, store_null, success, do_epilogue; 2227 __ or_(R9_remain, R5_count, R5_count); // Initialize loop index, and test it. 2228 __ li(R8_offset, 0); // Offset from start of arrays. 2229 __ bne(CR0, load_element); 2230 2231 // Empty array: Nothing to do. 2232 __ li(R3_RET, 0); // Return 0 on (trivial) success. 2233 __ blr(); 2234 2235 // ======== begin loop ======== 2236 // (Entry is load_element.) 2237 __ align(OptoLoopAlignment); 2238 __ bind(store_element); 2239 if (UseCompressedOops) { 2240 __ encode_heap_oop_not_null(R10_oop); 2241 __ bind(store_null); 2242 __ stw(R10_oop, R8_offset, R4_to); 2243 } else { 2244 __ bind(store_null); 2245 #if INCLUDE_ZGC 2246 if (UseZGC) { 2247 __ store_heap_oop(R10_oop, R8_offset, R4_to, R11_scratch1, R12_tmp, noreg, 2248 MacroAssembler::PRESERVATION_FRAME_LR_GP_REGS, 2249 dest_uninitialized ? 
IS_DEST_UNINITIALIZED : 0); 2250 } else 2251 #endif 2252 __ std(R10_oop, R8_offset, R4_to); 2253 } 2254 2255 __ addi(R8_offset, R8_offset, heapOopSize); // Step to next offset. 2256 __ addic_(R9_remain, R9_remain, -1); // Decrement the count. 2257 __ beq(CR0, success); 2258 2259 // ======== loop entry is here ======== 2260 __ bind(load_element); 2261 #if INCLUDE_ZGC 2262 if (UseZGC) { 2263 __ load_heap_oop(R10_oop, R8_offset, R3_from, 2264 R11_scratch1, R12_tmp, 2265 MacroAssembler::PRESERVATION_FRAME_LR_GP_REGS, 2266 0, &store_null); 2267 } else 2268 #endif 2269 __ load_heap_oop(R10_oop, R8_offset, R3_from, 2270 R11_scratch1, R12_tmp, 2271 MacroAssembler::PRESERVATION_FRAME_LR_GP_REGS, 2272 AS_RAW, &store_null); 2273 2274 __ load_klass(R11_klass, R10_oop); // Query the object klass. 2275 2276 generate_type_check(R11_klass, R6_ckoff, R7_ckval, R12_tmp, R2_tmp, 2277 // Branch to this on success: 2278 store_element); 2279 // ======== end loop ======== 2280 2281 // It was a real error; we must depend on the caller to finish the job. 2282 // Register R9_remain has number of *remaining* oops, R5_count number of *total* oops. 2283 // Emit GC store barriers for the oops we have copied (R5_count minus R9_remain), 2284 // and report their number to the caller. 2285 __ subf_(R5_count, R9_remain, R5_count); 2286 __ nand(R3_RET, R5_count, R5_count); // report (-1^K) to caller 2287 __ bne(CR0, do_epilogue); 2288 __ blr(); 2289 2290 __ bind(success); 2291 __ li(R3_RET, 0); 2292 2293 __ bind(do_epilogue); 2294 bs->arraycopy_epilogue(_masm, decorators, T_OBJECT, R4_to, R5_count, /* preserve */ R3_RET); 2295 2296 __ blr(); 2297 return start; 2298 } 2299 2300 2301 // Generate 'unsafe' array copy stub. 2302 // Though just as safe as the other stubs, it takes an unscaled 2303 // size_t argument instead of an element count. 2304 // 2305 // Arguments for generated stub: 2306 // from: R3 2307 // to: R4 2308 // count: R5 byte count, treated as ssize_t, can be zero 2309 // 2310 // Examines the alignment of the operands and dispatches 2311 // to a long, int, short, or byte copy loop. 
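  //
  // As a rough C sketch (illustrative only; long_copy etc. stand for the
  // *_copy_entry routines passed in as arguments), the generated dispatch is:
  //
  //   size_t bits = (size_t)from | (size_t)to | (size_t)count;
  //   if ((bits & (BytesPerLong  - 1)) == 0) return long_copy (from, to, count / BytesPerLong);
  //   if ((bits & (BytesPerInt   - 1)) == 0) return int_copy  (from, to, count / BytesPerInt);
  //   if ((bits & (BytesPerShort - 1)) == 0) return short_copy(from, to, count / BytesPerShort);
  //   return byte_copy(from, to, count);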
2312 // 2313 address generate_unsafe_copy(address byte_copy_entry, 2314 address short_copy_entry, 2315 address int_copy_entry, 2316 address long_copy_entry) { 2317 2318 const Register R3_from = R3_ARG1; // source array address 2319 const Register R4_to = R4_ARG2; // destination array address 2320 const Register R5_count = R5_ARG3; // elements count (as long on PPC64) 2321 2322 const Register R6_bits = R6_ARG4; // test copy of low bits 2323 const Register R7_tmp = R7_ARG5; 2324 2325 //__ align(CodeEntryAlignment); 2326 StubGenStubId stub_id = StubGenStubId::unsafe_arraycopy_id; 2327 StubCodeMark mark(this, stub_id); 2328 address start = __ function_entry(); 2329 2330 // Bump this on entry, not on exit: 2331 //inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr, R6_bits, R7_tmp); 2332 2333 Label short_copy, int_copy, long_copy; 2334 2335 __ orr(R6_bits, R3_from, R4_to); 2336 __ orr(R6_bits, R6_bits, R5_count); 2337 __ andi_(R0, R6_bits, (BytesPerLong-1)); 2338 __ beq(CR0, long_copy); 2339 2340 __ andi_(R0, R6_bits, (BytesPerInt-1)); 2341 __ beq(CR0, int_copy); 2342 2343 __ andi_(R0, R6_bits, (BytesPerShort-1)); 2344 __ beq(CR0, short_copy); 2345 2346 // byte_copy: 2347 __ b(byte_copy_entry); 2348 2349 __ bind(short_copy); 2350 __ srwi(R5_count, R5_count, LogBytesPerShort); 2351 __ b(short_copy_entry); 2352 2353 __ bind(int_copy); 2354 __ srwi(R5_count, R5_count, LogBytesPerInt); 2355 __ b(int_copy_entry); 2356 2357 __ bind(long_copy); 2358 __ srwi(R5_count, R5_count, LogBytesPerLong); 2359 __ b(long_copy_entry); 2360 2361 return start; 2362 } 2363 2364 2365 // Perform range checks on the proposed arraycopy. 2366 // Kills the two temps, but nothing else. 2367 // Also, clean the sign bits of src_pos and dst_pos. 2368 void arraycopy_range_checks(Register src, // source array oop 2369 Register src_pos, // source position 2370 Register dst, // destination array oop 2371 Register dst_pos, // destination position 2372 Register length, // length of copy 2373 Register temp1, Register temp2, 2374 Label& L_failed) { 2375 BLOCK_COMMENT("arraycopy_range_checks:"); 2376 2377 const Register array_length = temp1; // scratch 2378 const Register end_pos = temp2; // scratch 2379 2380 // if (src_pos + length > arrayOop(src)->length() ) FAIL; 2381 __ lwa(array_length, arrayOopDesc::length_offset_in_bytes(), src); 2382 __ add(end_pos, src_pos, length); // src_pos + length 2383 __ cmpd(CR0, end_pos, array_length); 2384 __ bgt(CR0, L_failed); 2385 2386 // if (dst_pos + length > arrayOop(dst)->length() ) FAIL; 2387 __ lwa(array_length, arrayOopDesc::length_offset_in_bytes(), dst); 2388 __ add(end_pos, dst_pos, length); // src_pos + length 2389 __ cmpd(CR0, end_pos, array_length); 2390 __ bgt(CR0, L_failed); 2391 2392 BLOCK_COMMENT("arraycopy_range_checks done"); 2393 } 2394 2395 2396 // Helper for generate_unsafe_setmemory 2397 // 2398 // Atomically fill an array of memory using 1-, 2-, 4-, or 8-byte chunks and return. 
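  //
  // Illustrative C sketch of the emitted code (assumes dest and size are both
  // multiples of elem_size; store() stands for one atomic store of elem_size bytes):
  //
  //   if (elem_size > 1) byteVal |= byteVal << 8;   // replicate the byte value
  //   if (elem_size > 2) byteVal |= byteVal << 16;  // to the full element width
  //   if (elem_size > 4) byteVal |= byteVal << 32;
  //   for (size_t i = 0; i < size / (2 * elem_size); i++) {  // 2x unrolled loop
  //     store(dest, byteVal); store(dest + elem_size, byteVal);
  //     dest += 2 * elem_size;
  //   }
  //   if (size & elem_size) store(dest, byteVal);   // at most one tail element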
2399 static void do_setmemory_atomic_loop(int elem_size, Register dest, Register size, Register byteVal, 2400 MacroAssembler *_masm) { 2401 2402 Label L_Loop, L_Tail; // 2x unrolled loop 2403 2404 // Propagate byte to required width 2405 if (elem_size > 1) __ rldimi(byteVal, byteVal, 8, 64 - 2 * 8); 2406 if (elem_size > 2) __ rldimi(byteVal, byteVal, 16, 64 - 2 * 16); 2407 if (elem_size > 4) __ rldimi(byteVal, byteVal, 32, 64 - 2 * 32); 2408 2409 __ srwi_(R0, size, exact_log2(2 * elem_size)); // size is a 32 bit value 2410 __ beq(CR0, L_Tail); 2411 __ mtctr(R0); 2412 2413 __ align(32); // loop alignment 2414 __ bind(L_Loop); 2415 __ store_sized_value(byteVal, 0, dest, elem_size); 2416 __ store_sized_value(byteVal, elem_size, dest, elem_size); 2417 __ addi(dest, dest, 2 * elem_size); 2418 __ bdnz(L_Loop); 2419 2420 __ bind(L_Tail); 2421 __ andi_(R0, size, elem_size); 2422 __ bclr(Assembler::bcondCRbiIs1, Assembler::bi0(CR0, Assembler::equal), Assembler::bhintbhBCLRisReturn); 2423 __ store_sized_value(byteVal, 0, dest, elem_size); 2424 __ blr(); 2425 } 2426 2427 // 2428 // Generate 'unsafe' set memory stub 2429 // Though just as safe as the other stubs, it takes an unscaled 2430 // size_t (# bytes) argument instead of an element count. 2431 // 2432 // Input: 2433 // R3_ARG1 - destination array address 2434 // R4_ARG2 - byte count (size_t) 2435 // R5_ARG3 - byte value 2436 // 2437 address generate_unsafe_setmemory(address unsafe_byte_fill) { 2438 __ align(CodeEntryAlignment); 2439 StubCodeMark mark(this, StubGenStubId::unsafe_setmemory_id); 2440 address start = __ function_entry(); 2441 2442 // bump this on entry, not on exit: 2443 // inc_counter_np(SharedRuntime::_unsafe_set_memory_ctr); 2444 2445 { 2446 Label L_fill8Bytes, L_fill4Bytes, L_fillBytes; 2447 2448 const Register dest = R3_ARG1; 2449 const Register size = R4_ARG2; 2450 const Register byteVal = R5_ARG3; 2451 const Register rScratch1 = R6; 2452 2453 // fill_to_memory_atomic(unsigned char*, unsigned long, unsigned char) 2454 2455 // Check for pointer & size alignment 2456 __ orr(rScratch1, dest, size); 2457 2458 __ andi_(R0, rScratch1, 7); 2459 __ beq(CR0, L_fill8Bytes); 2460 2461 __ andi_(R0, rScratch1, 3); 2462 __ beq(CR0, L_fill4Bytes); 2463 2464 __ andi_(R0, rScratch1, 1); 2465 __ bne(CR0, L_fillBytes); 2466 2467 // Mark remaining code as such which performs Unsafe accesses. 
2468 UnsafeMemoryAccessMark umam(this, true, false); 2469 2470 // At this point, we know the lower bit of size is zero and a 2471 // multiple of 2 2472 do_setmemory_atomic_loop(2, dest, size, byteVal, _masm); 2473 2474 __ align(32); 2475 __ bind(L_fill8Bytes); 2476 // At this point, we know the lower 3 bits of size are zero and a 2477 // multiple of 8 2478 do_setmemory_atomic_loop(8, dest, size, byteVal, _masm); 2479 2480 __ align(32); 2481 __ bind(L_fill4Bytes); 2482 // At this point, we know the lower 2 bits of size are zero and a 2483 // multiple of 4 2484 do_setmemory_atomic_loop(4, dest, size, byteVal, _masm); 2485 2486 __ align(32); 2487 __ bind(L_fillBytes); 2488 do_setmemory_atomic_loop(1, dest, size, byteVal, _masm); 2489 } 2490 2491 return start; 2492 } 2493 2494 2495 // 2496 // Generate generic array copy stubs 2497 // 2498 // Input: 2499 // R3 - src oop 2500 // R4 - src_pos 2501 // R5 - dst oop 2502 // R6 - dst_pos 2503 // R7 - element count 2504 // 2505 // Output: 2506 // R3 == 0 - success 2507 // R3 == -1 - need to call System.arraycopy 2508 // 2509 address generate_generic_copy(address entry_jbyte_arraycopy, 2510 address entry_jshort_arraycopy, 2511 address entry_jint_arraycopy, 2512 address entry_oop_arraycopy, 2513 address entry_disjoint_oop_arraycopy, 2514 address entry_jlong_arraycopy, 2515 address entry_checkcast_arraycopy) { 2516 Label L_failed, L_objArray; 2517 2518 // Input registers 2519 const Register src = R3_ARG1; // source array oop 2520 const Register src_pos = R4_ARG2; // source position 2521 const Register dst = R5_ARG3; // destination array oop 2522 const Register dst_pos = R6_ARG4; // destination position 2523 const Register length = R7_ARG5; // elements count 2524 2525 // registers used as temp 2526 const Register src_klass = R8_ARG6; // source array klass 2527 const Register dst_klass = R9_ARG7; // destination array klass 2528 const Register lh = R10_ARG8; // layout handler 2529 const Register temp = R2; 2530 2531 //__ align(CodeEntryAlignment); 2532 StubGenStubId stub_id = StubGenStubId::generic_arraycopy_id; 2533 StubCodeMark mark(this, stub_id); 2534 address start = __ function_entry(); 2535 2536 // Bump this on entry, not on exit: 2537 //inc_counter_np(SharedRuntime::_generic_array_copy_ctr, lh, temp); 2538 2539 // In principle, the int arguments could be dirty. 2540 2541 //----------------------------------------------------------------------- 2542 // Assembler stubs will be used for this call to arraycopy 2543 // if the following conditions are met: 2544 // 2545 // (1) src and dst must not be null. 2546 // (2) src_pos must not be negative. 2547 // (3) dst_pos must not be negative. 2548 // (4) length must not be negative. 2549 // (5) src klass and dst klass should be the same and not null. 2550 // (6) src and dst should be arrays. 2551 // (7) src_pos + length must not exceed length of src. 2552 // (8) dst_pos + length must not exceed length of dst. 
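    //
    // The checks emitted below fold conditions (1) through (4) into condition
    // register bits rather than taking four separate branches; the net effect,
    // in C terms (illustrative only), is:
    //
    //   if (src == nullptr || dst == nullptr ||
    //       src_pos < 0 || dst_pos < 0 || length < 0) {
    //     return -1;   // L_failed
    //   }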
2553 BLOCK_COMMENT("arraycopy initial argument checks"); 2554 2555 __ cmpdi(CR1, src, 0); // if (src == nullptr) return -1; 2556 __ extsw_(src_pos, src_pos); // if (src_pos < 0) return -1; 2557 __ cmpdi(CR5, dst, 0); // if (dst == nullptr) return -1; 2558 __ cror(CR1, Assembler::equal, CR0, Assembler::less); 2559 __ extsw_(dst_pos, dst_pos); // if (src_pos < 0) return -1; 2560 __ cror(CR5, Assembler::equal, CR0, Assembler::less); 2561 __ extsw_(length, length); // if (length < 0) return -1; 2562 __ cror(CR1, Assembler::equal, CR5, Assembler::equal); 2563 __ cror(CR1, Assembler::equal, CR0, Assembler::less); 2564 __ beq(CR1, L_failed); 2565 2566 BLOCK_COMMENT("arraycopy argument klass checks"); 2567 __ load_klass(src_klass, src); 2568 __ load_klass(dst_klass, dst); 2569 2570 // Load layout helper 2571 // 2572 // |array_tag| | header_size | element_type | |log2_element_size| 2573 // 32 30 24 16 8 2 0 2574 // 2575 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2576 // 2577 2578 int lh_offset = in_bytes(Klass::layout_helper_offset()); 2579 2580 // Load 32-bits signed value. Use br() instruction with it to check icc. 2581 __ lwz(lh, lh_offset, src_klass); 2582 2583 // Handle objArrays completely differently... 2584 jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2585 __ load_const_optimized(temp, objArray_lh, R0); 2586 __ cmpw(CR0, lh, temp); 2587 __ beq(CR0, L_objArray); 2588 2589 __ cmpd(CR5, src_klass, dst_klass); // if (src->klass() != dst->klass()) return -1; 2590 __ cmpwi(CR6, lh, Klass::_lh_neutral_value); // if (!src->is_Array()) return -1; 2591 2592 __ crnand(CR5, Assembler::equal, CR6, Assembler::less); 2593 __ beq(CR5, L_failed); 2594 2595 // At this point, it is known to be a typeArray (array_tag 0x3). 2596 #ifdef ASSERT 2597 { Label L; 2598 jint lh_prim_tag_in_place = (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 2599 __ load_const_optimized(temp, lh_prim_tag_in_place, R0); 2600 __ cmpw(CR0, lh, temp); 2601 __ bge(CR0, L); 2602 __ stop("must be a primitive array"); 2603 __ bind(L); 2604 } 2605 #endif 2606 2607 arraycopy_range_checks(src, src_pos, dst, dst_pos, length, 2608 temp, dst_klass, L_failed); 2609 2610 // TypeArrayKlass 2611 // 2612 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2613 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2614 // 2615 2616 const Register offset = dst_klass; // array offset 2617 const Register elsize = src_klass; // log2 element size 2618 2619 __ rldicl(offset, lh, 64 - Klass::_lh_header_size_shift, 64 - exact_log2(Klass::_lh_header_size_mask + 1)); 2620 __ andi(elsize, lh, Klass::_lh_log2_element_size_mask); 2621 __ add(src, offset, src); // src array offset 2622 __ add(dst, offset, dst); // dst array offset 2623 2624 // Next registers should be set before the jump to corresponding stub. 2625 const Register from = R3_ARG1; // source array address 2626 const Register to = R4_ARG2; // destination array address 2627 const Register count = R5_ARG3; // elements count 2628 2629 // 'from', 'to', 'count' registers should be set in this order 2630 // since they are the same as 'src', 'src_pos', 'dst'. 
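    //
    // In C terms the rest of the TypeArrayKlass path is roughly (illustrative
    // only; the array header offset has already been added to src/dst above):
    //
    //   from  = src + (src_pos << log2_element_size);
    //   to    = dst + (dst_pos << log2_element_size);
    //   count = length;
    //   switch (log2_element_size) {
    //     case 0:  goto jbyte_arraycopy;
    //     case 1:  goto jshort_arraycopy;
    //     case 2:  goto jint_arraycopy;
    //     default: goto jlong_arraycopy;   // must be 3 (checked under ASSERT)
    //   }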
2631 2632 BLOCK_COMMENT("scale indexes to element size"); 2633 __ sld(src_pos, src_pos, elsize); 2634 __ sld(dst_pos, dst_pos, elsize); 2635 __ add(from, src_pos, src); // src_addr 2636 __ add(to, dst_pos, dst); // dst_addr 2637 __ mr(count, length); // length 2638 2639 BLOCK_COMMENT("choose copy loop based on element size"); 2640 // Using conditional branches with range 32kB. 2641 const int bo = Assembler::bcondCRbiIs1, bi = Assembler::bi0(CR0, Assembler::equal); 2642 __ cmpwi(CR0, elsize, 0); 2643 __ bc(bo, bi, entry_jbyte_arraycopy); 2644 __ cmpwi(CR0, elsize, LogBytesPerShort); 2645 __ bc(bo, bi, entry_jshort_arraycopy); 2646 __ cmpwi(CR0, elsize, LogBytesPerInt); 2647 __ bc(bo, bi, entry_jint_arraycopy); 2648 #ifdef ASSERT 2649 { Label L; 2650 __ cmpwi(CR0, elsize, LogBytesPerLong); 2651 __ beq(CR0, L); 2652 __ stop("must be long copy, but elsize is wrong"); 2653 __ bind(L); 2654 } 2655 #endif 2656 __ b(entry_jlong_arraycopy); 2657 2658 // ObjArrayKlass 2659 __ bind(L_objArray); 2660 // live at this point: src_klass, dst_klass, src[_pos], dst[_pos], length 2661 2662 Label L_disjoint_plain_copy, L_checkcast_copy; 2663 // test array classes for subtyping 2664 __ cmpd(CR0, src_klass, dst_klass); // usual case is exact equality 2665 __ bne(CR0, L_checkcast_copy); 2666 2667 // Identically typed arrays can be copied without element-wise checks. 2668 arraycopy_range_checks(src, src_pos, dst, dst_pos, length, 2669 temp, lh, L_failed); 2670 2671 __ addi(src, src, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //src offset 2672 __ addi(dst, dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //dst offset 2673 __ sldi(src_pos, src_pos, LogBytesPerHeapOop); 2674 __ sldi(dst_pos, dst_pos, LogBytesPerHeapOop); 2675 __ add(from, src_pos, src); // src_addr 2676 __ add(to, dst_pos, dst); // dst_addr 2677 __ mr(count, length); // length 2678 __ b(entry_oop_arraycopy); 2679 2680 __ bind(L_checkcast_copy); 2681 // live at this point: src_klass, dst_klass 2682 { 2683 // Before looking at dst.length, make sure dst is also an objArray. 2684 __ lwz(temp, lh_offset, dst_klass); 2685 __ cmpw(CR0, lh, temp); 2686 __ bne(CR0, L_failed); 2687 2688 // It is safe to examine both src.length and dst.length. 2689 arraycopy_range_checks(src, src_pos, dst, dst_pos, length, 2690 temp, lh, L_failed); 2691 2692 // Marshal the base address arguments now, freeing registers. 2693 __ addi(src, src, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //src offset 2694 __ addi(dst, dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //dst offset 2695 __ sldi(src_pos, src_pos, LogBytesPerHeapOop); 2696 __ sldi(dst_pos, dst_pos, LogBytesPerHeapOop); 2697 __ add(from, src_pos, src); // src_addr 2698 __ add(to, dst_pos, dst); // dst_addr 2699 __ mr(count, length); // length 2700 2701 Register sco_temp = R6_ARG4; // This register is free now. 2702 assert_different_registers(from, to, count, sco_temp, 2703 dst_klass, src_klass); 2704 2705 // Generate the type check. 2706 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2707 __ lwz(sco_temp, sco_offset, dst_klass); 2708 generate_type_check(src_klass, sco_temp, dst_klass, 2709 temp, /* temp */ R10_ARG8, L_disjoint_plain_copy); 2710 2711 // Fetch destination element klass from the ObjArrayKlass header. 
2712 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 2713 2714 // The checkcast_copy loop needs two extra arguments: 2715 __ ld(R7_ARG5, ek_offset, dst_klass); // dest elem klass 2716 __ lwz(R6_ARG4, sco_offset, R7_ARG5); // sco of elem klass 2717 __ b(entry_checkcast_arraycopy); 2718 } 2719 2720 __ bind(L_disjoint_plain_copy); 2721 __ b(entry_disjoint_oop_arraycopy); 2722 2723 __ bind(L_failed); 2724 __ li(R3_RET, -1); // return -1 2725 __ blr(); 2726 return start; 2727 } 2728 2729 // Arguments for generated stub: 2730 // R3_ARG1 - source byte array address 2731 // R4_ARG2 - destination byte array address 2732 // R5_ARG3 - round key array 2733 address generate_aescrypt_encryptBlock() { 2734 assert(UseAES, "need AES instructions and misaligned SSE support"); 2735 StubGenStubId stub_id = StubGenStubId::aescrypt_encryptBlock_id; 2736 StubCodeMark mark(this, stub_id); 2737 2738 address start = __ function_entry(); 2739 2740 Label L_doLast, L_error; 2741 2742 Register from = R3_ARG1; // source array address 2743 Register to = R4_ARG2; // destination array address 2744 Register key = R5_ARG3; // round key array 2745 2746 Register keylen = R8; 2747 Register temp = R9; 2748 Register keypos = R10; 2749 Register fifteen = R12; 2750 2751 VectorRegister vRet = VR0; 2752 2753 VectorRegister vKey1 = VR1; 2754 VectorRegister vKey2 = VR2; 2755 VectorRegister vKey3 = VR3; 2756 VectorRegister vKey4 = VR4; 2757 2758 VectorRegister fromPerm = VR5; 2759 VectorRegister keyPerm = VR6; 2760 VectorRegister toPerm = VR7; 2761 VectorRegister fSplt = VR8; 2762 2763 VectorRegister vTmp1 = VR9; 2764 VectorRegister vTmp2 = VR10; 2765 VectorRegister vTmp3 = VR11; 2766 VectorRegister vTmp4 = VR12; 2767 2768 __ li (fifteen, 15); 2769 2770 // load unaligned from[0-15] to vRet 2771 __ lvx (vRet, from); 2772 __ lvx (vTmp1, fifteen, from); 2773 __ lvsl (fromPerm, from); 2774 #ifdef VM_LITTLE_ENDIAN 2775 __ vspltisb (fSplt, 0x0f); 2776 __ vxor (fromPerm, fromPerm, fSplt); 2777 #endif 2778 __ vperm (vRet, vRet, vTmp1, fromPerm); 2779 2780 // load keylen (44 or 52 or 60) 2781 __ lwz (keylen, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT), key); 2782 2783 // to load keys 2784 __ load_perm (keyPerm, key); 2785 #ifdef VM_LITTLE_ENDIAN 2786 __ vspltisb (vTmp2, -16); 2787 __ vrld (keyPerm, keyPerm, vTmp2); 2788 __ vrld (keyPerm, keyPerm, vTmp2); 2789 __ vsldoi (keyPerm, keyPerm, keyPerm, 8); 2790 #endif 2791 2792 // load the 1st round key to vTmp1 2793 __ lvx (vTmp1, key); 2794 __ li (keypos, 16); 2795 __ lvx (vKey1, keypos, key); 2796 __ vec_perm (vTmp1, vKey1, keyPerm); 2797 2798 // 1st round 2799 __ vxor (vRet, vRet, vTmp1); 2800 2801 // load the 2nd round key to vKey1 2802 __ li (keypos, 32); 2803 __ lvx (vKey2, keypos, key); 2804 __ vec_perm (vKey1, vKey2, keyPerm); 2805 2806 // load the 3rd round key to vKey2 2807 __ li (keypos, 48); 2808 __ lvx (vKey3, keypos, key); 2809 __ vec_perm (vKey2, vKey3, keyPerm); 2810 2811 // load the 4th round key to vKey3 2812 __ li (keypos, 64); 2813 __ lvx (vKey4, keypos, key); 2814 __ vec_perm (vKey3, vKey4, keyPerm); 2815 2816 // load the 5th round key to vKey4 2817 __ li (keypos, 80); 2818 __ lvx (vTmp1, keypos, key); 2819 __ vec_perm (vKey4, vTmp1, keyPerm); 2820 2821 // 2nd - 5th rounds 2822 __ vcipher (vRet, vRet, vKey1); 2823 __ vcipher (vRet, vRet, vKey2); 2824 __ vcipher (vRet, vRet, vKey3); 2825 __ vcipher (vRet, vRet, vKey4); 2826 2827 // load the 6th round key to vKey1 2828 __ li (keypos, 96); 2829 __ lvx (vKey2, keypos, key); 2830 
__ vec_perm (vKey1, vTmp1, vKey2, keyPerm); 2831 2832 // load the 7th round key to vKey2 2833 __ li (keypos, 112); 2834 __ lvx (vKey3, keypos, key); 2835 __ vec_perm (vKey2, vKey3, keyPerm); 2836 2837 // load the 8th round key to vKey3 2838 __ li (keypos, 128); 2839 __ lvx (vKey4, keypos, key); 2840 __ vec_perm (vKey3, vKey4, keyPerm); 2841 2842 // load the 9th round key to vKey4 2843 __ li (keypos, 144); 2844 __ lvx (vTmp1, keypos, key); 2845 __ vec_perm (vKey4, vTmp1, keyPerm); 2846 2847 // 6th - 9th rounds 2848 __ vcipher (vRet, vRet, vKey1); 2849 __ vcipher (vRet, vRet, vKey2); 2850 __ vcipher (vRet, vRet, vKey3); 2851 __ vcipher (vRet, vRet, vKey4); 2852 2853 // load the 10th round key to vKey1 2854 __ li (keypos, 160); 2855 __ lvx (vKey2, keypos, key); 2856 __ vec_perm (vKey1, vTmp1, vKey2, keyPerm); 2857 2858 // load the 11th round key to vKey2 2859 __ li (keypos, 176); 2860 __ lvx (vTmp1, keypos, key); 2861 __ vec_perm (vKey2, vTmp1, keyPerm); 2862 2863 // if all round keys are loaded, skip next 4 rounds 2864 __ cmpwi (CR0, keylen, 44); 2865 __ beq (CR0, L_doLast); 2866 2867 // 10th - 11th rounds 2868 __ vcipher (vRet, vRet, vKey1); 2869 __ vcipher (vRet, vRet, vKey2); 2870 2871 // load the 12th round key to vKey1 2872 __ li (keypos, 192); 2873 __ lvx (vKey2, keypos, key); 2874 __ vec_perm (vKey1, vTmp1, vKey2, keyPerm); 2875 2876 // load the 13th round key to vKey2 2877 __ li (keypos, 208); 2878 __ lvx (vTmp1, keypos, key); 2879 __ vec_perm (vKey2, vTmp1, keyPerm); 2880 2881 // if all round keys are loaded, skip next 2 rounds 2882 __ cmpwi (CR0, keylen, 52); 2883 __ beq (CR0, L_doLast); 2884 2885 #ifdef ASSERT 2886 __ cmpwi (CR0, keylen, 60); 2887 __ bne (CR0, L_error); 2888 #endif 2889 2890 // 12th - 13th rounds 2891 __ vcipher (vRet, vRet, vKey1); 2892 __ vcipher (vRet, vRet, vKey2); 2893 2894 // load the 14th round key to vKey1 2895 __ li (keypos, 224); 2896 __ lvx (vKey2, keypos, key); 2897 __ vec_perm (vKey1, vTmp1, vKey2, keyPerm); 2898 2899 // load the 15th round key to vKey2 2900 __ li (keypos, 240); 2901 __ lvx (vTmp1, keypos, key); 2902 __ vec_perm (vKey2, vTmp1, keyPerm); 2903 2904 __ bind(L_doLast); 2905 2906 // last two rounds 2907 __ vcipher (vRet, vRet, vKey1); 2908 __ vcipherlast (vRet, vRet, vKey2); 2909 2910 #ifdef VM_LITTLE_ENDIAN 2911 // toPerm = 0x0F0E0D0C0B0A09080706050403020100 2912 __ lvsl (toPerm, keypos); // keypos is a multiple of 16 2913 __ vxor (toPerm, toPerm, fSplt); 2914 2915 // Swap Bytes 2916 __ vperm (vRet, vRet, vRet, toPerm); 2917 #endif 2918 2919 // store result (unaligned) 2920 // Note: We can't use a read-modify-write sequence which touches additional Bytes. 
2921 Register lo = temp, hi = fifteen; // Reuse 2922 __ vsldoi (vTmp1, vRet, vRet, 8); 2923 __ mfvrd (hi, vRet); 2924 __ mfvrd (lo, vTmp1); 2925 __ std (hi, 0 LITTLE_ENDIAN_ONLY(+ 8), to); 2926 __ std (lo, 0 BIG_ENDIAN_ONLY(+ 8), to); 2927 2928 __ blr(); 2929 2930 #ifdef ASSERT 2931 __ bind(L_error); 2932 __ stop("aescrypt_encryptBlock: invalid key length"); 2933 #endif 2934 return start; 2935 } 2936 2937 // Arguments for generated stub: 2938 // R3_ARG1 - source byte array address 2939 // R4_ARG2 - destination byte array address 2940 // R5_ARG3 - K (key) in little endian int array 2941 address generate_aescrypt_decryptBlock() { 2942 assert(UseAES, "need AES instructions and misaligned SSE support"); 2943 StubGenStubId stub_id = StubGenStubId::aescrypt_decryptBlock_id; 2944 StubCodeMark mark(this, stub_id); 2945 2946 address start = __ function_entry(); 2947 2948 Label L_doLast, L_do44, L_do52, L_error; 2949 2950 Register from = R3_ARG1; // source array address 2951 Register to = R4_ARG2; // destination array address 2952 Register key = R5_ARG3; // round key array 2953 2954 Register keylen = R8; 2955 Register temp = R9; 2956 Register keypos = R10; 2957 Register fifteen = R12; 2958 2959 VectorRegister vRet = VR0; 2960 2961 VectorRegister vKey1 = VR1; 2962 VectorRegister vKey2 = VR2; 2963 VectorRegister vKey3 = VR3; 2964 VectorRegister vKey4 = VR4; 2965 VectorRegister vKey5 = VR5; 2966 2967 VectorRegister fromPerm = VR6; 2968 VectorRegister keyPerm = VR7; 2969 VectorRegister toPerm = VR8; 2970 VectorRegister fSplt = VR9; 2971 2972 VectorRegister vTmp1 = VR10; 2973 VectorRegister vTmp2 = VR11; 2974 VectorRegister vTmp3 = VR12; 2975 VectorRegister vTmp4 = VR13; 2976 2977 __ li (fifteen, 15); 2978 2979 // load unaligned from[0-15] to vRet 2980 __ lvx (vRet, from); 2981 __ lvx (vTmp1, fifteen, from); 2982 __ lvsl (fromPerm, from); 2983 #ifdef VM_LITTLE_ENDIAN 2984 __ vspltisb (fSplt, 0x0f); 2985 __ vxor (fromPerm, fromPerm, fSplt); 2986 #endif 2987 __ vperm (vRet, vRet, vTmp1, fromPerm); // align [and byte swap in LE] 2988 2989 // load keylen (44 or 52 or 60) 2990 __ lwz (keylen, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT), key); 2991 2992 // to load keys 2993 __ load_perm (keyPerm, key); 2994 #ifdef VM_LITTLE_ENDIAN 2995 __ vxor (vTmp2, vTmp2, vTmp2); 2996 __ vspltisb (vTmp2, -16); 2997 __ vrld (keyPerm, keyPerm, vTmp2); 2998 __ vrld (keyPerm, keyPerm, vTmp2); 2999 __ vsldoi (keyPerm, keyPerm, keyPerm, 8); 3000 #endif 3001 3002 __ cmpwi (CR0, keylen, 44); 3003 __ beq (CR0, L_do44); 3004 3005 __ cmpwi (CR0, keylen, 52); 3006 __ beq (CR0, L_do52); 3007 3008 #ifdef ASSERT 3009 __ cmpwi (CR0, keylen, 60); 3010 __ bne (CR0, L_error); 3011 #endif 3012 3013 // load the 15th round key to vKey1 3014 __ li (keypos, 240); 3015 __ lvx (vKey1, keypos, key); 3016 __ li (keypos, 224); 3017 __ lvx (vKey2, keypos, key); 3018 __ vec_perm (vKey1, vKey2, vKey1, keyPerm); 3019 3020 // load the 14th round key to vKey2 3021 __ li (keypos, 208); 3022 __ lvx (vKey3, keypos, key); 3023 __ vec_perm (vKey2, vKey3, vKey2, keyPerm); 3024 3025 // load the 13th round key to vKey3 3026 __ li (keypos, 192); 3027 __ lvx (vKey4, keypos, key); 3028 __ vec_perm (vKey3, vKey4, vKey3, keyPerm); 3029 3030 // load the 12th round key to vKey4 3031 __ li (keypos, 176); 3032 __ lvx (vKey5, keypos, key); 3033 __ vec_perm (vKey4, vKey5, vKey4, keyPerm); 3034 3035 // load the 11th round key to vKey5 3036 __ li (keypos, 160); 3037 __ lvx (vTmp1, keypos, key); 3038 __ vec_perm (vKey5, vTmp1, vKey5, 
keyPerm); 3039 3040 // 1st - 5th rounds 3041 __ vxor (vRet, vRet, vKey1); 3042 __ vncipher (vRet, vRet, vKey2); 3043 __ vncipher (vRet, vRet, vKey3); 3044 __ vncipher (vRet, vRet, vKey4); 3045 __ vncipher (vRet, vRet, vKey5); 3046 3047 __ b (L_doLast); 3048 3049 __ align(32); 3050 __ bind (L_do52); 3051 3052 // load the 13th round key to vKey1 3053 __ li (keypos, 208); 3054 __ lvx (vKey1, keypos, key); 3055 __ li (keypos, 192); 3056 __ lvx (vKey2, keypos, key); 3057 __ vec_perm (vKey1, vKey2, vKey1, keyPerm); 3058 3059 // load the 12th round key to vKey2 3060 __ li (keypos, 176); 3061 __ lvx (vKey3, keypos, key); 3062 __ vec_perm (vKey2, vKey3, vKey2, keyPerm); 3063 3064 // load the 11th round key to vKey3 3065 __ li (keypos, 160); 3066 __ lvx (vTmp1, keypos, key); 3067 __ vec_perm (vKey3, vTmp1, vKey3, keyPerm); 3068 3069 // 1st - 3rd rounds 3070 __ vxor (vRet, vRet, vKey1); 3071 __ vncipher (vRet, vRet, vKey2); 3072 __ vncipher (vRet, vRet, vKey3); 3073 3074 __ b (L_doLast); 3075 3076 __ align(32); 3077 __ bind (L_do44); 3078 3079 // load the 11th round key to vKey1 3080 __ li (keypos, 176); 3081 __ lvx (vKey1, keypos, key); 3082 __ li (keypos, 160); 3083 __ lvx (vTmp1, keypos, key); 3084 __ vec_perm (vKey1, vTmp1, vKey1, keyPerm); 3085 3086 // 1st round 3087 __ vxor (vRet, vRet, vKey1); 3088 3089 __ bind (L_doLast); 3090 3091 // load the 10th round key to vKey1 3092 __ li (keypos, 144); 3093 __ lvx (vKey2, keypos, key); 3094 __ vec_perm (vKey1, vKey2, vTmp1, keyPerm); 3095 3096 // load the 9th round key to vKey2 3097 __ li (keypos, 128); 3098 __ lvx (vKey3, keypos, key); 3099 __ vec_perm (vKey2, vKey3, vKey2, keyPerm); 3100 3101 // load the 8th round key to vKey3 3102 __ li (keypos, 112); 3103 __ lvx (vKey4, keypos, key); 3104 __ vec_perm (vKey3, vKey4, vKey3, keyPerm); 3105 3106 // load the 7th round key to vKey4 3107 __ li (keypos, 96); 3108 __ lvx (vKey5, keypos, key); 3109 __ vec_perm (vKey4, vKey5, vKey4, keyPerm); 3110 3111 // load the 6th round key to vKey5 3112 __ li (keypos, 80); 3113 __ lvx (vTmp1, keypos, key); 3114 __ vec_perm (vKey5, vTmp1, vKey5, keyPerm); 3115 3116 // last 10th - 6th rounds 3117 __ vncipher (vRet, vRet, vKey1); 3118 __ vncipher (vRet, vRet, vKey2); 3119 __ vncipher (vRet, vRet, vKey3); 3120 __ vncipher (vRet, vRet, vKey4); 3121 __ vncipher (vRet, vRet, vKey5); 3122 3123 // load the 5th round key to vKey1 3124 __ li (keypos, 64); 3125 __ lvx (vKey2, keypos, key); 3126 __ vec_perm (vKey1, vKey2, vTmp1, keyPerm); 3127 3128 // load the 4th round key to vKey2 3129 __ li (keypos, 48); 3130 __ lvx (vKey3, keypos, key); 3131 __ vec_perm (vKey2, vKey3, vKey2, keyPerm); 3132 3133 // load the 3rd round key to vKey3 3134 __ li (keypos, 32); 3135 __ lvx (vKey4, keypos, key); 3136 __ vec_perm (vKey3, vKey4, vKey3, keyPerm); 3137 3138 // load the 2nd round key to vKey4 3139 __ li (keypos, 16); 3140 __ lvx (vKey5, keypos, key); 3141 __ vec_perm (vKey4, vKey5, vKey4, keyPerm); 3142 3143 // load the 1st round key to vKey5 3144 __ lvx (vTmp1, key); 3145 __ vec_perm (vKey5, vTmp1, vKey5, keyPerm); 3146 3147 // last 5th - 1th rounds 3148 __ vncipher (vRet, vRet, vKey1); 3149 __ vncipher (vRet, vRet, vKey2); 3150 __ vncipher (vRet, vRet, vKey3); 3151 __ vncipher (vRet, vRet, vKey4); 3152 __ vncipherlast (vRet, vRet, vKey5); 3153 3154 #ifdef VM_LITTLE_ENDIAN 3155 // toPerm = 0x0F0E0D0C0B0A09080706050403020100 3156 __ lvsl (toPerm, keypos); // keypos is a multiple of 16 3157 __ vxor (toPerm, toPerm, fSplt); 3158 3159 // Swap Bytes 3160 __ vperm (vRet, vRet, vRet, toPerm); 3161 
#endif 3162 3163 // store result (unaligned) 3164 // Note: We can't use a read-modify-write sequence which touches additional Bytes. 3165 Register lo = temp, hi = fifteen; // Reuse 3166 __ vsldoi (vTmp1, vRet, vRet, 8); 3167 __ mfvrd (hi, vRet); 3168 __ mfvrd (lo, vTmp1); 3169 __ std (hi, 0 LITTLE_ENDIAN_ONLY(+ 8), to); 3170 __ std (lo, 0 BIG_ENDIAN_ONLY(+ 8), to); 3171 3172 __ blr(); 3173 3174 #ifdef ASSERT 3175 __ bind(L_error); 3176 __ stop("aescrypt_decryptBlock: invalid key length"); 3177 #endif 3178 return start; 3179 } 3180 3181 address generate_sha256_implCompress(StubGenStubId stub_id) { 3182 assert(UseSHA, "need SHA instructions"); 3183 bool multi_block; 3184 switch (stub_id) { 3185 case sha256_implCompress_id: 3186 multi_block = false; 3187 break; 3188 case sha256_implCompressMB_id: 3189 multi_block = true; 3190 break; 3191 default: 3192 ShouldNotReachHere(); 3193 } 3194 StubCodeMark mark(this, stub_id); 3195 address start = __ function_entry(); 3196 3197 __ sha256 (multi_block); 3198 __ blr(); 3199 3200 return start; 3201 } 3202 3203 address generate_sha512_implCompress(StubGenStubId stub_id) { 3204 assert(UseSHA, "need SHA instructions"); 3205 bool multi_block; 3206 switch (stub_id) { 3207 case sha512_implCompress_id: 3208 multi_block = false; 3209 break; 3210 case sha512_implCompressMB_id: 3211 multi_block = true; 3212 break; 3213 default: 3214 ShouldNotReachHere(); 3215 } 3216 StubCodeMark mark(this, stub_id); 3217 address start = __ function_entry(); 3218 3219 __ sha512 (multi_block); 3220 __ blr(); 3221 3222 return start; 3223 } 3224 3225 address generate_data_cache_writeback() { 3226 const Register cacheline = R3_ARG1; 3227 StubGenStubId stub_id = StubGenStubId::data_cache_writeback_id; 3228 StubCodeMark mark(this, stub_id); 3229 address start = __ pc(); 3230 3231 __ cache_wb(Address(cacheline)); 3232 __ blr(); 3233 3234 return start; 3235 } 3236 3237 address generate_data_cache_writeback_sync() { 3238 const Register is_presync = R3_ARG1; 3239 Register temp = R4; 3240 Label SKIP; 3241 StubGenStubId stub_id = StubGenStubId::data_cache_writeback_sync_id; 3242 StubCodeMark mark(this, stub_id); 3243 address start = __ pc(); 3244 3245 __ andi_(temp, is_presync, 1); 3246 __ bne(CR0, SKIP); 3247 __ cache_wbsync(false); // post sync => emit 'sync' 3248 __ bind(SKIP); // pre sync => emit nothing 3249 __ blr(); 3250 3251 return start; 3252 } 3253 3254 void generate_arraycopy_stubs() { 3255 // Note: the disjoint stubs must be generated first, some of 3256 // the conjoint stubs use them. 
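    // (Each conjoint stub branches to the matching disjoint stub as its
    // no-overlap fast path via array_overlap_test(), so those entries must
    // already be populated.)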
3257 3258 address ucm_common_error_exit = generate_unsafecopy_common_error_exit(); 3259 UnsafeMemoryAccess::set_common_exit_stub_pc(ucm_common_error_exit); 3260 3261 // non-aligned disjoint versions 3262 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(StubGenStubId::jbyte_disjoint_arraycopy_id); 3263 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(StubGenStubId::jshort_disjoint_arraycopy_id); 3264 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(StubGenStubId::jint_disjoint_arraycopy_id); 3265 StubRoutines::_jlong_disjoint_arraycopy = generate_disjoint_long_copy(StubGenStubId::jlong_disjoint_arraycopy_id); 3266 StubRoutines::_oop_disjoint_arraycopy = generate_disjoint_oop_copy(StubGenStubId::oop_disjoint_arraycopy_id); 3267 StubRoutines::_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy(StubGenStubId::oop_disjoint_arraycopy_uninit_id); 3268 3269 // aligned disjoint versions 3270 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(StubGenStubId::arrayof_jbyte_disjoint_arraycopy_id); 3271 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(StubGenStubId::arrayof_jshort_disjoint_arraycopy_id); 3272 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(StubGenStubId::arrayof_jint_disjoint_arraycopy_id); 3273 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(StubGenStubId::arrayof_jlong_disjoint_arraycopy_id); 3274 StubRoutines::_arrayof_oop_disjoint_arraycopy = generate_disjoint_oop_copy(StubGenStubId::arrayof_oop_disjoint_arraycopy_id); 3275 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy(StubGenStubId::oop_disjoint_arraycopy_uninit_id); 3276 3277 // non-aligned conjoint versions 3278 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(StubGenStubId::jbyte_arraycopy_id); 3279 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(StubGenStubId::jshort_arraycopy_id); 3280 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(StubGenStubId::jint_arraycopy_id); 3281 StubRoutines::_jlong_arraycopy = generate_conjoint_long_copy(StubGenStubId::jlong_arraycopy_id); 3282 StubRoutines::_oop_arraycopy = generate_conjoint_oop_copy(StubGenStubId::oop_arraycopy_id); 3283 StubRoutines::_oop_arraycopy_uninit = generate_conjoint_oop_copy(StubGenStubId::oop_arraycopy_uninit_id); 3284 3285 // aligned conjoint versions 3286 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(StubGenStubId::arrayof_jbyte_arraycopy_id); 3287 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(StubGenStubId::arrayof_jshort_arraycopy_id); 3288 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(StubGenStubId::arrayof_jint_arraycopy_id); 3289 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(StubGenStubId::arrayof_jlong_arraycopy_id); 3290 StubRoutines::_arrayof_oop_arraycopy = generate_conjoint_oop_copy(StubGenStubId::arrayof_oop_arraycopy_id); 3291 StubRoutines::_arrayof_oop_arraycopy_uninit = generate_conjoint_oop_copy(StubGenStubId::arrayof_oop_arraycopy_id); 3292 3293 // special/generic versions 3294 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy(StubGenStubId::checkcast_arraycopy_id); 3295 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubGenStubId::checkcast_arraycopy_uninit_id); 3296 3297 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy(STUB_ENTRY(jbyte_arraycopy()), 3298 
STUB_ENTRY(jshort_arraycopy()), 3299 STUB_ENTRY(jint_arraycopy()), 3300 STUB_ENTRY(jlong_arraycopy())); 3301 StubRoutines::_generic_arraycopy = generate_generic_copy(STUB_ENTRY(jbyte_arraycopy()), 3302 STUB_ENTRY(jshort_arraycopy()), 3303 STUB_ENTRY(jint_arraycopy()), 3304 STUB_ENTRY(oop_arraycopy()), 3305 STUB_ENTRY(oop_disjoint_arraycopy()), 3306 STUB_ENTRY(jlong_arraycopy()), 3307 STUB_ENTRY(checkcast_arraycopy())); 3308 3309 // fill routines 3310 #ifdef COMPILER2 3311 if (OptimizeFill) { 3312 StubRoutines::_jbyte_fill = generate_fill(StubGenStubId::jbyte_fill_id); 3313 StubRoutines::_jshort_fill = generate_fill(StubGenStubId::jshort_fill_id); 3314 StubRoutines::_jint_fill = generate_fill(StubGenStubId::jint_fill_id); 3315 StubRoutines::_arrayof_jbyte_fill = generate_fill(StubGenStubId::arrayof_jbyte_fill_id); 3316 StubRoutines::_arrayof_jshort_fill = generate_fill(StubGenStubId::arrayof_jshort_fill_id); 3317 StubRoutines::_arrayof_jint_fill = generate_fill(StubGenStubId::arrayof_jint_fill_id); 3318 } 3319 StubRoutines::_unsafe_setmemory = generate_unsafe_setmemory(StubRoutines::_jbyte_fill); 3320 #endif 3321 } 3322 3323 // Stub for BigInteger::multiplyToLen() 3324 // 3325 // Arguments: 3326 // 3327 // Input: 3328 // R3 - x address 3329 // R4 - x length 3330 // R5 - y address 3331 // R6 - y length 3332 // R7 - z address 3333 // 3334 address generate_multiplyToLen() { 3335 3336 StubGenStubId stub_id = StubGenStubId::multiplyToLen_id; 3337 StubCodeMark mark(this, stub_id); 3338 3339 address start = __ function_entry(); 3340 3341 const Register x = R3; 3342 const Register xlen = R4; 3343 const Register y = R5; 3344 const Register ylen = R6; 3345 const Register z = R7; 3346 3347 const Register tmp1 = R2; // TOC not used. 3348 const Register tmp2 = R9; 3349 const Register tmp3 = R10; 3350 const Register tmp4 = R11; 3351 const Register tmp5 = R12; 3352 3353 // non-volatile regs 3354 const Register tmp6 = R31; 3355 const Register tmp7 = R30; 3356 const Register tmp8 = R29; 3357 const Register tmp9 = R28; 3358 const Register tmp10 = R27; 3359 const Register tmp11 = R26; 3360 const Register tmp12 = R25; 3361 const Register tmp13 = R24; 3362 3363 BLOCK_COMMENT("Entry:"); 3364 3365 // C2 does not respect int to long conversion for stub calls. 3366 __ clrldi(xlen, xlen, 32); 3367 __ clrldi(ylen, ylen, 32); 3368 3369 // Save non-volatile regs (frameless). 3370 int current_offs = 8; 3371 __ std(R24, -current_offs, R1_SP); current_offs += 8; 3372 __ std(R25, -current_offs, R1_SP); current_offs += 8; 3373 __ std(R26, -current_offs, R1_SP); current_offs += 8; 3374 __ std(R27, -current_offs, R1_SP); current_offs += 8; 3375 __ std(R28, -current_offs, R1_SP); current_offs += 8; 3376 __ std(R29, -current_offs, R1_SP); current_offs += 8; 3377 __ std(R30, -current_offs, R1_SP); current_offs += 8; 3378 __ std(R31, -current_offs, R1_SP); 3379 3380 __ multiply_to_len(x, xlen, y, ylen, z, tmp1, tmp2, tmp3, tmp4, tmp5, 3381 tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13); 3382 3383 // Restore non-volatile regs. 3384 current_offs = 8; 3385 __ ld(R24, -current_offs, R1_SP); current_offs += 8; 3386 __ ld(R25, -current_offs, R1_SP); current_offs += 8; 3387 __ ld(R26, -current_offs, R1_SP); current_offs += 8; 3388 __ ld(R27, -current_offs, R1_SP); current_offs += 8; 3389 __ ld(R28, -current_offs, R1_SP); current_offs += 8; 3390 __ ld(R29, -current_offs, R1_SP); current_offs += 8; 3391 __ ld(R30, -current_offs, R1_SP); current_offs += 8; 3392 __ ld(R31, -current_offs, R1_SP); 3393 3394 __ blr(); // Return to caller. 
3395 3396 return start; 3397 } 3398 3399 /** 3400 * Arguments: 3401 * 3402 * Input: 3403 * R3_ARG1 - out address 3404 * R4_ARG2 - in address 3405 * R5_ARG3 - offset 3406 * R6_ARG4 - len 3407 * R7_ARG5 - k 3408 * Output: 3409 * R3_RET - carry 3410 */ 3411 address generate_mulAdd() { 3412 __ align(CodeEntryAlignment); 3413 StubGenStubId stub_id = StubGenStubId::mulAdd_id; 3414 StubCodeMark mark(this, stub_id); 3415 3416 address start = __ function_entry(); 3417 3418 // C2 does not sign extend signed parameters to full 64 bits registers: 3419 __ rldic (R5_ARG3, R5_ARG3, 2, 32); // always positive 3420 __ clrldi(R6_ARG4, R6_ARG4, 32); // force zero bits on higher word 3421 __ clrldi(R7_ARG5, R7_ARG5, 32); // force zero bits on higher word 3422 3423 __ muladd(R3_ARG1, R4_ARG2, R5_ARG3, R6_ARG4, R7_ARG5, R8, R9, R10); 3424 3425 // Moves output carry to return register 3426 __ mr (R3_RET, R10); 3427 3428 __ blr(); 3429 3430 return start; 3431 } 3432 3433 /** 3434 * Arguments: 3435 * 3436 * Input: 3437 * R3_ARG1 - in address 3438 * R4_ARG2 - in length 3439 * R5_ARG3 - out address 3440 * R6_ARG4 - out length 3441 */ 3442 address generate_squareToLen() { 3443 __ align(CodeEntryAlignment); 3444 StubGenStubId stub_id = StubGenStubId::squareToLen_id; 3445 StubCodeMark mark(this, stub_id); 3446 3447 address start = __ function_entry(); 3448 3449 // args - higher word is cleaned (unsignedly) due to int to long casting 3450 const Register in = R3_ARG1; 3451 const Register in_len = R4_ARG2; 3452 __ clrldi(in_len, in_len, 32); 3453 const Register out = R5_ARG3; 3454 const Register out_len = R6_ARG4; 3455 __ clrldi(out_len, out_len, 32); 3456 3457 // output 3458 const Register ret = R3_RET; 3459 3460 // temporaries 3461 const Register lplw_s = R7; 3462 const Register in_aux = R8; 3463 const Register out_aux = R9; 3464 const Register piece = R10; 3465 const Register product = R14; 3466 const Register lplw = R15; 3467 const Register i_minus1 = R16; 3468 const Register carry = R17; 3469 const Register offset = R18; 3470 const Register off_aux = R19; 3471 const Register t = R20; 3472 const Register mlen = R21; 3473 const Register len = R22; 3474 const Register a = R23; 3475 const Register b = R24; 3476 const Register i = R25; 3477 const Register c = R26; 3478 const Register cs = R27; 3479 3480 // Labels 3481 Label SKIP_LSHIFT, SKIP_DIAGONAL_SUM, SKIP_ADDONE, SKIP_LOOP_SQUARE; 3482 Label LOOP_LSHIFT, LOOP_DIAGONAL_SUM, LOOP_ADDONE, LOOP_SQUARE; 3483 3484 // Save non-volatile regs (frameless). 
3485 int current_offs = -8; 3486 __ std(R28, current_offs, R1_SP); current_offs -= 8; 3487 __ std(R27, current_offs, R1_SP); current_offs -= 8; 3488 __ std(R26, current_offs, R1_SP); current_offs -= 8; 3489 __ std(R25, current_offs, R1_SP); current_offs -= 8; 3490 __ std(R24, current_offs, R1_SP); current_offs -= 8; 3491 __ std(R23, current_offs, R1_SP); current_offs -= 8; 3492 __ std(R22, current_offs, R1_SP); current_offs -= 8; 3493 __ std(R21, current_offs, R1_SP); current_offs -= 8; 3494 __ std(R20, current_offs, R1_SP); current_offs -= 8; 3495 __ std(R19, current_offs, R1_SP); current_offs -= 8; 3496 __ std(R18, current_offs, R1_SP); current_offs -= 8; 3497 __ std(R17, current_offs, R1_SP); current_offs -= 8; 3498 __ std(R16, current_offs, R1_SP); current_offs -= 8; 3499 __ std(R15, current_offs, R1_SP); current_offs -= 8; 3500 __ std(R14, current_offs, R1_SP); 3501 3502 // Store the squares, right shifted one bit (i.e., divided by 2) 3503 __ subi (out_aux, out, 8); 3504 __ subi (in_aux, in, 4); 3505 __ cmpwi (CR0, in_len, 0); 3506 // Initialize lplw outside of the loop 3507 __ xorr (lplw, lplw, lplw); 3508 __ ble (CR0, SKIP_LOOP_SQUARE); // in_len <= 0 3509 __ mtctr (in_len); 3510 3511 __ bind(LOOP_SQUARE); 3512 __ lwzu (piece, 4, in_aux); 3513 __ mulld (product, piece, piece); 3514 // shift left 63 bits and only keep the MSB 3515 __ rldic (lplw_s, lplw, 63, 0); 3516 __ mr (lplw, product); 3517 // shift right 1 bit without sign extension 3518 __ srdi (product, product, 1); 3519 // join them to the same register and store it 3520 __ orr (product, lplw_s, product); 3521 #ifdef VM_LITTLE_ENDIAN 3522 // Swap low and high words for little endian 3523 __ rldicl (product, product, 32, 0); 3524 #endif 3525 __ stdu (product, 8, out_aux); 3526 __ bdnz (LOOP_SQUARE); 3527 3528 __ bind(SKIP_LOOP_SQUARE); 3529 3530 // Add in off-diagonal sums 3531 __ cmpwi (CR0, in_len, 0); 3532 __ ble (CR0, SKIP_DIAGONAL_SUM); 3533 // Avoid CTR usage here in order to use it at mulAdd 3534 __ subi (i_minus1, in_len, 1); 3535 __ li (offset, 4); 3536 3537 __ bind(LOOP_DIAGONAL_SUM); 3538 3539 __ sldi (off_aux, out_len, 2); 3540 __ sub (off_aux, off_aux, offset); 3541 3542 __ mr (len, i_minus1); 3543 __ sldi (mlen, i_minus1, 2); 3544 __ lwzx (t, in, mlen); 3545 3546 __ muladd (out, in, off_aux, len, t, a, b, carry); 3547 3548 // begin<addOne> 3549 // off_aux = out_len*4 - 4 - mlen - offset*4 - 4; 3550 __ addi (mlen, mlen, 4); 3551 __ sldi (a, out_len, 2); 3552 __ subi (a, a, 4); 3553 __ sub (a, a, mlen); 3554 __ subi (off_aux, offset, 4); 3555 __ sub (off_aux, a, off_aux); 3556 3557 __ lwzx (b, off_aux, out); 3558 __ add (b, b, carry); 3559 __ stwx (b, off_aux, out); 3560 3561 // if (((uint64_t)s >> 32) != 0) { 3562 __ srdi_ (a, b, 32); 3563 __ beq (CR0, SKIP_ADDONE); 3564 3565 // while (--mlen >= 0) { 3566 __ bind(LOOP_ADDONE); 3567 __ subi (mlen, mlen, 4); 3568 __ cmpwi (CR0, mlen, 0); 3569 __ beq (CR0, SKIP_ADDONE); 3570 3571 // if (--offset_aux < 0) { // Carry out of number 3572 __ subi (off_aux, off_aux, 4); 3573 __ cmpwi (CR0, off_aux, 0); 3574 __ blt (CR0, SKIP_ADDONE); 3575 3576 // } else { 3577 __ lwzx (b, off_aux, out); 3578 __ addi (b, b, 1); 3579 __ stwx (b, off_aux, out); 3580 __ cmpwi (CR0, b, 0); 3581 __ bne (CR0, SKIP_ADDONE); 3582 __ b (LOOP_ADDONE); 3583 3584 __ bind(SKIP_ADDONE); 3585 // } } } end<addOne> 3586 3587 __ addi (offset, offset, 8); 3588 __ subi (i_minus1, i_minus1, 1); 3589 __ cmpwi (CR0, i_minus1, 0); 3590 __ bge (CR0, LOOP_DIAGONAL_SUM); 3591 3592 __ bind(SKIP_DIAGONAL_SUM); 3593 
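    // Illustrative C sketch (roughly) of the carry propagation ("addOne") done by
    // LOOP_ADDONE above; 'w' and 'idx' are hypothetical names for the 32-bit out[]
    // words and the current word index, not registers used by the stub:
    //   uint64_t s = (uint64_t)w[idx] + carry;   // lwzx zero-extends, so the add cannot overflow
    //   w[idx] = (uint32_t)s;
    //   if ((s >> 32) != 0) {                    // carry out of the 32-bit word
    //     while (--mlen >= 0 && --idx >= 0) {
    //       if (++w[idx] != 0) break;            // stop once a word no longer wraps to 0
    //     }
    //   }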
3594 // Shift back up and set low bit 3595 // Shifts 1 bit left up to len positions. Assumes no leading zeros 3596 // begin<primitiveLeftShift> 3597 __ cmpwi (CR0, out_len, 0); 3598 __ ble (CR0, SKIP_LSHIFT); 3599 __ li (i, 0); 3600 __ lwz (c, 0, out); 3601 __ subi (b, out_len, 1); 3602 __ mtctr (b); 3603 3604 __ bind(LOOP_LSHIFT); 3605 __ mr (b, c); 3606 __ addi (cs, i, 4); 3607 __ lwzx (c, out, cs); 3608 3609 __ sldi (b, b, 1); 3610 __ srwi (cs, c, 31); 3611 __ orr (b, b, cs); 3612 __ stwx (b, i, out); 3613 3614 __ addi (i, i, 4); 3615 __ bdnz (LOOP_LSHIFT); 3616 3617 __ sldi (c, out_len, 2); 3618 __ subi (c, c, 4); 3619 __ lwzx (b, out, c); 3620 __ sldi (b, b, 1); 3621 __ stwx (b, out, c); 3622 3623 __ bind(SKIP_LSHIFT); 3624 // end<primitiveLeftShift> 3625 3626 // Set low bit 3627 __ sldi (i, in_len, 2); 3628 __ subi (i, i, 4); 3629 __ lwzx (i, in, i); 3630 __ sldi (c, out_len, 2); 3631 __ subi (c, c, 4); 3632 __ lwzx (b, out, c); 3633 3634 __ andi (i, i, 1); 3635 __ orr (i, b, i); 3636 3637 __ stwx (i, out, c); 3638 3639 // Restore non-volatile regs. 3640 current_offs = -8; 3641 __ ld(R28, current_offs, R1_SP); current_offs -= 8; 3642 __ ld(R27, current_offs, R1_SP); current_offs -= 8; 3643 __ ld(R26, current_offs, R1_SP); current_offs -= 8; 3644 __ ld(R25, current_offs, R1_SP); current_offs -= 8; 3645 __ ld(R24, current_offs, R1_SP); current_offs -= 8; 3646 __ ld(R23, current_offs, R1_SP); current_offs -= 8; 3647 __ ld(R22, current_offs, R1_SP); current_offs -= 8; 3648 __ ld(R21, current_offs, R1_SP); current_offs -= 8; 3649 __ ld(R20, current_offs, R1_SP); current_offs -= 8; 3650 __ ld(R19, current_offs, R1_SP); current_offs -= 8; 3651 __ ld(R18, current_offs, R1_SP); current_offs -= 8; 3652 __ ld(R17, current_offs, R1_SP); current_offs -= 8; 3653 __ ld(R16, current_offs, R1_SP); current_offs -= 8; 3654 __ ld(R15, current_offs, R1_SP); current_offs -= 8; 3655 __ ld(R14, current_offs, R1_SP); 3656 3657 __ mr(ret, out); 3658 __ blr(); 3659 3660 return start; 3661 } 3662 3663 /** 3664 * Arguments: 3665 * 3666 * Inputs: 3667 * R3_ARG1 - int crc 3668 * R4_ARG2 - byte* buf 3669 * R5_ARG3 - int length (of buffer) 3670 * 3671 * scratch: 3672 * R2, R6-R12 3673 * 3674 * Output: 3675 * R3_RET - int crc result 3676 */ 3677 // Compute CRC32 function. 3678 address generate_CRC32_updateBytes(StubGenStubId stub_id) { 3679 bool is_crc32c; 3680 switch (stub_id) { 3681 case updateBytesCRC32_id: 3682 is_crc32c = false; 3683 break; 3684 case updateBytesCRC32C_id: 3685 is_crc32c = true; 3686 break; 3687 default: 3688 ShouldNotReachHere(); 3689 } 3690 __ align(CodeEntryAlignment); 3691 StubCodeMark mark(this, stub_id); 3692 address start = __ function_entry(); // Remember stub start address (is rtn value). 
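    // For reference only (not generated code): the vectorized crc32() macro below
    // computes a reflected CRC. A bit-at-a-time C equivalent of the kernel,
    // ignoring where the pre/post inversion of the crc value is applied, is:
    //   uint32_t crc32_bitwise(uint32_t crc, const uint8_t* buf, size_t len, uint32_t poly) {
    //     // poly = 0xEDB88320 for CRC-32 (zlib), 0x82F63B78 for CRC-32C (Castagnoli)
    //     while (len--) {
    //       crc ^= *buf++;
    //       for (int k = 0; k < 8; k++)
    //         crc = (crc >> 1) ^ (poly & (0u - (crc & 1)));
    //     }
    //     return crc;
    //   }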
3693 __ crc32(R3_ARG1, R4_ARG2, R5_ARG3, R2, R6, R7, R8, R9, R10, R11, R12, is_crc32c); 3694 __ blr(); 3695 return start; 3696 } 3697 3698 address generate_floatToFloat16() { 3699 __ align(CodeEntryAlignment); 3700 StubCodeMark mark(this, "StubRoutines", "floatToFloat16"); 3701 address start = __ function_entry(); 3702 __ f2hf(R3_RET, F1_ARG1, F0); 3703 __ blr(); 3704 return start; 3705 } 3706 3707 address generate_float16ToFloat() { 3708 __ align(CodeEntryAlignment); 3709 StubCodeMark mark(this, "StubRoutines", "float16ToFloat"); 3710 address start = __ function_entry(); 3711 __ hf2f(F1_RET, R3_ARG1); 3712 __ blr(); 3713 return start; 3714 } 3715 3716 address generate_method_entry_barrier() { 3717 __ align(CodeEntryAlignment); 3718 StubGenStubId stub_id = StubGenStubId::method_entry_barrier_id; 3719 StubCodeMark mark(this, stub_id); 3720 3721 address stub_address = __ pc(); 3722 3723 int nbytes_save = MacroAssembler::num_volatile_regs * BytesPerWord; 3724 __ save_volatile_gprs(R1_SP, -nbytes_save, true); 3725 3726 // Link register points to instruction in prologue of the guarded nmethod. 3727 // As the stub requires one layer of indirection (argument is of type address* and not address), 3728 // passing the link register's value directly doesn't work. 3729 // Since we have to save the link register on the stack anyway, we calculate the corresponding stack address 3730 // and pass that one instead. 3731 __ addi(R3_ARG1, R1_SP, _abi0(lr)); 3732 3733 __ save_LR(R0); 3734 __ push_frame_reg_args(nbytes_save, R0); 3735 3736 __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSetNMethod::nmethod_stub_entry_barrier)); 3737 __ mr(R0, R3_RET); 3738 3739 __ pop_frame(); 3740 __ restore_LR(R3_RET /* used as tmp register */); 3741 __ restore_volatile_gprs(R1_SP, -nbytes_save, true); 3742 3743 __ cmpdi(CR0, R0, 0); 3744 3745 // Return to prologue if no deoptimization is required (bnelr) 3746 __ bclr(Assembler::bcondCRbiIs1, Assembler::bi0(CR0, Assembler::equal), Assembler::bhintIsTaken); 3747 3748 // Deoptimization required. 3749 // For actually handling the deoptimization, the 'wrong method stub' is invoked. 3750 __ load_const_optimized(R0, SharedRuntime::get_handle_wrong_method_stub()); 3751 __ mtctr(R0); 3752 3753 // Pop the frame built in the prologue. 3754 __ pop_frame(); 3755 3756 // Restore link register. Required as the 'wrong method stub' needs the caller's frame 3757 // to properly deoptimize this method (e.g. by re-resolving the call site for compiled methods). 3758 // This method's prologue is aborted. 3759 __ restore_LR(R0); 3760 3761 __ bctr(); 3762 return stub_address; 3763 } 3764 3765 #ifdef VM_LITTLE_ENDIAN 3766 // The following Base64 decode intrinsic is based on an algorithm outlined 3767 // in here: 3768 // http://0x80.pl/notesen/2016-01-17-sse-base64-decoding.html 3769 // in the section titled "Vector lookup (pshufb with bitmask)" 3770 // 3771 // This implementation differs in the following ways: 3772 // * Instead of Intel SSE instructions, Power AltiVec VMX and VSX instructions 3773 // are used instead. It turns out that some of the vector operations 3774 // needed in the algorithm require fewer AltiVec instructions. 3775 // * The algorithm in the above mentioned paper doesn't handle the 3776 // Base64-URL variant in RFC 4648. Adjustments to both the code and to two 3777 // lookup tables are needed for this. 3778 // * The "Pack" section of the code is a complete rewrite for Power because we 3779 // can utilize better instructions for this step. 
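  //
  // A worked example of the offset tables defined below (illustration only):
  // the pre-Power10 lookup classifies each input byte by its upper nibble and
  // then adds a per-class offset modulo 256, e.g. for 'h' (0x68, upper nibble 6,
  // class "lowercase"):
  //   ('h' + LC) & 0xff  ==  ('h' - 'a') + 26  ==  7 + 26  ==  33
  // which is the 6-bit Base64 value of 'h'. Characters like '/' (standard) and
  // '_' (URL) that share an upper nibble with another character class are
  // patched separately via vec_special_case_char / vec_special_case_offset.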
3780 // 3781 3782 // Offsets per group of Base64 characters 3783 // Uppercase 3784 #define UC (signed char)((-'A' + 0) & 0xff) 3785 // Lowercase 3786 #define LC (signed char)((-'a' + 26) & 0xff) 3787 // Digits 3788 #define DIG (signed char)((-'0' + 52) & 0xff) 3789 // Plus sign (URL = 0) 3790 #define PLS (signed char)((-'+' + 62) & 0xff) 3791 // Hyphen (URL = 1) 3792 #define HYP (signed char)((-'-' + 62) & 0xff) 3793 // Slash (URL = 0) 3794 #define SLS (signed char)((-'/' + 63) & 0xff) 3795 // Underscore (URL = 1) 3796 #define US (signed char)((-'_' + 63) & 0xff) 3797 3798 // For P10 (or later) only 3799 #define VALID_B64 0x80 3800 #define VB64(x) (VALID_B64 | x) 3801 3802 #define BLK_OFFSETOF(x) (offsetof(constant_block, x)) 3803 3804 // In little-endian mode, the lxv instruction loads the element at EA into 3805 // element 15 of the vector register, EA+1 goes into element 14, and so 3806 // on. 3807 // 3808 // To make a look-up table easier to read, ARRAY_TO_LXV_ORDER reverses the 3809 // order of the elements in a vector initialization. 3810 #define ARRAY_TO_LXV_ORDER(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15) e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0 3811 3812 // 3813 // Base64 decodeBlock intrinsic 3814 address generate_base64_decodeBlock() { 3815 __ align(CodeEntryAlignment); 3816 StubGenStubId stub_id = StubGenStubId::base64_decodeBlock_id; 3817 StubCodeMark mark(this, stub_id); 3818 address start = __ function_entry(); 3819 3820 typedef struct { 3821 signed char offsetLUT_val[16]; 3822 signed char offsetLUT_URL_val[16]; 3823 unsigned char maskLUT_val[16]; 3824 unsigned char maskLUT_URL_val[16]; 3825 unsigned char bitposLUT_val[16]; 3826 unsigned char table_32_47_val[16]; 3827 unsigned char table_32_47_URL_val[16]; 3828 unsigned char table_48_63_val[16]; 3829 unsigned char table_64_79_val[16]; 3830 unsigned char table_80_95_val[16]; 3831 unsigned char table_80_95_URL_val[16]; 3832 unsigned char table_96_111_val[16]; 3833 unsigned char table_112_127_val[16]; 3834 unsigned char pack_lshift_val[16]; 3835 unsigned char pack_rshift_val[16]; 3836 unsigned char pack_permute_val[16]; 3837 } constant_block; 3838 3839 alignas(16) static const constant_block const_block = { 3840 3841 .offsetLUT_val = { 3842 ARRAY_TO_LXV_ORDER( 3843 0, 0, PLS, DIG, UC, UC, LC, LC, 3844 0, 0, 0, 0, 0, 0, 0, 0 ) }, 3845 3846 .offsetLUT_URL_val = { 3847 ARRAY_TO_LXV_ORDER( 3848 0, 0, HYP, DIG, UC, UC, LC, LC, 3849 0, 0, 0, 0, 0, 0, 0, 0 ) }, 3850 3851 .maskLUT_val = { 3852 ARRAY_TO_LXV_ORDER( 3853 /* 0 */ (unsigned char)0b10101000, 3854 /* 1 .. 9 */ (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000, 3855 (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000, 3856 (unsigned char)0b11111000, 3857 /* 10 */ (unsigned char)0b11110000, 3858 /* 11 */ (unsigned char)0b01010100, 3859 /* 12 .. 14 */ (unsigned char)0b01010000, (unsigned char)0b01010000, (unsigned char)0b01010000, 3860 /* 15 */ (unsigned char)0b01010100 ) }, 3861 3862 .maskLUT_URL_val = { 3863 ARRAY_TO_LXV_ORDER( 3864 /* 0 */ (unsigned char)0b10101000, 3865 /* 1 .. 9 */ (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000, 3866 (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000, 3867 (unsigned char)0b11111000, 3868 /* 10 */ (unsigned char)0b11110000, 3869 /* 11 .. 
12 */ (unsigned char)0b01010000, (unsigned char)0b01010000, 3870 /* 13 */ (unsigned char)0b01010100, 3871 /* 14 */ (unsigned char)0b01010000, 3872 /* 15 */ (unsigned char)0b01110000 ) }, 3873 3874 .bitposLUT_val = { 3875 ARRAY_TO_LXV_ORDER( 3876 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, (unsigned char)0x80, 3877 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 ) }, 3878 3879 // In the following table_*_val constants, a 0 value means the 3880 // character is not in the Base64 character set 3881 .table_32_47_val = { 3882 ARRAY_TO_LXV_ORDER ( 3883 /* space .. '*' = 0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* '+' = 62 */ VB64(62), /* ',' .. '.' = 0 */ 0, 0, 0, /* '/' = 63 */ VB64(63) ) }, 3884 3885 .table_32_47_URL_val = { 3886 ARRAY_TO_LXV_ORDER( 3887 /* space .. ',' = 0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* '-' = 62 */ VB64(62), /* '.' .. '/' */ 0, 0 ) }, 3888 3889 .table_48_63_val = { 3890 ARRAY_TO_LXV_ORDER( 3891 /* '0' .. '9' = 52 .. 61 */ VB64(52), VB64(53), VB64(54), VB64(55), VB64(56), VB64(57), VB64(58), VB64(59), VB64(60), VB64(61), 3892 /* ':' .. '?' = 0 */ 0, 0, 0, 0, 0, 0 ) }, 3893 3894 .table_64_79_val = { 3895 ARRAY_TO_LXV_ORDER( 3896 /* '@' = 0 */ 0, /* 'A' .. 'O' = 0 .. 14 */ VB64(0), VB64(1), VB64(2), VB64(3), VB64(4), VB64(5), VB64(6), VB64(7), VB64(8), 3897 VB64(9), VB64(10), VB64(11), VB64(12), VB64(13), VB64(14) ) }, 3898 3899 .table_80_95_val = { 3900 ARRAY_TO_LXV_ORDER(/* 'P' .. 'Z' = 15 .. 25 */ VB64(15), VB64(16), VB64(17), VB64(18), VB64(19), VB64(20), VB64(21), VB64(22), 3901 VB64(23), VB64(24), VB64(25), /* '[' .. '_' = 0 */ 0, 0, 0, 0, 0 ) }, 3902 3903 .table_80_95_URL_val = { 3904 ARRAY_TO_LXV_ORDER(/* 'P' .. 'Z' = 15 .. 25 */ VB64(15), VB64(16), VB64(17), VB64(18), VB64(19), VB64(20), VB64(21), VB64(22), 3905 VB64(23), VB64(24), VB64(25), /* '[' .. '^' = 0 */ 0, 0, 0, 0, /* '_' = 63 */ VB64(63) ) }, 3906 3907 .table_96_111_val = { 3908 ARRAY_TO_LXV_ORDER(/* '`' = 0 */ 0, /* 'a' .. 'o' = 26 .. 40 */ VB64(26), VB64(27), VB64(28), VB64(29), VB64(30), VB64(31), 3909 VB64(32), VB64(33), VB64(34), VB64(35), VB64(36), VB64(37), VB64(38), VB64(39), VB64(40) ) }, 3910 3911 .table_112_127_val = { 3912 ARRAY_TO_LXV_ORDER(/* 'p' .. 'z' = 41 .. 51 */ VB64(41), VB64(42), VB64(43), VB64(44), VB64(45), VB64(46), VB64(47), VB64(48), 3913 VB64(49), VB64(50), VB64(51), /* '{' .. DEL = 0 */ 0, 0, 0, 0, 0 ) }, 3914 3915 .pack_lshift_val = { 3916 ARRAY_TO_LXV_ORDER( 3917 0, 6, 4, 2, 0, 6, 4, 2, 0, 6, 4, 2, 0, 6, 4, 2 ) }, 3918 3919 .pack_rshift_val = { 3920 ARRAY_TO_LXV_ORDER( 3921 0, 2, 4, 0, 0, 2, 4, 0, 0, 2, 4, 0, 0, 2, 4, 0 ) }, 3922 3923 // The first 4 index values are "don't care" because 3924 // we only use the first 12 bytes of the vector, 3925 // which are decoded from 16 bytes of Base64 characters. 
3926 .pack_permute_val = { 3927 ARRAY_TO_LXV_ORDER( 3928 0, 0, 0, 0, 3929 0, 1, 2, 3930 4, 5, 6, 3931 8, 9, 10, 3932 12, 13, 14 ) } 3933 }; 3934 3935 const unsigned block_size = 16; // number of bytes to process in each pass through the loop 3936 const unsigned block_size_shift = 4; 3937 3938 // According to the ELF V2 ABI, registers r3-r12 are volatile and available for use without save/restore 3939 Register s = R3_ARG1; // source starting address of Base64 characters 3940 Register sp = R4_ARG2; // source offset 3941 Register sl = R5_ARG3; // source length = # of Base64 characters to be processed 3942 Register d = R6_ARG4; // destination address 3943 Register dp = R7_ARG5; // destination offset 3944 Register isURL = R8_ARG6; // boolean, if non-zero indicates use of RFC 4648 base64url encoding 3945 Register isMIME = R9_ARG7; // boolean, if non-zero indicates use of RFC 2045 MIME encoding - not used 3946 3947 // Local variables 3948 Register const_ptr = R9; // used for loading constants 3949 Register tmp_reg = R10; // used for speeding up load_constant_optimized() 3950 3951 // Re-use R9 and R10 to avoid using non-volatile registers (requires save/restore) 3952 Register out = R9; // moving out (destination) pointer 3953 Register in = R10; // moving in (source) pointer 3954 3955 // Volatile VSRS are 0..13, 32..51 (VR0..VR13) 3956 // VR Constants 3957 VectorRegister vec_0s = VR0; 3958 VectorRegister vec_4s = VR1; 3959 VectorRegister vec_8s = VR2; 3960 VectorRegister vec_special_case_char = VR3; 3961 VectorRegister pack_rshift = VR4; 3962 VectorRegister pack_lshift = VR5; 3963 3964 // VSR Constants 3965 VectorSRegister offsetLUT = VSR0; 3966 VectorSRegister maskLUT = VSR1; 3967 VectorSRegister bitposLUT = VSR2; 3968 VectorSRegister vec_0xfs = VSR3; 3969 VectorSRegister vec_special_case_offset = VSR4; 3970 VectorSRegister pack_permute = VSR5; 3971 3972 // P10 (or later) VSR lookup constants 3973 VectorSRegister table_32_47 = VSR0; 3974 VectorSRegister table_48_63 = VSR1; 3975 VectorSRegister table_64_79 = VSR2; 3976 VectorSRegister table_80_95 = VSR3; 3977 VectorSRegister table_96_111 = VSR4; 3978 VectorSRegister table_112_127 = VSR6; 3979 3980 // Data read in and later converted 3981 VectorRegister input = VR6; 3982 // Variable for testing Base64 validity 3983 VectorRegister non_match = VR10; 3984 3985 // P9 VR Variables for lookup 3986 VectorRegister higher_nibble = VR7; 3987 VectorRegister eq_special_case_char = VR8; 3988 VectorRegister offsets = VR9; 3989 3990 // P9 VSR lookup variables 3991 VectorSRegister bit = VSR6; 3992 VectorSRegister lower_nibble = VSR7; 3993 VectorSRegister M = VSR8; 3994 3995 // P10 (or later) VSR lookup variables 3996 VectorSRegister xlate_a = VSR7; 3997 VectorSRegister xlate_b = VSR8; 3998 3999 // Variables for pack 4000 // VR 4001 VectorRegister l = VR7; // reuse higher_nibble's register 4002 VectorRegister r = VR8; // reuse eq_special_case_char's register 4003 VectorRegister gathered = VR10; // reuse non_match's register 4004 4005 Label not_URL, calculate_size, loop_start, loop_exit, return_zero; 4006 4007 // The upper 32 bits of the non-pointer parameter registers are not 4008 // guaranteed to be zero, so mask off those upper bits. 4009 __ clrldi(sp, sp, 32); 4010 __ clrldi(sl, sl, 32); 4011 4012 // Don't handle the last 4 characters of the source, because this 4013 // VSX-based algorithm doesn't handle padding characters. 
Also the 4014 // vector code will always write 16 bytes of decoded data on each pass, 4015 // but only the first 12 of those 16 bytes are valid data (16 base64 4016 // characters become 12 bytes of binary data), so for this reason we 4017 // need to subtract an additional 8 bytes from the source length, in 4018 // order not to write past the end of the destination buffer. The 4019 // result of this subtraction implies that a Java function in the 4020 // Base64 class will be used to process the last 12 characters. 4021 __ sub(sl, sl, sp); 4022 __ subi(sl, sl, 12); 4023 4024 // Load CTR with the number of passes through the loop 4025 // = sl >> block_size_shift. After the shift, if sl <= 0, there's too 4026 // little data to be processed by this intrinsic. 4027 __ srawi_(sl, sl, block_size_shift); 4028 __ ble(CR0, return_zero); 4029 __ mtctr(sl); 4030 4031 // Clear the other two parameter registers upper 32 bits. 4032 __ clrldi(isURL, isURL, 32); 4033 __ clrldi(dp, dp, 32); 4034 4035 // Load constant vec registers that need to be loaded from memory 4036 __ load_const_optimized(const_ptr, (address)&const_block, tmp_reg); 4037 __ lxv(bitposLUT, BLK_OFFSETOF(bitposLUT_val), const_ptr); 4038 __ lxv(pack_rshift->to_vsr(), BLK_OFFSETOF(pack_rshift_val), const_ptr); 4039 __ lxv(pack_lshift->to_vsr(), BLK_OFFSETOF(pack_lshift_val), const_ptr); 4040 __ lxv(pack_permute, BLK_OFFSETOF(pack_permute_val), const_ptr); 4041 4042 // Splat the constants that can use xxspltib 4043 __ xxspltib(vec_0s->to_vsr(), 0); 4044 __ xxspltib(vec_8s->to_vsr(), 8); 4045 if (PowerArchitecturePPC64 >= 10) { 4046 // Using VALID_B64 for the offsets effectively strips the upper bit 4047 // of each byte that was selected from the table. Setting the upper 4048 // bit gives us a way to distinguish between the 6-bit value of 0 4049 // from an error code of 0, which will happen if the character is 4050 // outside the range of the lookup, or is an illegal Base64 4051 // character, such as %. 
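    // Concretely (illustration only): a valid character looks up VB64(x) == 0x80 | x,
    // and the later vaddubm of the 0x80 offsets wraps modulo 256, e.g.
    //   'A' -> VB64(0)  == 0x80,  (0x80 + 0x80) & 0xff == 0x00   (6-bit value 0)
    //   'z' -> VB64(51) == 0xB3,  (0xB3 + 0x80) & 0xff == 0x33   (6-bit value 51)
    // while an invalid byte stays 0x00 and is caught by the vcmpequb_ check below.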
4052 __ xxspltib(offsets->to_vsr(), VALID_B64); 4053 4054 __ lxv(table_48_63, BLK_OFFSETOF(table_48_63_val), const_ptr); 4055 __ lxv(table_64_79, BLK_OFFSETOF(table_64_79_val), const_ptr); 4056 __ lxv(table_80_95, BLK_OFFSETOF(table_80_95_val), const_ptr); 4057 __ lxv(table_96_111, BLK_OFFSETOF(table_96_111_val), const_ptr); 4058 __ lxv(table_112_127, BLK_OFFSETOF(table_112_127_val), const_ptr); 4059 } else { 4060 __ xxspltib(vec_4s->to_vsr(), 4); 4061 __ xxspltib(vec_0xfs, 0xf); 4062 __ lxv(bitposLUT, BLK_OFFSETOF(bitposLUT_val), const_ptr); 4063 } 4064 4065 // The rest of the constants use different values depending on the 4066 // setting of isURL 4067 __ cmpwi(CR0, isURL, 0); 4068 __ beq(CR0, not_URL); 4069 4070 // isURL != 0 (true) 4071 if (PowerArchitecturePPC64 >= 10) { 4072 __ lxv(table_32_47, BLK_OFFSETOF(table_32_47_URL_val), const_ptr); 4073 __ lxv(table_80_95, BLK_OFFSETOF(table_80_95_URL_val), const_ptr); 4074 } else { 4075 __ lxv(offsetLUT, BLK_OFFSETOF(offsetLUT_URL_val), const_ptr); 4076 __ lxv(maskLUT, BLK_OFFSETOF(maskLUT_URL_val), const_ptr); 4077 __ xxspltib(vec_special_case_char->to_vsr(), '_'); 4078 __ xxspltib(vec_special_case_offset, (unsigned char)US); 4079 } 4080 __ b(calculate_size); 4081 4082 // isURL = 0 (false) 4083 __ bind(not_URL); 4084 if (PowerArchitecturePPC64 >= 10) { 4085 __ lxv(table_32_47, BLK_OFFSETOF(table_32_47_val), const_ptr); 4086 __ lxv(table_80_95, BLK_OFFSETOF(table_80_95_val), const_ptr); 4087 } else { 4088 __ lxv(offsetLUT, BLK_OFFSETOF(offsetLUT_val), const_ptr); 4089 __ lxv(maskLUT, BLK_OFFSETOF(maskLUT_val), const_ptr); 4090 __ xxspltib(vec_special_case_char->to_vsr(), '/'); 4091 __ xxspltib(vec_special_case_offset, (unsigned char)SLS); 4092 } 4093 4094 __ bind(calculate_size); 4095 4096 // out starts at d + dp 4097 __ add(out, d, dp); 4098 4099 // in starts at s + sp 4100 __ add(in, s, sp); 4101 4102 __ align(32); 4103 __ bind(loop_start); 4104 __ lxv(input->to_vsr(), 0, in); // offset=0 4105 4106 // 4107 // Lookup 4108 // 4109 if (PowerArchitecturePPC64 >= 10) { 4110 // Use xxpermx to do a lookup of each Base64 character in the 4111 // input vector and translate it to a 6-bit value + 0x80. 4112 // Characters which are not valid Base64 characters will result 4113 // in a zero in the corresponding byte. 4114 // 4115 // Note that due to align(32) call above, the xxpermx instructions do 4116 // not require align_prefix() calls, since the final xxpermx 4117 // prefix+opcode is at byte 24. 4118 __ xxpermx(xlate_a, table_32_47, table_48_63, input->to_vsr(), 1); // offset=4 4119 __ xxpermx(xlate_b, table_64_79, table_80_95, input->to_vsr(), 2); // offset=12 4120 __ xxlor(xlate_b, xlate_a, xlate_b); // offset=20 4121 __ xxpermx(xlate_a, table_96_111, table_112_127, input->to_vsr(), 3); // offset=24 4122 __ xxlor(input->to_vsr(), xlate_a, xlate_b); 4123 // Check for non-Base64 characters by comparing each byte to zero. 4124 __ vcmpequb_(non_match, input, vec_0s); 4125 } else { 4126 // Isolate the upper 4 bits of each character by shifting it right 4 bits 4127 __ vsrb(higher_nibble, input, vec_4s); 4128 // Isolate the lower 4 bits by masking 4129 __ xxland(lower_nibble, input->to_vsr(), vec_0xfs); 4130 4131 // Get the offset (the value to subtract from the byte) by using 4132 // a lookup table indexed by the upper 4 bits of the character 4133 __ xxperm(offsets->to_vsr(), offsetLUT, higher_nibble->to_vsr()); 4134 4135 // Find out which elements are the special case character (isURL ? 
'_' : '/') 4136 __ vcmpequb(eq_special_case_char, input, vec_special_case_char); 4137 4138 // For each character in the input which is a special case 4139 // character, replace its offset with one that is special for that 4140 // character. 4141 __ xxsel(offsets->to_vsr(), offsets->to_vsr(), vec_special_case_offset, eq_special_case_char->to_vsr()); 4142 4143 // Use the lower_nibble to select a mask "M" from the lookup table. 4144 __ xxperm(M, maskLUT, lower_nibble); 4145 4146 // "bit" is used to isolate which of the bits in M is relevant. 4147 __ xxperm(bit, bitposLUT, higher_nibble->to_vsr()); 4148 4149 // Each element of non_match corresponds to one of the 16 input 4150 // characters. Those elements that become 0x00 after the xxland 4151 // instruction are invalid Base64 characters. 4152 __ xxland(non_match->to_vsr(), M, bit); 4153 4154 // Compare each element to zero 4155 // 4156 __ vcmpequb_(non_match, non_match, vec_0s); 4157 } 4158 // vcmpequb_ sets the EQ bit of CR6 if no elements compare equal. 4159 // Any element comparing equal to zero means there is an error in 4160 // that element. Note that the comparison result register 4161 // non_match is not referenced again. Only CR6-EQ matters. 4162 __ bne_predict_not_taken(CR6, loop_exit); 4163 4164 // The Base64 characters had no errors, so add the offsets, which in 4165 // the case of Power10 is a constant vector of all 0x80's (see earlier 4166 // comment where the offsets register is loaded). 4167 __ vaddubm(input, input, offsets); 4168 4169 // Pack 4170 // 4171 // In the tables below, b0, b1, .. b15 are the bytes of decoded 4172 // binary data, the first line of each of the cells (except for 4173 // the constants) uses the bit-field nomenclature from the 4174 // above-linked paper, whereas the second line is more specific 4175 // about which exact bits are present, and is constructed using the 4176 // Power ISA 3.x document style, where: 4177 // 4178 // * The specifier after the colon depicts which bits are there. 4179 // * The bit numbering is big endian style (bit 0 is the most 4180 // significant). 4181 // * || is a concatenate operator. 4182 // * Strings of 0's are a field of zeros with the shown length, and 4183 // likewise for strings of 1's. 4184 4185 // Note that only e12..e15 are shown here because the shifting 4186 // and OR'ing pattern replicates for e8..e11, e4..e7, and 4187 // e0..e3.
4188 // 4189 // +======================+=================+======================+======================+=============+ 4190 // | Vector | e12 | e13 | e14 | e15 | 4191 // | Element | | | | | 4192 // +======================+=================+======================+======================+=============+ 4193 // | after vaddubm | 00dddddd | 00cccccc | 00bbbbbb | 00aaaaaa | 4194 // | | 00||b2:2..7 | 00||b1:4..7||b2:0..1 | 00||b0:6..7||b1:0..3 | 00||b0:0..5 | 4195 // +----------------------+-----------------+----------------------+----------------------+-------------+ 4196 // | pack_lshift | | << 6 | << 4 | << 2 | 4197 // +----------------------+-----------------+----------------------+----------------------+-------------+ 4198 // | l after vslb | 00dddddd | cc000000 | bbbb0000 | aaaaaa00 | 4199 // | | 00||b2:2..7 | b2:0..1||000000 | b1:0..3||0000 | b0:0..5||00 | 4200 // +----------------------+-----------------+----------------------+----------------------+-------------+ 4201 // | l after vslo | cc000000 | bbbb0000 | aaaaaa00 | 00000000 | 4202 // | | b2:0..1||000000 | b1:0..3||0000 | b0:0..5||00 | 00000000 | 4203 // +----------------------+-----------------+----------------------+----------------------+-------------+ 4204 // | pack_rshift | | >> 2 | >> 4 | | 4205 // +----------------------+-----------------+----------------------+----------------------+-------------+ 4206 // | r after vsrb | 00dddddd | 0000cccc | 000000bb | 00aaaaaa | 4207 // | | 00||b2:2..7 | 0000||b1:4..7 | 000000||b0:6..7 | 00||b0:0..5 | 4208 // +----------------------+-----------------+----------------------+----------------------+-------------+ 4209 // | gathered after xxlor | ccdddddd | bbbbcccc | aaaaaabb | 00aaaaaa | 4210 // | | b2:0..7 | b1:0..7 | b0:0..7 | 00||b0:0..5 | 4211 // +======================+=================+======================+======================+=============+ 4212 // 4213 // Note: there is a typo in the above-linked paper that shows the result of the gathering process is: 4214 // [ddddddcc|bbbbcccc|aaaaaabb] 4215 // but should be: 4216 // [ccdddddd|bbbbcccc|aaaaaabb] 4217 // 4218 __ vslb(l, input, pack_lshift); 4219 // vslo of vec_8s shifts the vector by one octet toward lower 4220 // element numbers, discarding element 0. This means it actually 4221 // shifts to the right (not left) according to the order of the 4222 // table above. 4223 __ vslo(l, l, vec_8s); 4224 __ vsrb(r, input, pack_rshift); 4225 __ xxlor(gathered->to_vsr(), l->to_vsr(), r->to_vsr()); 4226 4227 // Final rearrangement of bytes into their correct positions. 
4228 // +==============+======+======+======+======+=====+=====+====+====+====+====+=====+=====+=====+=====+=====+=====+ 4229 // | Vector | e0 | e1 | e2 | e3 | e4 | e5 | e6 | e7 | e8 | e9 | e10 | e11 | e12 | e13 | e14 | e15 | 4230 // | Elements | | | | | | | | | | | | | | | | | 4231 // +==============+======+======+======+======+=====+=====+====+====+====+====+=====+=====+=====+=====+=====+=====+ 4232 // | after xxlor | b11 | b10 | b9 | xx | b8 | b7 | b6 | xx | b5 | b4 | b3 | xx | b2 | b1 | b0 | xx | 4233 // +--------------+------+------+------+------+-----+-----+----+----+----+----+-----+-----+-----+-----+-----+-----+ 4234 // | pack_permute | 0 | 0 | 0 | 0 | 0 | 1 | 2 | 4 | 5 | 6 | 8 | 9 | 10 | 12 | 13 | 14 | 4235 // +--------------+------+------+------+------+-----+-----+----+----+----+----+-----+-----+-----+-----+-----+-----+ 4236 // | after xxperm | b11* | b11* | b11* | b11* | b11 | b10 | b9 | b8 | b7 | b6 | b5 | b4 | b3 | b2 | b1 | b0 | 4237 // +==============+======+======+======+======+=====+=====+====+====+====+====+=====+=====+=====+=====+=====+=====+ 4238 // xx bytes are not used to form the final data 4239 // b0..b15 are the decoded and reassembled 8-bit bytes of data 4240 // b11 with asterisk is a "don't care", because these bytes will be 4241 // overwritten on the next iteration. 4242 __ xxperm(gathered->to_vsr(), gathered->to_vsr(), pack_permute); 4243 4244 // We cannot use a static displacement on the store, since it's a 4245 // multiple of 12, not 16. Note that this stxv instruction actually 4246 // writes 16 bytes, even though only the first 12 are valid data. 4247 __ stxv(gathered->to_vsr(), 0, out); 4248 __ addi(out, out, 12); 4249 __ addi(in, in, 16); 4250 __ bdnz(loop_start); 4251 4252 __ bind(loop_exit); 4253 4254 // Return the number of out bytes produced, which is (out - (d + dp)) == out - d - dp; 4255 __ sub(R3_RET, out, d); 4256 __ sub(R3_RET, R3_RET, dp); 4257 4258 __ blr(); 4259 4260 __ bind(return_zero); 4261 __ li(R3_RET, 0); 4262 __ blr(); 4263 4264 return start; 4265 } 4266 4267 #undef UC 4268 #undef LC 4269 #undef DIG 4270 #undef PLS 4271 #undef HYP 4272 #undef SLS 4273 #undef US 4274 4275 // This algorithm is based on the methods described in this paper: 4276 // http://0x80.pl/notesen/2016-01-12-sse-base64-encoding.html 4277 // 4278 // The details of this implementation vary from the paper due to the 4279 // difference in the ISA between SSE and AltiVec, especially in the 4280 // splitting bytes section where there is no need on Power to mask after 4281 // the shift because the shift is byte-wise rather than an entire an entire 4282 // 128-bit word. 4283 // 4284 // For the lookup part of the algorithm, different logic is used than 4285 // described in the paper because of the availability of vperm, which can 4286 // do a 64-byte table lookup in four instructions, while preserving the 4287 // branchless nature. 4288 // 4289 // Description of the ENCODE_CORE macro 4290 // 4291 // Expand first 12 x 8-bit data bytes into 16 x 6-bit bytes (upper 2 4292 // bits of each byte are zeros) 4293 // 4294 // (Note: e7..e0 are not shown because they follow the same pattern as 4295 // e8..e15) 4296 // 4297 // In the table below, b0, b1, .. 
b15 are the bytes of unencoded 4298 // binary data, the first line of each of the cells (except for 4299 // the constants) uses the bit-field nomenclature from the 4300 // above-linked paper, whereas the second line is more specific 4301 // about which exact bits are present, and is constructed using the 4302 // Power ISA 3.x document style, where: 4303 // 4304 // * The specifier after the colon depicts which bits are there. 4305 // * The bit numbering is big endian style (bit 0 is the most 4306 // significant). 4307 // * || is a concatenate operator. 4308 // * Strings of 0's are a field of zeros with the shown length, and 4309 // likewise for strings of 1's. 4310 // 4311 // +==========================+=============+======================+======================+=============+=============+======================+======================+=============+ 4312 // | Vector | e8 | e9 | e10 | e11 | e12 | e13 | e14 | e15 | 4313 // | Element | | | | | | | | | 4314 // +==========================+=============+======================+======================+=============+=============+======================+======================+=============+ 4315 // | after lxv | jjjjkkkk | iiiiiijj | gghhhhhh | ffffgggg | eeeeeeff | ccdddddd | bbbbcccc | aaaaaabb | 4316 // | | b7 | b6 | b5 | b4 | b3 | b2 | b1 | b0 | 4317 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+ 4318 // | xxperm indexes | 0 | 10 | 11 | 12 | 0 | 13 | 14 | 15 | 4319 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+ 4320 // | (1) after xxperm | | gghhhhhh | ffffgggg | eeeeeeff | | ccdddddd | bbbbcccc | aaaaaabb | 4321 // | | (b15) | b5 | b4 | b3 | (b15) | b2 | b1 | b0 | 4322 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+ 4323 // | rshift_amount | 0 | 6 | 4 | 2 | 0 | 6 | 4 | 2 | 4324 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+ 4325 // | after vsrb | | 000000gg | 0000ffff | 00eeeeee | | 000000cc | 0000bbbb | 00aaaaaa | 4326 // | | (b15) | 000000||b5:0..1 | 0000||b4:0..3 | 00||b3:0..5 | (b15) | 000000||b2:0..1 | 0000||b1:0..3 | 00||b0:0..5 | 4327 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+ 4328 // | rshift_mask | 00000000 | 000000||11 | 0000||1111 | 00||111111 | 00000000 | 000000||11 | 0000||1111 | 00||111111 | 4329 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+ 4330 // | rshift after vand | 00000000 | 000000gg | 0000ffff | 00eeeeee | 00000000 | 000000cc | 0000bbbb | 00aaaaaa | 4331 // | | 00000000 | 000000||b5:0..1 | 0000||b4:0..3 | 00||b3:0..5 | 00000000 | 000000||b2:0..1 | 0000||b1:0..3 | 00||b0:0..5 | 4332 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+ 4333 // | 1 octet lshift (1) | gghhhhhh | ffffgggg | eeeeeeff | | ccdddddd | 
bbbbcccc | aaaaaabb | 00000000 | 4334 // | | b5 | b4 | b3 | (b15) | b2 | b1 | b0 | 00000000 | 4335 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+ 4336 // | lshift_amount | 0 | 2 | 4 | 0 | 0 | 2 | 4 | 0 | 4337 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+ 4338 // | after vslb | gghhhhhh | ffgggg00 | eeff0000 | | ccdddddd | bbcccc00 | aabb0000 | 00000000 | 4339 // | | b5 | b4:2..7||00 | b3:4..7||0000 | (b15) | b2:0..7 | b1:2..7||00 | b0:4..7||0000 | 00000000 | 4340 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+ 4341 // | lshift_mask | 00||111111 | 00||1111||00 | 00||11||0000 | 00000000 | 00||111111 | 00||1111||00 | 00||11||0000 | 00000000 | 4342 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+ 4343 // | lshift after vand | 00hhhhhh | 00gggg00 | 00ff0000 | 00000000 | 00dddddd | 00cccc00 | 00bb0000 | 00000000 | 4344 // | | 00||b5:2..7 | 00||b4:4..7||00 | 00||b3:6..7||0000 | 00000000 | 00||b2:2..7 | 00||b1:4..7||00 | 00||b0:6..7||0000 | 00000000 | 4345 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+ 4346 // | after vor lshift, rshift | 00hhhhhh | 00gggggg | 00ffffff | 00eeeeee | 00dddddd | 00cccccc | 00bbbbbb | 00aaaaaa | 4347 // | | 00||b5:2..7 | 00||b4:4..7||b5:0..1 | 00||b3:6..7||b4:0..3 | 00||b3:0..5 | 00||b2:2..7 | 00||b1:4..7||b2:0..1 | 00||b0:6..7||b1:0..3 | 00||b0:0..5 | 4348 // +==========================+=============+======================+======================+=============+=============+======================+======================+=============+ 4349 // 4350 // Expand the first 12 bytes into 16 bytes, leaving every 4th byte 4351 // blank for now. 4352 // __ xxperm(input->to_vsr(), input->to_vsr(), expand_permute); 4353 // 4354 // Generate two bit-shifted pieces - rshift and lshift - that will 4355 // later be OR'd together. 4356 // 4357 // First the right-shifted piece 4358 // __ vsrb(rshift, input, expand_rshift); 4359 // __ vand(rshift, rshift, expand_rshift_mask); 4360 // 4361 // Now the left-shifted piece, which is done by octet shifting 4362 // the input one byte to the left, then doing a variable shift, 4363 // followed by a mask operation. 4364 // 4365 // __ vslo(lshift, input, vec_8s); 4366 // __ vslb(lshift, lshift, expand_lshift); 4367 // __ vand(lshift, lshift, expand_lshift_mask); 4368 // 4369 // Combine the two pieces by OR'ing 4370 // __ vor(expanded, rshift, lshift); 4371 // 4372 // At this point, expanded is a vector containing a 6-bit value in each 4373 // byte. These values are used as indexes into a 64-byte lookup table that 4374 // is contained in four vector registers. The lookup operation is done 4375 // using vperm instructions with the same indexes for the lower 32 and 4376 // upper 32 bytes. To figure out which of the two looked-up bytes to use 4377 // at each location, all values in expanded are compared to 31. 
Using 4378 // vsel, values higher than 31 use the results from the upper 32 bytes of 4379 // the lookup operation, while values less than or equal to 31 use the 4380 // lower 32 bytes of the lookup operation. 4381 // 4382 // Note: it's tempting to use a xxpermx,xxpermx,vor sequence here on 4383 // Power10 (or later), but experiments doing so on Power10 yielded a slight 4384 // performance drop, perhaps due to the need for xxpermx instruction 4385 // prefixes. 4386 4387 #define ENCODE_CORE \ 4388 __ xxperm(input->to_vsr(), input->to_vsr(), expand_permute); \ 4389 __ vsrb(rshift, input, expand_rshift); \ 4390 __ vand(rshift, rshift, expand_rshift_mask); \ 4391 __ vslo(lshift, input, vec_8s); \ 4392 __ vslb(lshift, lshift, expand_lshift); \ 4393 __ vand(lshift, lshift, expand_lshift_mask); \ 4394 __ vor(expanded, rshift, lshift); \ 4395 __ vperm(encoded_00_31, vec_base64_00_15, vec_base64_16_31, expanded); \ 4396 __ vperm(encoded_32_63, vec_base64_32_47, vec_base64_48_63, expanded); \ 4397 __ vcmpgtub(gt_31, expanded, vec_31s); \ 4398 __ vsel(expanded, encoded_00_31, encoded_32_63, gt_31); 4399 4400 // Intrinsic function prototype in Base64.java: 4401 // private void encodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL) { 4402 4403 address generate_base64_encodeBlock() { 4404 __ align(CodeEntryAlignment); 4405 StubGenStubId stub_id = StubGenStubId::base64_encodeBlock_id; 4406 StubCodeMark mark(this, stub_id); 4407 address start = __ function_entry(); 4408 4409 typedef struct { 4410 unsigned char expand_permute_val[16]; 4411 unsigned char expand_rshift_val[16]; 4412 unsigned char expand_rshift_mask_val[16]; 4413 unsigned char expand_lshift_val[16]; 4414 unsigned char expand_lshift_mask_val[16]; 4415 unsigned char base64_00_15_val[16]; 4416 unsigned char base64_16_31_val[16]; 4417 unsigned char base64_32_47_val[16]; 4418 unsigned char base64_48_63_val[16]; 4419 unsigned char base64_48_63_URL_val[16]; 4420 } constant_block; 4421 4422 alignas(16) static const constant_block const_block = { 4423 .expand_permute_val = { 4424 ARRAY_TO_LXV_ORDER( 4425 0, 4, 5, 6, 4426 0, 7, 8, 9, 4427 0, 10, 11, 12, 4428 0, 13, 14, 15 ) }, 4429 4430 .expand_rshift_val = { 4431 ARRAY_TO_LXV_ORDER( 4432 0, 6, 4, 2, 4433 0, 6, 4, 2, 4434 0, 6, 4, 2, 4435 0, 6, 4, 2 ) }, 4436 4437 .expand_rshift_mask_val = { 4438 ARRAY_TO_LXV_ORDER( 4439 0b00000000, 0b00000011, 0b00001111, 0b00111111, 4440 0b00000000, 0b00000011, 0b00001111, 0b00111111, 4441 0b00000000, 0b00000011, 0b00001111, 0b00111111, 4442 0b00000000, 0b00000011, 0b00001111, 0b00111111 ) }, 4443 4444 .expand_lshift_val = { 4445 ARRAY_TO_LXV_ORDER( 4446 0, 2, 4, 0, 4447 0, 2, 4, 0, 4448 0, 2, 4, 0, 4449 0, 2, 4, 0 ) }, 4450 4451 .expand_lshift_mask_val = { 4452 ARRAY_TO_LXV_ORDER( 4453 0b00111111, 0b00111100, 0b00110000, 0b00000000, 4454 0b00111111, 0b00111100, 0b00110000, 0b00000000, 4455 0b00111111, 0b00111100, 0b00110000, 0b00000000, 4456 0b00111111, 0b00111100, 0b00110000, 0b00000000 ) }, 4457 4458 .base64_00_15_val = { 4459 ARRAY_TO_LXV_ORDER( 4460 'A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P' ) }, 4461 4462 .base64_16_31_val = { 4463 ARRAY_TO_LXV_ORDER( 4464 'Q','R','S','T','U','V','W','X','Y','Z','a','b','c','d','e','f' ) }, 4465 4466 .base64_32_47_val = { 4467 ARRAY_TO_LXV_ORDER( 4468 'g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v' ) }, 4469 4470 .base64_48_63_val = { 4471 ARRAY_TO_LXV_ORDER( 4472 'w','x','y','z','0','1','2','3','4','5','6','7','8','9','+','/' ) }, 4473 4474 .base64_48_63_URL_val = { 4475 
ARRAY_TO_LXV_ORDER( 4476 'w','x','y','z','0','1','2','3','4','5','6','7','8','9','-','_' ) } 4477 }; 4478 4479 // Number of bytes to process in each pass through the main loop. 4480 // 12 of the 16 bytes from each lxv are encoded to 16 Base64 bytes. 4481 const unsigned block_size = 12; 4482 4483 // According to the ELF V2 ABI, registers r3-r12 are volatile and available for use without save/restore 4484 Register src = R3_ARG1; // source starting address of Base64 characters 4485 Register sp = R4_ARG2; // source starting position 4486 Register sl = R5_ARG3; // total source length of the Base64 characters to be processed 4487 Register dst = R6_ARG4; // destination address 4488 Register dp = R7_ARG5; // destination starting position 4489 Register isURL = R8_ARG6; // boolean, if non-zero indicates use of RFC 4648 base64url encoding 4490 4491 // Local variables 4492 Register const_ptr = R12; // used for loading constants (reuses isURL's register) 4493 Register tmp_reg = R9; // used for speeding up load_constant() 4494 4495 Register size = R9; // number of bytes to process (reuses tmp_reg's register) 4496 Register blocked_size = R10; // number of bytes to process a block at a time 4497 Register block_modulo = R12; // == block_size (reuse const_ptr) 4498 Register remaining = R12; // bytes remaining to process after the blocks are completed (reuse block_modulo's reg) 4499 Register in = R4; // current input (source) pointer (reuse sp's register) 4500 Register num_blocks = R11; // number of blocks to be processed by the loop 4501 Register out = R8; // current output (destination) pointer (reuse const_ptr's register) 4502 Register three = R9; // constant divisor (reuse size's register) 4503 Register bytes_to_write = R10; // number of bytes to write with the stxvl instr (reused blocked_size's register) 4504 Register tmp1 = R7; // temp register for lxvl length (reuse dp's register) 4505 Register modulo_chars = R7; // number of bytes written during the final write % 4 (reuse tmp1's register) 4506 Register pad_char = R6; // literal '=' (reuse dst's register) 4507 4508 // Volatile VSRS are 0..13, 32..51 (VR0..VR13) 4509 // VR Constants 4510 VectorRegister vec_8s = VR0; 4511 VectorRegister vec_31s = VR1; 4512 VectorRegister vec_base64_00_15 = VR2; 4513 VectorRegister vec_base64_16_31 = VR3; 4514 VectorRegister vec_base64_32_47 = VR4; 4515 VectorRegister vec_base64_48_63 = VR5; 4516 VectorRegister expand_rshift = VR6; 4517 VectorRegister expand_rshift_mask = VR7; 4518 VectorRegister expand_lshift = VR8; 4519 VectorRegister expand_lshift_mask = VR9; 4520 4521 // VR variables for expand 4522 VectorRegister input = VR10; 4523 VectorRegister rshift = VR11; 4524 VectorRegister lshift = VR12; 4525 VectorRegister expanded = VR13; 4526 4527 // VR variables for lookup 4528 VectorRegister encoded_00_31 = VR10; // (reuse input) 4529 VectorRegister encoded_32_63 = VR11; // (reuse rshift) 4530 VectorRegister gt_31 = VR12; // (reuse lshift) 4531 4532 // VSR Constants 4533 VectorSRegister expand_permute = VSR0; 4534 4535 Label not_URL, calculate_size, calculate_blocked_size, skip_loop; 4536 Label loop_start, le_16_to_write, no_pad, one_pad_char; 4537 4538 // The upper 32 bits of the non-pointer parameter registers are not 4539 // guaranteed to be zero, so mask off those upper bits. 
4540 __ clrldi(sp, sp, 32); 4541 __ clrldi(sl, sl, 32); 4542 __ clrldi(dp, dp, 32); 4543 __ clrldi(isURL, isURL, 32); 4544 4545 // load up the constants 4546 __ load_const_optimized(const_ptr, (address)&const_block, tmp_reg); 4547 __ lxv(expand_permute, BLK_OFFSETOF(expand_permute_val), const_ptr); 4548 __ lxv(expand_rshift->to_vsr(), BLK_OFFSETOF(expand_rshift_val), const_ptr); 4549 __ lxv(expand_rshift_mask->to_vsr(), BLK_OFFSETOF(expand_rshift_mask_val), const_ptr); 4550 __ lxv(expand_lshift->to_vsr(), BLK_OFFSETOF(expand_lshift_val), const_ptr); 4551 __ lxv(expand_lshift_mask->to_vsr(), BLK_OFFSETOF(expand_lshift_mask_val), const_ptr); 4552 __ lxv(vec_base64_00_15->to_vsr(), BLK_OFFSETOF(base64_00_15_val), const_ptr); 4553 __ lxv(vec_base64_16_31->to_vsr(), BLK_OFFSETOF(base64_16_31_val), const_ptr); 4554 __ lxv(vec_base64_32_47->to_vsr(), BLK_OFFSETOF(base64_32_47_val), const_ptr); 4555 4556 // Splat the constants that can use xxspltib 4557 __ xxspltib(vec_8s->to_vsr(), 8); 4558 __ xxspltib(vec_31s->to_vsr(), 31); 4559 4560 4561 // Use a different translation lookup table depending on the 4562 // setting of isURL 4563 __ cmpdi(CR0, isURL, 0); 4564 __ beq(CR0, not_URL); 4565 __ lxv(vec_base64_48_63->to_vsr(), BLK_OFFSETOF(base64_48_63_URL_val), const_ptr); 4566 __ b(calculate_size); 4567 4568 __ bind(not_URL); 4569 __ lxv(vec_base64_48_63->to_vsr(), BLK_OFFSETOF(base64_48_63_val), const_ptr); 4570 4571 __ bind(calculate_size); 4572 4573 // size = sl - sp - 4 (*) 4574 // (*) Don't process the last four bytes in the main loop because 4575 // we don't want the lxv instruction to read past the end of the src 4576 // data, in case those four bytes are on the start of an unmapped or 4577 // otherwise inaccessible page. 4578 // 4579 __ sub(size, sl, sp); 4580 __ subi(size, size, 4); 4581 __ cmpdi(CR7, size, block_size); 4582 __ bgt(CR7, calculate_blocked_size); 4583 __ mr(remaining, size); 4584 // Add the 4 back into remaining again 4585 __ addi(remaining, remaining, 4); 4586 // make "in" point to the beginning of the source data: in = src + sp 4587 __ add(in, src, sp); 4588 // out = dst + dp 4589 __ add(out, dst, dp); 4590 __ b(skip_loop); 4591 4592 __ bind(calculate_blocked_size); 4593 __ li(block_modulo, block_size); 4594 // num_blocks = size / block_modulo 4595 __ divwu(num_blocks, size, block_modulo); 4596 // blocked_size = num_blocks * size 4597 __ mullw(blocked_size, num_blocks, block_modulo); 4598 // remaining = size - blocked_size 4599 __ sub(remaining, size, blocked_size); 4600 __ mtctr(num_blocks); 4601 4602 // Add the 4 back in to remaining again 4603 __ addi(remaining, remaining, 4); 4604 4605 // make "in" point to the beginning of the source data: in = src + sp 4606 __ add(in, src, sp); 4607 4608 // out = dst + dp 4609 __ add(out, dst, dp); 4610 4611 __ align(32); 4612 __ bind(loop_start); 4613 4614 __ lxv(input->to_vsr(), 0, in); 4615 4616 ENCODE_CORE 4617 4618 __ stxv(expanded->to_vsr(), 0, out); 4619 __ addi(in, in, 12); 4620 __ addi(out, out, 16); 4621 __ bdnz(loop_start); 4622 4623 __ bind(skip_loop); 4624 4625 // When there are less than 16 bytes left, we need to be careful not to 4626 // read beyond the end of the src buffer, which might be in an unmapped 4627 // page. 4628 // Load the remaining bytes using lxvl. 
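    // (lxvl/stxvl take the byte count from the most-significant byte of the length
    // register, which is why 'remaining' is placed into bits 0..7 below:
    // rldicr(tmp1, remaining, 56, 7) is effectively tmp1 = remaining << 56.)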
4629 __ rldicr(tmp1, remaining, 56, 7); 4630 __ lxvl(input->to_vsr(), in, tmp1); 4631 4632 ENCODE_CORE 4633 4634 // bytes_to_write = ((remaining * 4) + 2) / 3 4635 __ li(three, 3); 4636 __ rlwinm(bytes_to_write, remaining, 2, 0, 29); // remaining * 4 4637 __ addi(bytes_to_write, bytes_to_write, 2); 4638 __ divwu(bytes_to_write, bytes_to_write, three); 4639 4640 __ cmpwi(CR7, bytes_to_write, 16); 4641 __ ble_predict_taken(CR7, le_16_to_write); 4642 __ stxv(expanded->to_vsr(), 0, out); 4643 4644 // We've processed 12 of the 13-15 data bytes, so advance the pointers, 4645 // and do one final pass for the remaining 1-3 bytes. 4646 __ addi(in, in, 12); 4647 __ addi(out, out, 16); 4648 __ subi(remaining, remaining, 12); 4649 __ subi(bytes_to_write, bytes_to_write, 16); 4650 __ rldicr(tmp1, bytes_to_write, 56, 7); 4651 __ lxvl(input->to_vsr(), in, tmp1); 4652 4653 ENCODE_CORE 4654 4655 __ bind(le_16_to_write); 4656 // shift bytes_to_write into the upper 8 bits of t1 for use by stxvl 4657 __ rldicr(tmp1, bytes_to_write, 56, 7); 4658 __ stxvl(expanded->to_vsr(), out, tmp1); 4659 __ add(out, out, bytes_to_write); 4660 4661 __ li(pad_char, '='); 4662 __ rlwinm_(modulo_chars, bytes_to_write, 0, 30, 31); // bytes_to_write % 4, set CR0 4663 // Examples: 4664 // remaining bytes_to_write modulo_chars num pad chars 4665 // 0 0 0 0 4666 // 1 2 2 2 4667 // 2 3 3 1 4668 // 3 4 0 0 4669 // 4 6 2 2 4670 // 5 7 3 1 4671 // ... 4672 // 12 16 0 0 4673 // 13 18 2 2 4674 // 14 19 3 1 4675 // 15 20 0 0 4676 __ beq(CR0, no_pad); 4677 __ cmpwi(CR7, modulo_chars, 3); 4678 __ beq(CR7, one_pad_char); 4679 4680 // two pad chars 4681 __ stb(pad_char, out); 4682 __ addi(out, out, 1); 4683 4684 __ bind(one_pad_char); 4685 __ stb(pad_char, out); 4686 4687 __ bind(no_pad); 4688 4689 __ blr(); 4690 return start; 4691 } 4692 4693 #endif // VM_LITTLE_ENDIAN 4694 4695 void generate_lookup_secondary_supers_table_stub() { 4696 StubGenStubId stub_id = StubGenStubId::lookup_secondary_supers_table_id; 4697 StubCodeMark mark(this, stub_id); 4698 4699 const Register 4700 r_super_klass = R4_ARG2, 4701 r_array_base = R3_ARG1, 4702 r_array_length = R7_ARG5, 4703 r_array_index = R6_ARG4, 4704 r_sub_klass = R5_ARG3, 4705 r_bitmap = R11_scratch1, 4706 result = R8_ARG6; 4707 4708 for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) { 4709 StubRoutines::_lookup_secondary_supers_table_stubs[slot] = __ pc(); 4710 __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass, 4711 r_array_base, r_array_length, r_array_index, 4712 r_bitmap, result, slot); 4713 __ blr(); 4714 } 4715 } 4716 4717 // Slow path implementation for UseSecondarySupersTable. 
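  // (Presumably taken when the constant-slot probes emitted above cannot decide on
  // their own; the MacroAssembler slow-path helper continues the search over the
  // secondary supers array using r_array_index and r_bitmap.)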
4718 address generate_lookup_secondary_supers_table_slow_path_stub() { 4719 StubGenStubId stub_id = StubGenStubId::lookup_secondary_supers_table_slow_path_id; 4720 StubCodeMark mark(this, stub_id); 4721 4722 address start = __ pc(); 4723 const Register 4724 r_super_klass = R4_ARG2, 4725 r_array_base = R3_ARG1, 4726 temp1 = R7_ARG5, 4727 r_array_index = R6_ARG4, 4728 r_bitmap = R11_scratch1, 4729 result = R8_ARG6; 4730 4731 __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, result, temp1); 4732 __ blr(); 4733 4734 return start; 4735 } 4736 4737 address generate_cont_thaw(StubGenStubId stub_id) { 4738 if (!Continuations::enabled()) return nullptr; 4739 4740 Continuation::thaw_kind kind; 4741 bool return_barrier; 4742 bool return_barrier_exception; 4743 4744 switch (stub_id) { 4745 case cont_thaw_id: 4746 kind = Continuation::thaw_top; 4747 return_barrier = false; 4748 return_barrier_exception = false; 4749 break; 4750 case cont_returnBarrier_id: 4751 kind = Continuation::thaw_return_barrier; 4752 return_barrier = true; 4753 return_barrier_exception = false; 4754 break; 4755 case cont_returnBarrierExc_id: 4756 kind = Continuation::thaw_return_barrier_exception; 4757 return_barrier = true; 4758 return_barrier_exception = true; 4759 break; 4760 default: 4761 ShouldNotReachHere(); 4762 } 4763 StubCodeMark mark(this, stub_id); 4764 4765 Register tmp1 = R10_ARG8; 4766 Register tmp2 = R9_ARG7; 4767 Register tmp3 = R8_ARG6; 4768 Register nvtmp = R15_esp; // nonvolatile tmp register 4769 FloatRegister nvftmp = F20; // nonvolatile fp tmp register 4770 4771 address start = __ pc(); 4772 4773 if (kind == Continuation::thaw_top) { 4774 __ clobber_nonvolatile_registers(); // Except R16_thread and R29_TOC 4775 } 4776 4777 if (return_barrier) { 4778 __ mr(nvtmp, R3_RET); __ fmr(nvftmp, F1_RET); // preserve possible return value from a method returning to the return barrier 4779 DEBUG_ONLY(__ ld_ptr(tmp1, _abi0(callers_sp), R1_SP);) 4780 __ ld_ptr(R1_SP, JavaThread::cont_entry_offset(), R16_thread); 4781 #ifdef ASSERT 4782 __ ld_ptr(tmp2, _abi0(callers_sp), R1_SP); 4783 __ cmpd(CR0, tmp1, tmp2); 4784 __ asm_assert_eq(FILE_AND_LINE ": callers sp is corrupt"); 4785 #endif 4786 } 4787 #ifdef ASSERT 4788 __ ld_ptr(tmp1, JavaThread::cont_entry_offset(), R16_thread); 4789 __ cmpd(CR0, R1_SP, tmp1); 4790 __ asm_assert_eq(FILE_AND_LINE ": incorrect R1_SP"); 4791 #endif 4792 4793 __ li(R4_ARG2, return_barrier ? 1 : 0); 4794 __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), R16_thread, R4_ARG2); 4795 4796 #ifdef ASSERT 4797 DEBUG_ONLY(__ ld_ptr(tmp1, JavaThread::cont_entry_offset(), R16_thread)); 4798 DEBUG_ONLY(__ cmpd(CR0, R1_SP, tmp1)); 4799 __ asm_assert_eq(FILE_AND_LINE ": incorrect R1_SP"); 4800 #endif 4801 4802 // R3_RET contains the size of the frames to thaw, 0 if overflow or no more frames 4803 Label thaw_success; 4804 __ cmpdi(CR0, R3_RET, 0); 4805 __ bne(CR0, thaw_success); 4806 __ load_const_optimized(tmp1, (SharedRuntime::throw_StackOverflowError_entry()), R0); 4807 __ mtctr(tmp1); __ bctr(); 4808 __ bind(thaw_success); 4809 4810 __ addi(R3_RET, R3_RET, frame::native_abi_reg_args_size); // Large abi required for C++ calls. 
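    // At this point R3_RET holds the (positive) number of bytes needed for the
    // thawed frames plus a full native ABI; the neg + clrrdi below turn it into
    // an aligned, negative SP adjustment for resize_frame.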
4811 __ neg(R3_RET, R3_RET); 4812 // align down resulting in a smaller negative offset 4813 __ clrrdi(R3_RET, R3_RET, exact_log2(frame::alignment_in_bytes)); 4814 DEBUG_ONLY(__ mr(tmp1, R1_SP);) 4815 __ resize_frame(R3_RET, tmp2); // make room for the thawed frames 4816 4817 __ li(R4_ARG2, kind); 4818 __ call_VM_leaf(Continuation::thaw_entry(), R16_thread, R4_ARG2); 4819 __ mr(R1_SP, R3_RET); // R3_RET contains the SP of the thawed top frame 4820 4821 if (return_barrier) { 4822 // we're now in the caller of the frame that returned to the barrier 4823 __ mr(R3_RET, nvtmp); __ fmr(F1_RET, nvftmp); // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK) 4824 } else { 4825 // we're now on the yield frame (which is in an address above us b/c rsp has been pushed down) 4826 __ li(R3_RET, 0); // return 0 (success) from doYield 4827 } 4828 4829 if (return_barrier_exception) { 4830 Register ex_pc = R17_tos; // nonvolatile register 4831 __ ld(ex_pc, _abi0(lr), R1_SP); // LR 4832 __ mr(nvtmp, R3_RET); // save return value containing the exception oop 4833 // The thawed top frame has got a frame::java_abi. This is not sufficient for the runtime call. 4834 __ push_frame_reg_args(0, tmp1); 4835 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), R16_thread, ex_pc); 4836 __ mtlr(R3_RET); // the exception handler 4837 __ pop_frame(); 4838 // See OptoRuntime::generate_exception_blob for register arguments 4839 __ mr(R3_ARG1, nvtmp); // exception oop 4840 __ mr(R4_ARG2, ex_pc); // exception pc 4841 } else { 4842 // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame 4843 __ ld(R0, _abi0(lr), R1_SP); // LR 4844 __ mtlr(R0); 4845 } 4846 __ blr(); 4847 4848 return start; 4849 } 4850 4851 address generate_cont_thaw() { 4852 return generate_cont_thaw(StubGenStubId::cont_thaw_id); 4853 } 4854 4855 // TODO: will probably need multiple return barriers depending on return type 4856 4857 address generate_cont_returnBarrier() { 4858 return generate_cont_thaw(StubGenStubId::cont_returnBarrier_id); 4859 } 4860 4861 address generate_cont_returnBarrier_exception() { 4862 return generate_cont_thaw(StubGenStubId::cont_returnBarrierExc_id); 4863 } 4864 4865 address generate_cont_preempt_stub() { 4866 if (!Continuations::enabled()) return nullptr; 4867 StubGenStubId stub_id = StubGenStubId::cont_preempt_id; 4868 StubCodeMark mark(this, stub_id); 4869 address start = __ pc(); 4870 4871 __ clobber_nonvolatile_registers(); // Except R16_thread and R29_TOC 4872 4873 __ reset_last_Java_frame(false /*check_last_java_sp*/); 4874 4875 // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap. 4876 __ ld_ptr(R1_SP, JavaThread::cont_entry_offset(), R16_thread); 4877 4878 Label preemption_cancelled; 4879 __ lbz(R11_scratch1, in_bytes(JavaThread::preemption_cancelled_offset()), R16_thread); 4880 __ cmpwi(CR0, R11_scratch1, 0); 4881 __ bne(CR0, preemption_cancelled); 4882 4883 // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount. 4884 SharedRuntime::continuation_enter_cleanup(_masm); 4885 __ pop_frame(); 4886 __ restore_LR(R11_scratch1); 4887 __ blr(); 4888 4889 // We acquired the monitor after freezing the frames so call thaw to continue execution. 
    __ bind(preemption_cancelled);
    __ li(R11_scratch1, 0); // false
    __ stb(R11_scratch1, in_bytes(JavaThread::preemption_cancelled_offset()), R16_thread);
    int simm16_offs = __ load_const_optimized(R11_scratch1, ContinuationEntry::thaw_call_pc_address(), R0, true);
    __ ld(R11_scratch1, simm16_offs, R11_scratch1);
    __ mtctr(R11_scratch1);
    __ bctr();

    return start;
  }

  // exception handler for upcall stubs
  address generate_upcall_stub_exception_handler() {
    StubGenStubId stub_id = StubGenStubId::upcall_stub_exception_handler_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    // The native caller has no way to handle exceptions,
    // so we just crash here. It is up to the callee to catch exceptions.
    __ verify_oop(R3_ARG1);
    __ load_const_optimized(R12_scratch2, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception), R0);
    __ call_c(R12_scratch2);
    __ should_not_reach_here();

    return start;
  }

  // load Method* target of MethodHandle
  // R3_ARG1 = jobject receiver
  // R19_method = result Method*
  address generate_upcall_stub_load_target() {

    StubGenStubId stub_id = StubGenStubId::upcall_stub_load_target_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    __ resolve_global_jobject(R3_ARG1, R22_tmp2, R23_tmp3, MacroAssembler::PRESERVATION_FRAME_LR_GP_FP_REGS);
    // Load target method from receiver:
    // MethodHandle.form -> LambdaForm.vmentry -> MemberName.method -> ResolvedMethodName.vmtarget
    __ load_heap_oop(R19_method, java_lang_invoke_MethodHandle::form_offset(), R3_ARG1,
                     R22_tmp2, R23_tmp3, MacroAssembler::PRESERVATION_FRAME_LR_GP_FP_REGS, IS_NOT_NULL);
    __ load_heap_oop(R19_method, java_lang_invoke_LambdaForm::vmentry_offset(), R19_method,
                     R22_tmp2, R23_tmp3, MacroAssembler::PRESERVATION_FRAME_LR_GP_FP_REGS, IS_NOT_NULL);
    __ load_heap_oop(R19_method, java_lang_invoke_MemberName::method_offset(), R19_method,
                     R22_tmp2, R23_tmp3, MacroAssembler::PRESERVATION_FRAME_LR_GP_FP_REGS, IS_NOT_NULL);
    __ ld(R19_method, java_lang_invoke_ResolvedMethodName::vmtarget_offset(), R19_method);
    __ std(R19_method, in_bytes(JavaThread::callee_target_offset()), R16_thread); // in case the callee is deoptimized

    __ blr();

    return start;
  }

  // Initialization
  void generate_preuniverse_stubs() {
    // Pre-universe stubs are not needed on PPC.
  }

  void generate_initial_stubs() {
    // Generates all stubs and initializes the entry points

    // Entry points that exist in all platforms.
    // Note: This code could be shared among different platforms; however, the
    //       benefit seems smaller than the disadvantage of a much more
    //       complicated generator structure. See also the comment in
    //       stubRoutines.hpp.

    StubRoutines::_forward_exception_entry = generate_forward_exception();
    StubRoutines::_call_stub_entry         = generate_call_stub(StubRoutines::_call_stub_return_address);
    StubRoutines::_catch_exception_entry   = generate_catch_exception();

    if (UnsafeMemoryAccess::_table == nullptr) {
      UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
    }

    // CRC32 Intrinsics.
    if (UseCRC32Intrinsics) {
      StubRoutines::_crc_table_adr = StubRoutines::ppc::generate_crc_constants(REVERSE_CRC32_POLY);
      StubRoutines::_updateBytesCRC32 = generate_CRC32_updateBytes(StubGenStubId::updateBytesCRC32_id);
    }

    // CRC32C Intrinsics.
    if (UseCRC32CIntrinsics) {
      StubRoutines::_crc32c_table_addr = StubRoutines::ppc::generate_crc_constants(REVERSE_CRC32C_POLY);
      StubRoutines::_updateBytesCRC32C = generate_CRC32_updateBytes(StubGenStubId::updateBytesCRC32C_id);
    }

    if (VM_Version::supports_float16()) {
      // For consistent results, both intrinsics should be enabled.
      StubRoutines::_hf2f = generate_float16ToFloat();
      StubRoutines::_f2hf = generate_floatToFloat16();
    }
  }

  void generate_continuation_stubs() {
    // Continuation stubs:
    StubRoutines::_cont_thaw             = generate_cont_thaw();
    StubRoutines::_cont_returnBarrier    = generate_cont_returnBarrier();
    StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
    StubRoutines::_cont_preempt_stub     = generate_cont_preempt_stub();
  }

  void generate_final_stubs() {
    // Generates all stubs and initializes the entry points

    // support for verify_oop (must happen after universe_init)
    StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();

    // nmethod entry barriers for concurrent class unloading
    StubRoutines::_method_entry_barrier = generate_method_entry_barrier();

    // arraycopy stubs used by compilers
    generate_arraycopy_stubs();

#ifdef COMPILER2
    if (UseSecondarySupersTable) {
      StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
      if (!InlineSecondarySupersTest) {
        generate_lookup_secondary_supers_table_stub();
      }
    }
#endif // COMPILER2

    StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
    StubRoutines::_upcall_stub_load_target       = generate_upcall_stub_load_target();
  }

  void generate_compiler_stubs() {
#if COMPILER2_OR_JVMCI

#ifdef COMPILER2
    if (UseMultiplyToLenIntrinsic) {
      StubRoutines::_multiplyToLen = generate_multiplyToLen();
    }
    if (UseSquareToLenIntrinsic) {
      StubRoutines::_squareToLen = generate_squareToLen();
    }
    if (UseMulAddIntrinsic) {
      StubRoutines::_mulAdd = generate_mulAdd();
    }
    if (UseMontgomeryMultiplyIntrinsic) {
      StubRoutines::_montgomeryMultiply
        = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_multiply);
    }
    if (UseMontgomerySquareIntrinsic) {
      StubRoutines::_montgomerySquare
        = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square);
    }
#endif

    // data cache line writeback
    if (VM_Version::supports_data_cache_line_flush()) {
      StubRoutines::_data_cache_writeback      = generate_data_cache_writeback();
      StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
    }

    if (UseGHASHIntrinsics) {
      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
    }

    if (UseAESIntrinsics) {
      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
    }

    if (UseSHA256Intrinsics) {
      StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(StubGenStubId::sha256_implCompress_id);
      StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(StubGenStubId::sha256_implCompressMB_id);
    }
    if (UseSHA512Intrinsics) {
      StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(StubGenStubId::sha512_implCompress_id);
      StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(StubGenStubId::sha512_implCompressMB_id);
    }

#ifdef VM_LITTLE_ENDIAN
    // Currently supported on PPC64LE only
    if (UseBASE64Intrinsics) {
      StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
      StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
    }
#endif
#endif // COMPILER2_OR_JVMCI
  }

 public:
  StubGenerator(CodeBuffer* code, StubGenBlobId blob_id) : StubCodeGenerator(code, blob_id) {
    switch (blob_id) {
    case preuniverse_id:
      generate_preuniverse_stubs();
      break;
    case initial_id:
      generate_initial_stubs();
      break;
    case continuation_id:
      generate_continuation_stubs();
      break;
    case compiler_id:
      generate_compiler_stubs();
      break;
    case final_id:
      generate_final_stubs();
      break;
    default:
      fatal("unexpected blob id: %d", blob_id);
      break;
    };
  }
};

void StubGenerator_generate(CodeBuffer* code, StubGenBlobId blob_id) {
  StubGenerator g(code, blob_id);
}
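
// Usage sketch (informational comment only; the actual call sites live in the
// shared runtime code, not in this file): the platform-independent stub
// initialization is expected to create one code blob per StubGenBlobId and to
// drive this entry point once per blob, roughly like
//
//   CodeBuffer code(blob);  // blob sized for this generation stage
//   StubGenerator_generate(&code, StubGenBlobId::initial_id);
//
// so that constructing the StubGenerator above dispatches to the matching
// generate_*_stubs() group for that stage.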