1 /*
2 * Copyright (c) 1997, 2026, Oracle and/or its affiliates. All rights reserved.
3 * Copyright (c) 2012, 2025 SAP SE. All rights reserved.
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This code is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 only, as
8 * published by the Free Software Foundation.
9 *
10 * This code is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13 * version 2 for more details (a copy is included in the LICENSE file that
14 * accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License version
17 * 2 along with this work; if not, write to the Free Software Foundation,
18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19 *
20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21 * or visit www.oracle.com if you need additional information or have any
22 * questions.
23 *
24 */
25
26 #include "asm/macroAssembler.inline.hpp"
27 #include "compiler/oopMap.hpp"
28 #include "gc/shared/barrierSet.hpp"
29 #include "gc/shared/barrierSetAssembler.hpp"
30 #include "gc/shared/barrierSetNMethod.hpp"
31 #include "interpreter/interpreter.hpp"
32 #include "nativeInst_ppc.hpp"
33 #include "oops/instanceOop.hpp"
34 #include "oops/method.hpp"
35 #include "oops/objArrayKlass.hpp"
36 #include "oops/oop.inline.hpp"
37 #include "prims/methodHandles.hpp"
38 #include "prims/upcallLinker.hpp"
39 #include "runtime/continuation.hpp"
40 #include "runtime/continuationEntry.inline.hpp"
41 #include "runtime/frame.inline.hpp"
42 #include "runtime/handles.inline.hpp"
43 #include "runtime/javaThread.hpp"
44 #include "runtime/sharedRuntime.hpp"
45 #include "runtime/stubCodeGenerator.hpp"
46 #include "runtime/stubRoutines.hpp"
47 #include "runtime/vm_version.hpp"
48 #include "utilities/align.hpp"
49 #include "utilities/powerOfTwo.hpp"
50 #if INCLUDE_ZGC
51 #include "gc/z/zBarrierSetAssembler.hpp"
52 #endif
53
54 // Declaration and definition of StubGenerator (no .hpp file).
55 // For a more detailed description of the stub routine structure
56 // see the comment in stubRoutines.hpp.
57
// Shorthand: every "__ insn(...)" emits through the current MacroAssembler.
#define __ _masm->

// BLOCK_COMMENT annotates the generated code; it expands to nothing in
// PRODUCT builds.
#ifndef PRODUCT
#define BLOCK_COMMENT(str) __ block_comment(str)
#else
#define BLOCK_COMMENT(str) /* nothing */
#endif

// STUB_ENTRY yields the entry of a stub routine: the StubRoutines field itself
// under ABI_ELFv2, otherwise the entry() of the FunctionDescriptor it points to.
#ifdef ABI_ELFv2
#define STUB_ENTRY(name) StubRoutines::name
#else
#define STUB_ENTRY(name) ((FunctionDescriptor*)StubRoutines::name)->entry()
#endif
71
72 class StubGenerator: public StubCodeGenerator {
73 private:
74
75 // Call stubs are used to call Java from C
76 //
77 // Arguments:
78 //
79 // R3 - call wrapper address : address
80 // R4 - result : intptr_t*
81 // R5 - result type : BasicType
82 // R6 - method : Method
83 // R7 - frame mgr entry point : address
84 // R8 - parameter block : intptr_t*
85 // R9 - parameter count in words : int
86 // R10 - thread : Thread*
87 //
88 address generate_call_stub(address& return_address) {
89 // Setup a new c frame, copy java arguments, call template interpreter or
90 // native_entry, and process result.
91
92 StubId stub_id = StubId::stubgen_call_stub_id;
93 StubCodeMark mark(this, stub_id);
94
95 address start = __ function_entry();
96
97 int save_nonvolatile_registers_size = __ save_nonvolatile_registers_size(true, SuperwordUseVSX);
98
99 // some sanity checks
100 STATIC_ASSERT(StackAlignmentInBytes == 16);
101 assert((sizeof(frame::native_abi_minframe) % 16) == 0, "unaligned");
102 assert((sizeof(frame::native_abi_reg_args) % 16) == 0, "unaligned");
103 assert((save_nonvolatile_registers_size % 16) == 0, "unaligned");
104 assert((sizeof(frame::parent_ijava_frame_abi) % 16) == 0, "unaligned");
105 assert((sizeof(frame::entry_frame_locals) % 16) == 0, "unaligned");
106
107 Register r_arg_call_wrapper_addr = R3;
108 Register r_arg_result_addr = R4;
109 Register r_arg_result_type = R5;
110 Register r_arg_method = R6;
111 Register r_arg_entry = R7;
112 Register r_arg_argument_addr = R8;
113 Register r_arg_argument_count = R9;
114 Register r_arg_thread = R10;
115
116 Register r_entryframe_fp = R2; // volatile
117 Register r_argument_size = R11_scratch1; // volatile
118 Register r_top_of_arguments_addr = R21_tmp1;
119
120 {
121 // Stack on entry to call_stub:
122 //
123 // F1 [C_FRAME]
124 // ...
125 Register r_frame_size = R12_scratch2; // volatile
126 Label arguments_copied;
127
128 // Save LR/CR to caller's C_FRAME.
129 __ save_LR_CR(R0);
130
131 // Keep copy of our frame pointer (caller's SP).
132 __ mr(r_entryframe_fp, R1_SP);
133
134 // calculate frame size
135 STATIC_ASSERT(Interpreter::logStackElementSize == 3);
136
137 // space for arguments aligned up: ((arg_count + 1) * 8) &~ 15
138 __ addi(r_frame_size, r_arg_argument_count, 1);
139 __ rldicr(r_frame_size, r_frame_size, 3, 63 - 4);
140
141 // this is the pure space for arguments (excluding alignment padding)
142 __ sldi(r_argument_size, r_arg_argument_count, 3);
143
144 __ addi(r_frame_size, r_frame_size,
145 save_nonvolatile_registers_size + frame::entry_frame_locals_size + frame::top_ijava_frame_abi_size);
146
147 // push ENTRY_FRAME
148 __ push_frame(r_frame_size, R0);
149
150 // Save non-volatiles registers to ENTRY_FRAME.
151 __ save_nonvolatile_registers(r_entryframe_fp, -(frame::entry_frame_locals_size + save_nonvolatile_registers_size),
152 true, SuperwordUseVSX);
153
154 BLOCK_COMMENT("Push ENTRY_FRAME including arguments");
155 // Push ENTRY_FRAME including arguments:
156 //
157 // F0 [TOP_IJAVA_FRAME_ABI]
158 // alignment (optional)
159 // [outgoing Java arguments]
160 // [non-volatiles]
161 // [ENTRY_FRAME_LOCALS]
162 // F1 [C_FRAME]
163 // ...
164
165 // initialize call_stub locals (step 1)
166 __ std(r_arg_call_wrapper_addr, _entry_frame_locals_neg(call_wrapper_address), r_entryframe_fp);
167 __ std(r_arg_result_addr, _entry_frame_locals_neg(result_address), r_entryframe_fp);
168 __ std(r_arg_result_type, _entry_frame_locals_neg(result_type), r_entryframe_fp);
169 // we will save arguments_tos_address later
170
171 BLOCK_COMMENT("Copy Java arguments");
172 // copy Java arguments
173
174 // Calculate top_of_arguments_addr which will be R17_tos (not prepushed) later.
175 __ addi(r_top_of_arguments_addr, r_entryframe_fp,
176 -(save_nonvolatile_registers_size + frame::entry_frame_locals_size));
177 __ sub(r_top_of_arguments_addr, r_top_of_arguments_addr, r_argument_size);
178
179 // any arguments to copy?
180 __ cmpdi(CR0, r_arg_argument_count, 0);
181 __ beq(CR0, arguments_copied);
182
183 // prepare loop and copy arguments in reverse order
184 {
185 Register r_argument_addr = R22_tmp2;
186 Register r_argumentcopy_addr = R23_tmp3;
187 // init CTR with arg_argument_count
188 __ mtctr(r_arg_argument_count);
189
190 // let r_argumentcopy_addr point to last outgoing Java arguments P
191 __ mr(r_argumentcopy_addr, r_top_of_arguments_addr);
192
193 // let r_argument_addr point to last incoming java argument
194 __ add(r_argument_addr, r_arg_argument_addr, r_argument_size);
195 __ addi(r_argument_addr, r_argument_addr, -BytesPerWord);
196
197 // now loop while CTR > 0 and copy arguments
198 {
199 Label next_argument;
200 __ bind(next_argument);
201
202 __ ld(R0, 0, r_argument_addr);
203 // argument_addr--;
204 __ addi(r_argument_addr, r_argument_addr, -BytesPerWord);
205 __ std(R0, 0, r_argumentcopy_addr);
206 // argumentcopy_addr++;
207 __ addi(r_argumentcopy_addr, r_argumentcopy_addr, BytesPerWord);
208
209 __ bdnz(next_argument);
210 }
211 }
212
213 // Arguments copied, continue.
214 __ bind(arguments_copied);
215 }
216
217 {
218 BLOCK_COMMENT("Call template interpreter or native entry.");
219 assert_different_registers(r_arg_entry, r_top_of_arguments_addr, r_arg_method, r_arg_thread);
220
221 // Register state on entry to template interpreter / native entry:
222 //
223 // tos - intptr_t* sender tos (prepushed) Lesp = (SP) + copied_arguments_offset - 8
224 // R19_method - Method
225 // R16_thread - JavaThread*
226
227 // Tos must point to last argument - element_size.
228 const Register tos = R15_esp;
229
230 __ addi(tos, r_top_of_arguments_addr, -Interpreter::stackElementSize);
231
232 // initialize call_stub locals (step 2)
233 // now save tos as arguments_tos_address
234 __ std(tos, _entry_frame_locals_neg(arguments_tos_address), r_entryframe_fp);
235
236 // load argument registers for call
237 __ mr(R19_method, r_arg_method);
238 __ mr(R16_thread, r_arg_thread);
239 assert(tos != r_arg_method, "trashed r_arg_method");
240 assert(tos != r_arg_thread && R19_method != r_arg_thread, "trashed r_arg_thread");
241
242 // Set R15_prev_state to 0 for simplifying checks in callee.
243 __ load_const_optimized(R25_templateTableBase, (address)Interpreter::dispatch_table((TosState)0), R0);
244 // Stack on entry to template interpreter / native entry:
245 //
246 // F0 [TOP_IJAVA_FRAME_ABI]
247 // alignment (optional)
248 // [outgoing Java arguments]
249 // [non-volatiles]
250 // [ENTRY_FRAME_LOCALS]
251 // F1 [C_FRAME]
252 // ...
253 //
254
255 // global toc register
256 __ load_const_optimized(R29_TOC, MacroAssembler::global_toc(), R0);
257 // Remember the senderSP so we interpreter can pop c2i arguments off of the stack
258 // when called via a c2i.
259
260 // Pass initial_caller_sp to framemanager.
261 __ mr(R21_sender_SP, R1_SP);
262
263 // Do a light-weight C-call here, r_arg_entry holds the address
264 // of the interpreter entry point (template interpreter or native entry)
265 // and save runtime-value of LR in return_address.
266 assert(r_arg_entry != tos && r_arg_entry != R19_method && r_arg_entry != R16_thread,
267 "trashed r_arg_entry");
268 return_address = __ call_stub(r_arg_entry);
269 }
270
271 {
272 BLOCK_COMMENT("Returned from template interpreter or native entry.");
273 // Now pop frame, process result, and return to caller.
274
275 // Stack on exit from template interpreter / native entry:
276 //
277 // F0 [ABI]
278 // ...
279 // [non-volatiles]
280 // [ENTRY_FRAME_LOCALS]
281 // F1 [C_FRAME]
282 // ...
283 //
284 // Just pop the topmost frame ...
285 //
286
287 Label ret_is_object;
288 Label ret_is_long;
289 Label ret_is_float;
290 Label ret_is_double;
291
292 Register r_lr = R11_scratch1;
293 Register r_cr = R12_scratch2;
294
295 // Reload some volatile registers which we've spilled before the call
296 // to template interpreter / native entry.
297 // Access all locals via frame pointer, because we know nothing about
298 // the topmost frame's size.
299 __ ld(r_entryframe_fp, _abi0(callers_sp), R1_SP); // restore after call
300 assert_different_registers(r_entryframe_fp, R3_RET, r_arg_result_addr, r_arg_result_type, r_cr, r_lr);
301 __ ld(r_arg_result_addr, _entry_frame_locals_neg(result_address), r_entryframe_fp);
302 __ ld(r_arg_result_type, _entry_frame_locals_neg(result_type), r_entryframe_fp);
303 __ ld(r_cr, _abi0(cr), r_entryframe_fp);
304 __ ld(r_lr, _abi0(lr), r_entryframe_fp);
305 __ mtcr(r_cr); // restore CR
306 __ mtlr(r_lr); // restore LR
307
308 // Store result depending on type. Everything that is not
309 // T_OBJECT, T_LONG, T_FLOAT, or T_DOUBLE is treated as T_INT.
310 // Using volatile CRs.
311 __ cmpwi(CR1, r_arg_result_type, T_OBJECT);
312 __ cmpwi(CR5, r_arg_result_type, T_LONG);
313 __ cmpwi(CR6, r_arg_result_type, T_FLOAT);
314 __ cmpwi(CR7, r_arg_result_type, T_DOUBLE);
315
316 __ pop_cont_fastpath(); // kills CR0, uses R16_thread
317
318 // restore non-volatile registers
319 __ restore_nonvolatile_registers(r_entryframe_fp, -(frame::entry_frame_locals_size + save_nonvolatile_registers_size),
320 true, SuperwordUseVSX);
321
322 // pop frame
323 __ mr(R1_SP, r_entryframe_fp);
324
325 // Stack on exit from call_stub:
326 //
327 // 0 [C_FRAME]
328 // ...
329 //
330 // no call_stub frames left.
331
332 __ beq(CR1, ret_is_object);
333 __ beq(CR5, ret_is_long);
334 __ beq(CR6, ret_is_float);
335 __ beq(CR7, ret_is_double);
336
337 // default:
338 __ stw(R3_RET, 0, r_arg_result_addr);
339 __ blr(); // return to caller
340
341 // case T_OBJECT:
342 // case T_LONG:
343 __ bind(ret_is_object);
344 __ bind(ret_is_long);
345 __ std(R3_RET, 0, r_arg_result_addr);
346 __ blr(); // return to caller
347
348 // case T_FLOAT:
349 __ bind(ret_is_float);
350 __ stfs(F1_RET, 0, r_arg_result_addr);
351 __ blr(); // return to caller
352
353 // case T_DOUBLE:
354 __ bind(ret_is_double);
355 __ stfd(F1_RET, 0, r_arg_result_addr);
356 __ blr(); // return to caller
357 }
358
359 return start;
360 }
361
362 // Return point for a Java call if there's an exception thrown in
363 // Java code. The exception is caught and transformed into a
364 // pending exception stored in JavaThread that can be tested from
365 // within the VM.
366 //
367 address generate_catch_exception() {
368 StubId stub_id = StubId::stubgen_catch_exception_id;
369 StubCodeMark mark(this, stub_id);
370
371 address start = __ pc();
372
373 // Registers alive
374 //
375 // R16_thread
376 // R3_ARG1 - address of pending exception
377 // R4_ARG2 - return address in call stub
378
379 const Register exception_file = R21_tmp1;
380 const Register exception_line = R22_tmp2;
381
382 __ load_const(exception_file, (void*)__FILE__);
383 __ load_const(exception_line, (void*)__LINE__);
384
385 __ std(R3_ARG1, in_bytes(JavaThread::pending_exception_offset()), R16_thread);
386 // store into `char *'
387 __ std(exception_file, in_bytes(JavaThread::exception_file_offset()), R16_thread);
388 // store into `int'
389 __ stw(exception_line, in_bytes(JavaThread::exception_line_offset()), R16_thread);
390
391 // complete return to VM
392 assert(StubRoutines::_call_stub_return_address != nullptr, "must have been generated before");
393
394 __ mtlr(R4_ARG2);
395 // continue in call stub
396 __ blr();
397
398 return start;
399 }
400
401 // Continuation point for runtime calls returning with a pending
402 // exception. The pending exception check happened in the runtime
403 // or native call stub. The pending exception in Thread is
404 // converted into a Java-level exception.
405 //
406 // Read:
407 //
408 // LR: The pc the runtime library callee wants to return to.
409 // Since the exception occurred in the callee, the return pc
410 // from the point of view of Java is the exception pc.
411 // thread: Needed for method handles.
412 //
413 // Invalidate:
414 //
415 // volatile registers (except below).
416 //
417 // Update:
418 //
419 // R4_ARG2: exception
420 //
421 // (LR is unchanged and is live out).
422 //
423 address generate_forward_exception() {
424 StubId stub_id = StubId::stubgen_forward_exception_id;
425 StubCodeMark mark(this, stub_id);
426 address start = __ pc();
427
428 if (VerifyOops) {
429 // Get pending exception oop.
430 __ ld(R3_ARG1,
431 in_bytes(Thread::pending_exception_offset()),
432 R16_thread);
433 // Make sure that this code is only executed if there is a pending exception.
434 {
435 Label L;
436 __ cmpdi(CR0, R3_ARG1, 0);
437 __ bne(CR0, L);
438 __ stop("StubRoutines::forward exception: no pending exception (1)");
439 __ bind(L);
440 }
441 __ verify_oop(R3_ARG1, "StubRoutines::forward exception: not an oop");
442 }
443
444 // Save LR/CR and copy exception pc (LR) into R4_ARG2.
445 __ save_LR(R4_ARG2);
446 __ push_frame_reg_args(0, R0);
447 // Find exception handler.
448 __ call_VM_leaf(CAST_FROM_FN_PTR(address,
449 SharedRuntime::exception_handler_for_return_address),
450 R16_thread,
451 R4_ARG2);
452 // Copy handler's address.
453 __ mtctr(R3_RET);
454 __ pop_frame();
455 __ restore_LR(R0);
456
457 // Set up the arguments for the exception handler:
458 // - R3_ARG1: exception oop
459 // - R4_ARG2: exception pc.
460
461 // Load pending exception oop.
462 __ ld(R3_ARG1,
463 in_bytes(Thread::pending_exception_offset()),
464 R16_thread);
465
466 // The exception pc is the return address in the caller.
467 // Must load it into R4_ARG2.
468 __ mflr(R4_ARG2);
469
470 #ifdef ASSERT
471 // Make sure exception is set.
472 {
473 Label L;
474 __ cmpdi(CR0, R3_ARG1, 0);
475 __ bne(CR0, L);
476 __ stop("StubRoutines::forward exception: no pending exception (2)");
477 __ bind(L);
478 }
479 #endif
480
481 // Clear the pending exception.
482 __ li(R0, 0);
483 __ std(R0,
484 in_bytes(Thread::pending_exception_offset()),
485 R16_thread);
486 // Jump to exception handler.
487 __ bctr();
488
489 return start;
490 }
491
492 #undef __
493 #define __ _masm->
494
495 #if !defined(PRODUCT)
496 // Wrapper which calls oopDesc::is_oop_or_null()
497 // Only called by MacroAssembler::verify_oop
498 static void verify_oop_helper(const char* message, oopDesc* o) {
499 if (!oopDesc::is_oop_or_null(o)) {
500 fatal("%s. oop: " PTR_FORMAT, message, p2i(o));
501 }
502 ++ StubRoutines::_verify_oop_count;
503 }
504 #endif
505
506 // Return address of code to be called from code generated by
507 // MacroAssembler::verify_oop.
508 //
509 // Don't generate, rather use C++ code.
510 address generate_verify_oop() {
511 // this is actually a `FunctionDescriptor*'.
512 address start = nullptr;
513
514 #if !defined(PRODUCT)
515 start = CAST_FROM_FN_PTR(address, verify_oop_helper);
516 #endif
517
518 return start;
519 }
520
521 // Computes the Galois/Counter Mode (GCM) product and reduction.
522 //
523 // This function performs polynomial multiplication of the subkey H with
524 // the current GHASH state using vectorized polynomial multiplication (`vpmsumd`).
525 // The subkey H is divided into lower, middle, and higher halves.
526 // The multiplication results are reduced using `vConstC2` to stay within GF(2^128).
527 // The final computed value is stored back into `vState`.
528 static void computeGCMProduct(MacroAssembler* _masm,
529 VectorRegister vLowerH, VectorRegister vH, VectorRegister vHigherH,
530 VectorRegister vConstC2, VectorRegister vZero, VectorRegister vState,
531 VectorRegister vLowProduct, VectorRegister vMidProduct, VectorRegister vHighProduct,
532 VectorRegister vReducedLow, VectorRegister vTmp8, VectorRegister vTmp9,
533 VectorRegister vCombinedResult, VectorRegister vSwappedH) {
534 __ vxor(vH, vH, vState);
535 __ vpmsumd(vLowProduct, vLowerH, vH); // L : Lower Half of subkey H
536 __ vpmsumd(vMidProduct, vSwappedH, vH); // M : Combined halves of subkey H
537 __ vpmsumd(vHighProduct, vHigherH, vH); // H : Higher Half of subkey H
538 __ vpmsumd(vReducedLow, vLowProduct, vConstC2); // Reduction
539 __ vsldoi(vTmp8, vMidProduct, vZero, 8); // mL : Extract the lower 64 bits of M
540 __ vsldoi(vTmp9, vZero, vMidProduct, 8); // mH : Extract the higher 64 bits of M
541 __ vxor(vLowProduct, vLowProduct, vTmp8); // LL + mL : Partial result for lower half
542 __ vxor(vHighProduct, vHighProduct, vTmp9); // HH + mH : Partial result for upper half
543 __ vsldoi(vLowProduct, vLowProduct, vLowProduct, 8); // Swap
544 __ vxor(vLowProduct, vLowProduct, vReducedLow);
545 __ vsldoi(vCombinedResult, vLowProduct, vLowProduct, 8); // Swap
546 __ vpmsumd(vLowProduct, vLowProduct, vConstC2); // Reduction using constant
547 __ vxor(vCombinedResult, vCombinedResult, vHighProduct); // Combine reduced Low & High products
548 __ vxor(vState, vLowProduct, vCombinedResult);
549 }
550
551 // Generate stub for ghash process blocks.
552 //
553 // Arguments for generated stub:
554 // state: R3_ARG1 (long[] state)
555 // subkeyH: R4_ARG2 (long[] subH)
556 // data: R5_ARG3 (byte[] data)
557 // blocks: R6_ARG4 (number of 16-byte blocks to process)
558 //
559 // The polynomials are processed in bit-reflected order for efficiency reasons.
560 // This optimization leverages the structure of the Galois field arithmetic
561 // to minimize the number of bit manipulations required during multiplication.
562 // For an explanation of how this works, refer :
563 // Vinodh Gopal, Erdinc Ozturk, Wajdi Feghali, Jim Guilford, Gil Wolrich,
564 // Martin Dixon. "Optimized Galois-Counter-Mode Implementation on Intel®
565 // Architecture Processor"
566 // http://web.archive.org/web/20130609111954/http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/communications-ia-galois-counter-mode-paper.pdf
567 //
568 //
569 address generate_ghash_processBlocks() {
570 StubCodeMark mark(this, "StubRoutines", "ghash");
571 address start = __ function_entry();
572
573 // Registers for parameters
574 Register state = R3_ARG1; // long[] state
575 Register subkeyH = R4_ARG2; // long[] subH
576 Register data = R5_ARG3; // byte[] data
577 Register blocks = R6_ARG4;
578 Register temp1 = R8;
579 // Vector Registers
580 VectorRegister vZero = VR0;
581 VectorRegister vH = VR1;
582 VectorRegister vLowerH = VR2;
583 VectorRegister vHigherH = VR3;
584 VectorRegister vLowProduct = VR4;
585 VectorRegister vMidProduct = VR5;
586 VectorRegister vHighProduct = VR6;
587 VectorRegister vReducedLow = VR7;
588 VectorRegister vTmp8 = VR8;
589 VectorRegister vTmp9 = VR9;
590 VectorRegister vTmp10 = VR10;
591 VectorRegister vSwappedH = VR11;
592 VectorRegister vTmp12 = VR12;
593 VectorRegister loadOrder = VR13;
594 VectorRegister vHigh = VR14;
595 VectorRegister vLow = VR15;
596 VectorRegister vState = VR16;
597 VectorRegister vPerm = VR17;
598 VectorRegister vCombinedResult = VR18;
599 VectorRegister vConstC2 = VR19;
600
601 __ li(temp1, 0xc2);
602 __ sldi(temp1, temp1, 56);
603 __ vspltisb(vZero, 0);
604 __ mtvrd(vConstC2, temp1);
605 __ lxvd2x(vH->to_vsr(), subkeyH);
606 __ lxvd2x(vState->to_vsr(), state);
607 // Operations to obtain lower and higher bytes of subkey H.
608 __ vspltisb(vReducedLow, 1);
609 __ vspltisb(vTmp10, 7);
610 __ vsldoi(vTmp8, vZero, vReducedLow, 1); // 0x1
611 __ vor(vTmp8, vConstC2, vTmp8); // 0xC2...1
612 __ vsplt(vTmp9, 0, vH); // MSB of H
613 __ vsl(vH, vH, vReducedLow); // Carry = H<<7
614 __ vsrab(vTmp9, vTmp9, vTmp10);
615 __ vand(vTmp9, vTmp9, vTmp8); // Carry
616 __ vxor(vTmp10, vH, vTmp9);
617 __ vsldoi(vConstC2, vZero, vConstC2, 8);
618 __ vsldoi(vSwappedH, vTmp10, vTmp10, 8); // swap Lower and Higher Halves of subkey H
619 __ vsldoi(vLowerH, vZero, vSwappedH, 8); // H.L
620 __ vsldoi(vHigherH, vSwappedH, vZero, 8); // H.H
621 #ifdef ASSERT
622 __ cmpwi(CR0, blocks, 0); // Compare 'blocks' (R6_ARG4) with zero
623 __ asm_assert_ne("blocks should NOT be zero");
624 #endif
625 __ clrldi(blocks, blocks, 32);
626 __ mtctr(blocks);
627 __ lvsl(loadOrder, temp1);
628 #ifdef VM_LITTLE_ENDIAN
629 __ vspltisb(vTmp12, 0xf);
630 __ vxor(loadOrder, loadOrder, vTmp12);
631 #define LE_swap_bytes(x) __ vec_perm(x, x, x, loadOrder)
632 #else
633 #define LE_swap_bytes(x)
634 #endif
635
636 // This code performs Karatsuba multiplication in Galois fields to compute the GHASH operation.
637 //
638 // The Karatsuba method breaks the multiplication of two 128-bit numbers into smaller parts,
639 // performing three 128-bit multiplications and combining the results efficiently.
640 //
641 // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
642 // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
643 //
644 // Inputs:
645 // - vH: The data vector (state), containing both B0 (lower half) and B1 (higher half).
646 // - vLowerH: Lower half of the subkey H (A0).
647 // - vHigherH: Higher half of the subkey H (A1).
648 // - vConstC2: Constant used for reduction (for final processing).
649 //
650 // References:
651 // Shay Gueron, Michael E. Kounavis.
652 // "IntelĀ® Carry-Less Multiplication Instruction and its Usage for Computing the GCM Mode"
653 // https://web.archive.org/web/20110609115824/https://software.intel.com/file/24918
654 //
655 Label L_aligned_loop, L_store, L_unaligned_loop, L_initialize_unaligned_loop;
656 __ andi(temp1, data, 15);
657 __ cmpwi(CR0, temp1, 0);
658 __ bne(CR0, L_initialize_unaligned_loop);
659
660 __ bind(L_aligned_loop);
661 __ lvx(vH, temp1, data);
662 LE_swap_bytes(vH);
663 computeGCMProduct(_masm, vLowerH, vH, vHigherH, vConstC2, vZero, vState,
664 vLowProduct, vMidProduct, vHighProduct, vReducedLow, vTmp8, vTmp9, vCombinedResult, vSwappedH);
665 __ addi(data, data, 16);
666 __ bdnz(L_aligned_loop);
667 __ b(L_store);
668
669 __ bind(L_initialize_unaligned_loop);
670 __ li(temp1, 0);
671 __ lvsl(vPerm, temp1, data);
672 __ lvx(vHigh, temp1, data);
673 #ifdef VM_LITTLE_ENDIAN
674 __ vspltisb(vTmp12, -1);
675 __ vxor(vPerm, vPerm, vTmp12);
676 #endif
677 __ bind(L_unaligned_loop);
678 __ addi(data, data, 16);
679 __ lvx(vLow, temp1, data);
680 __ vec_perm(vH, vHigh, vLow, vPerm);
681 computeGCMProduct(_masm, vLowerH, vH, vHigherH, vConstC2, vZero, vState,
682 vLowProduct, vMidProduct, vHighProduct, vReducedLow, vTmp8, vTmp9, vCombinedResult, vSwappedH);
683 __ vmr(vHigh, vLow);
684 __ bdnz(L_unaligned_loop);
685
686 __ bind(L_store);
687 __ stxvd2x(vState->to_vsr(), state);
688 __ blr();
689
690 return start;
691 }
692 // -XX:+OptimizeFill : convert fill/copy loops into intrinsic
693 //
694 // The code is implemented(ported from sparc) as we believe it benefits JVM98, however
695 // tracing(-XX:+TraceOptimizeFill) shows the intrinsic replacement doesn't happen at all!
696 //
697 // Source code in function is_range_check_if() shows that OptimizeFill relaxed the condition
698 // for turning on loop predication optimization, and hence the behavior of "array range check"
699 // and "loop invariant check" could be influenced, which potentially boosted JVM98.
700 //
701 // Generate stub for a byte, short or int array fill (selected by stub_id).
702 // For the arrayof_* variants the "to" address is assumed to be heapword aligned.
703 //
704 // Arguments for generated stub:
705 // to: R3_ARG1
706 // value: R4_ARG2
707 // count: R5_ARG3 treated as signed
708 //
709 address generate_fill(StubId stub_id) {
710 BasicType t;
711 bool aligned;
712
713 switch (stub_id) {
714 case StubId::stubgen_jbyte_fill_id:
715 t = T_BYTE;
716 aligned = false;
717 break;
718 case StubId::stubgen_jshort_fill_id:
719 t = T_SHORT;
720 aligned = false;
721 break;
722 case StubId::stubgen_jint_fill_id:
723 t = T_INT;
724 aligned = false;
725 break;
726 case StubId::stubgen_arrayof_jbyte_fill_id:
727 t = T_BYTE;
728 aligned = true;
729 break;
730 case StubId::stubgen_arrayof_jshort_fill_id:
731 t = T_SHORT;
732 aligned = true;
733 break;
734 case StubId::stubgen_arrayof_jint_fill_id:
735 t = T_INT;
736 aligned = true;
737 break;
738 default:
739 ShouldNotReachHere();
740 }
741
742 StubCodeMark mark(this, stub_id);
743 address start = __ function_entry();
744
745 const Register to = R3_ARG1; // source array address
746 const Register value = R4_ARG2; // fill value
747 const Register count = R5_ARG3; // elements count
748 const Register temp = R6_ARG4; // temp register
749
750 //assert_clean_int(count, O3); // Make sure 'count' is clean int.
751
752 Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte;
753 Label L_fill_2_bytes, L_fill_4_bytes, L_fill_elements, L_fill_32_bytes;
754
755 int shift = -1;
756 switch (t) {
757 case T_BYTE:
758 shift = 2;
759 // Clone bytes (zero extend not needed because store instructions below ignore high order bytes).
760 __ rldimi(value, value, 8, 48); // 8 bit -> 16 bit
761 __ cmpdi(CR0, count, 2<<shift); // Short arrays (< 8 bytes) fill by element.
762 __ blt(CR0, L_fill_elements);
763 __ rldimi(value, value, 16, 32); // 16 bit -> 32 bit
764 break;
765 case T_SHORT:
766 shift = 1;
767 // Clone bytes (zero extend not needed because store instructions below ignore high order bytes).
768 __ rldimi(value, value, 16, 32); // 16 bit -> 32 bit
769 __ cmpdi(CR0, count, 2<<shift); // Short arrays (< 8 bytes) fill by element.
770 __ blt(CR0, L_fill_elements);
771 break;
772 case T_INT:
773 shift = 0;
774 __ cmpdi(CR0, count, 2<<shift); // Short arrays (< 8 bytes) fill by element.
775 __ blt(CR0, L_fill_4_bytes);
776 break;
777 default: ShouldNotReachHere();
778 }
779
780 if (!aligned && (t == T_BYTE || t == T_SHORT)) {
781 // Align source address at 4 bytes address boundary.
782 if (t == T_BYTE) {
783 // One byte misalignment happens only for byte arrays.
784 __ andi_(temp, to, 1);
785 __ beq(CR0, L_skip_align1);
786 __ stb(value, 0, to);
787 __ addi(to, to, 1);
788 __ addi(count, count, -1);
789 __ bind(L_skip_align1);
790 }
791 // Two bytes misalignment happens only for byte and short (char) arrays.
792 __ andi_(temp, to, 2);
793 __ beq(CR0, L_skip_align2);
794 __ sth(value, 0, to);
795 __ addi(to, to, 2);
796 __ addi(count, count, -(1 << (shift - 1)));
797 __ bind(L_skip_align2);
798 }
799
800 if (!aligned) {
801 // Align to 8 bytes, we know we are 4 byte aligned to start.
802 __ andi_(temp, to, 7);
803 __ beq(CR0, L_fill_32_bytes);
804 __ stw(value, 0, to);
805 __ addi(to, to, 4);
806 __ addi(count, count, -(1 << shift));
807 __ bind(L_fill_32_bytes);
808 }
809
810 __ li(temp, 8<<shift); // Prepare for 32 byte loop.
811 // Clone bytes int->long as above.
812 __ rldimi(value, value, 32, 0); // 32 bit -> 64 bit
813
814 Label L_check_fill_8_bytes;
815 // Fill 32-byte chunks.
816 __ subf_(count, temp, count);
817 __ blt(CR0, L_check_fill_8_bytes);
818
819 Label L_fill_32_bytes_loop;
820 __ align(32);
821 __ bind(L_fill_32_bytes_loop);
822
823 __ std(value, 0, to);
824 __ std(value, 8, to);
825 __ subf_(count, temp, count); // Update count.
826 __ std(value, 16, to);
827 __ std(value, 24, to);
828
829 __ addi(to, to, 32);
830 __ bge(CR0, L_fill_32_bytes_loop);
831
832 __ bind(L_check_fill_8_bytes);
833 __ add_(count, temp, count);
834 __ beq(CR0, L_exit);
835 __ addic_(count, count, -(2 << shift));
836 __ blt(CR0, L_fill_4_bytes);
837
838 //
839 // Length is too short, just fill 8 bytes at a time.
840 //
841 Label L_fill_8_bytes_loop;
842 __ bind(L_fill_8_bytes_loop);
843 __ std(value, 0, to);
844 __ addic_(count, count, -(2 << shift));
845 __ addi(to, to, 8);
846 __ bge(CR0, L_fill_8_bytes_loop);
847
848 // Fill trailing 4 bytes.
849 __ bind(L_fill_4_bytes);
850 __ andi_(temp, count, 1<<shift);
851 __ beq(CR0, L_fill_2_bytes);
852
853 __ stw(value, 0, to);
854 if (t == T_BYTE || t == T_SHORT) {
855 __ addi(to, to, 4);
856 // Fill trailing 2 bytes.
857 __ bind(L_fill_2_bytes);
858 __ andi_(temp, count, 1<<(shift-1));
859 __ beq(CR0, L_fill_byte);
860 __ sth(value, 0, to);
861 if (t == T_BYTE) {
862 __ addi(to, to, 2);
863 // Fill trailing byte.
864 __ bind(L_fill_byte);
865 __ andi_(count, count, 1);
866 __ beq(CR0, L_exit);
867 __ stb(value, 0, to);
868 } else {
869 __ bind(L_fill_byte);
870 }
871 } else {
872 __ bind(L_fill_2_bytes);
873 }
874 __ bind(L_exit);
875 __ blr();
876
877 // Handle copies less than 8 bytes. Int is handled elsewhere.
878 if (t == T_BYTE) {
879 __ bind(L_fill_elements);
880 Label L_fill_2, L_fill_4;
881 __ andi_(temp, count, 1);
882 __ beq(CR0, L_fill_2);
883 __ stb(value, 0, to);
884 __ addi(to, to, 1);
885 __ bind(L_fill_2);
886 __ andi_(temp, count, 2);
887 __ beq(CR0, L_fill_4);
888 __ stb(value, 0, to);
889 __ stb(value, 0, to);
890 __ addi(to, to, 2);
891 __ bind(L_fill_4);
892 __ andi_(temp, count, 4);
893 __ beq(CR0, L_exit);
894 __ stb(value, 0, to);
895 __ stb(value, 1, to);
896 __ stb(value, 2, to);
897 __ stb(value, 3, to);
898 __ blr();
899 }
900
901 if (t == T_SHORT) {
902 Label L_fill_2;
903 __ bind(L_fill_elements);
904 __ andi_(temp, count, 1);
905 __ beq(CR0, L_fill_2);
906 __ sth(value, 0, to);
907 __ addi(to, to, 2);
908 __ bind(L_fill_2);
909 __ andi_(temp, count, 2);
910 __ beq(CR0, L_exit);
911 __ sth(value, 0, to);
912 __ sth(value, 2, to);
913 __ blr();
914 }
915 return start;
916 }
917
918 inline void assert_positive_int(Register count) {
919 #ifdef ASSERT
920 __ srdi_(R0, count, 31);
921 __ asm_assert_eq("missing zero extend");
922 #endif
923 }
924
925 // Generate overlap test for array copy stubs.
926 //
927 // Input:
928 // R3_ARG1 - from
929 // R4_ARG2 - to
930 // R5_ARG3 - element count
931 //
932 void array_overlap_test(address no_overlap_target, int log2_elem_size) {
933 Register tmp1 = R6_ARG4;
934 Register tmp2 = R7_ARG5;
935
936 assert_positive_int(R5_ARG3);
937
938 __ subf(tmp1, R3_ARG1, R4_ARG2); // distance in bytes
939 __ sldi(tmp2, R5_ARG3, log2_elem_size); // size in bytes
940 __ cmpld(CR0, R3_ARG1, R4_ARG2); // Use unsigned comparison!
941 __ cmpld(CR1, tmp1, tmp2);
942 __ crnand(CR0, Assembler::less, CR1, Assembler::less);
943 // Overlaps if Src before dst and distance smaller than size.
944 // Branch to forward copy routine otherwise (within range of 32kB).
945 __ bc(Assembler::bcondCRbiIs1, Assembler::bi0(CR0, Assembler::less), no_overlap_target);
946
947 // need to copy backwards
948 }
949
950 // This is common errorexit stub for UnsafeMemoryAccess.
951 address generate_unsafecopy_common_error_exit() {
952 address start_pc = __ pc();
953 Register tmp1 = R6_ARG4;
954 // probably copy stub would have changed value reset it.
955 if (VM_Version::has_mfdscr()) {
956 __ load_const_optimized(tmp1, VM_Version::_dscr_val);
957 __ mtdscr(tmp1);
958 }
959 __ li(R3_RET, 0); // return 0
960 __ blr();
961 return start_pc;
962 }
963
964 // The guideline in the implementations of generate_disjoint_xxx_copy
965 // (xxx=byte,short,int,long,oop) is to copy as many elements as possible with
966 // single instructions, but to avoid alignment interrupts (see subsequent
967 // comment). Furthermore, we try to minimize misaligned access, even
968 // though they cause no alignment interrupt.
969 //
970 // In Big-Endian mode, the PowerPC architecture requires implementations to
971 // handle automatically misaligned integer halfword and word accesses,
972 // word-aligned integer doubleword accesses, and word-aligned floating-point
973 // accesses. Other accesses may or may not generate an Alignment interrupt
974 // depending on the implementation.
975 // Alignment interrupt handling may require on the order of hundreds of cycles,
976 // so every effort should be made to avoid misaligned memory values.
977 //
978 //
979 // Generate stub for disjoint byte copy. If "aligned" is true, the
980 // "from" and "to" addresses are assumed to be heapword aligned.
981 //
982 // Arguments for generated stub:
983 // from: R3_ARG1
984 // to: R4_ARG2
985 // count: R5_ARG3 treated as signed
986 //
987 address generate_disjoint_byte_copy(StubId stub_id) {
988 bool aligned;
989 switch (stub_id) {
990 case StubId::stubgen_jbyte_disjoint_arraycopy_id:
991 aligned = false;
992 break;
993 case StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id:
994 aligned = true;
995 break;
996 default:
997 ShouldNotReachHere();
998 }
999
1000 StubCodeMark mark(this, stub_id);
1001 address start = __ function_entry();
1002 assert_positive_int(R5_ARG3);
1003
1004 Register tmp1 = R6_ARG4;
1005 Register tmp2 = R7_ARG5;
1006 Register tmp3 = R8_ARG6;
1007 Register tmp4 = R9_ARG7;
1008
1009 VectorSRegister tmp_vsr1 = VSR1;
1010 VectorSRegister tmp_vsr2 = VSR2;
1011
1012 Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9, l_10;
1013 {
1014 // UnsafeMemoryAccess page error: continue at UnsafeMemoryAccess common_error_exit
1015 UnsafeMemoryAccessMark umam(this, !aligned, false);
1016
1017 // Don't try anything fancy if arrays don't have many elements.
1018 __ li(tmp3, 0);
1019 __ cmpwi(CR0, R5_ARG3, 17);
1020 __ ble(CR0, l_6); // copy 4 at a time
1021
1022 if (!aligned) {
1023 __ xorr(tmp1, R3_ARG1, R4_ARG2);
1024 __ andi_(tmp1, tmp1, 3);
1025 __ bne(CR0, l_6); // If arrays don't have the same alignment mod 4, do 4 element copy.
1026
1027 // Copy elements if necessary to align to 4 bytes.
1028 __ neg(tmp1, R3_ARG1); // Compute distance to alignment boundary.
1029 __ andi_(tmp1, tmp1, 3);
1030 __ beq(CR0, l_2);
1031
1032 __ subf(R5_ARG3, tmp1, R5_ARG3);
1033 __ bind(l_9);
1034 __ lbz(tmp2, 0, R3_ARG1);
1035 __ addic_(tmp1, tmp1, -1);
1036 __ stb(tmp2, 0, R4_ARG2);
1037 __ addi(R3_ARG1, R3_ARG1, 1);
1038 __ addi(R4_ARG2, R4_ARG2, 1);
1039 __ bne(CR0, l_9);
1040
1041 __ bind(l_2);
1042 }
1043
1044 // copy 8 elements at a time
1045 __ xorr(tmp2, R3_ARG1, R4_ARG2); // skip if src & dest have differing alignment mod 8
1046 __ andi_(tmp1, tmp2, 7);
1047 __ bne(CR0, l_7); // not same alignment -> to or from is aligned -> copy 8
1048
1049 // copy a 2-element word if necessary to align to 8 bytes
1050 __ andi_(R0, R3_ARG1, 7);
1051 __ beq(CR0, l_7);
1052
1053 __ lwzx(tmp2, R3_ARG1, tmp3);
1054 __ addi(R5_ARG3, R5_ARG3, -4);
1055 __ stwx(tmp2, R4_ARG2, tmp3);
1056 { // FasterArrayCopy
1057 __ addi(R3_ARG1, R3_ARG1, 4);
1058 __ addi(R4_ARG2, R4_ARG2, 4);
1059 }
1060 __ bind(l_7);
1061
1062 { // FasterArrayCopy
1063 __ cmpwi(CR0, R5_ARG3, 31);
1064 __ ble(CR0, l_6); // copy 2 at a time if less than 32 elements remain
1065
1066 __ srdi(tmp1, R5_ARG3, 5);
1067 __ andi_(R5_ARG3, R5_ARG3, 31);
1068 __ mtctr(tmp1);
1069
1070
1071 // Prefetch the data into the L2 cache.
1072 __ dcbt(R3_ARG1, 0);
1073
1074 // If supported set DSCR pre-fetch to deepest.
1075 if (VM_Version::has_mfdscr()) {
1076 __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
1077 __ mtdscr(tmp2);
1078 }
1079 __ li(tmp1, 16);
1080
1081 // Backbranch target aligned to 32-byte. Not 16-byte align as
1082 // loop contains < 8 instructions that fit inside a single
1083 // i-cache sector.
1084 __ align(32);
1085
1086 __ bind(l_10);
1087 // Use loop with VSX load/store instructions to
1088 // copy 32 elements a time.
1089 __ lxvd2x(tmp_vsr1, R3_ARG1); // Load src
1090 __ stxvd2x(tmp_vsr1, R4_ARG2); // Store to dst
1091 __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1); // Load src + 16
1092 __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
1093 __ addi(R3_ARG1, R3_ARG1, 32); // Update src+=32
1094 __ addi(R4_ARG2, R4_ARG2, 32); // Update dsc+=32
1095 __ bdnz(l_10); // Dec CTR and loop if not zero.
1096
1097 // Restore DSCR pre-fetch value.
1098 if (VM_Version::has_mfdscr()) {
1099 __ load_const_optimized(tmp2, VM_Version::_dscr_val);
1100 __ mtdscr(tmp2);
1101 }
1102
1103 } // FasterArrayCopy
1104
1105 __ bind(l_6);
1106
1107 // copy 4 elements at a time
1108 __ cmpwi(CR0, R5_ARG3, 4);
1109 __ blt(CR0, l_1);
1110 __ srdi(tmp1, R5_ARG3, 2);
1111 __ mtctr(tmp1); // is > 0
1112 __ andi_(R5_ARG3, R5_ARG3, 3);
1113
1114 { // FasterArrayCopy
1115 __ addi(R3_ARG1, R3_ARG1, -4);
1116 __ addi(R4_ARG2, R4_ARG2, -4);
1117 __ bind(l_3);
1118 __ lwzu(tmp2, 4, R3_ARG1);
1119 __ stwu(tmp2, 4, R4_ARG2);
1120 __ bdnz(l_3);
1121 __ addi(R3_ARG1, R3_ARG1, 4);
1122 __ addi(R4_ARG2, R4_ARG2, 4);
1123 }
1124
1125 // do single element copy
1126 __ bind(l_1);
1127 __ cmpwi(CR0, R5_ARG3, 0);
1128 __ beq(CR0, l_4);
1129
1130 { // FasterArrayCopy
1131 __ mtctr(R5_ARG3);
1132 __ addi(R3_ARG1, R3_ARG1, -1);
1133 __ addi(R4_ARG2, R4_ARG2, -1);
1134
1135 __ bind(l_5);
1136 __ lbzu(tmp2, 1, R3_ARG1);
1137 __ stbu(tmp2, 1, R4_ARG2);
1138 __ bdnz(l_5);
1139 }
1140 }
1141
1142 __ bind(l_4);
1143 __ li(R3_RET, 0); // return 0
1144 __ blr();
1145
1146 return start;
1147 }
1148
1149 // Generate stub for conjoint byte copy. If "aligned" is true, the
1150 // "from" and "to" addresses are assumed to be heapword aligned.
1151 //
1152 // Arguments for generated stub:
1153 // from: R3_ARG1
1154 // to: R4_ARG2
1155 // count: R5_ARG3 treated as signed
1156 //
1157 address generate_conjoint_byte_copy(StubId stub_id) {
1158 bool aligned;
1159 switch (stub_id) {
1160 case StubId::stubgen_jbyte_arraycopy_id:
1161 aligned = false;
1162 break;
1163 case StubId::stubgen_arrayof_jbyte_arraycopy_id:
1164 aligned = true;
1165 break;
1166 default:
1167 ShouldNotReachHere();
1168 }
1169
1170 StubCodeMark mark(this, stub_id);
1171 address start = __ function_entry();
1172 assert_positive_int(R5_ARG3);
1173
1174 Register tmp1 = R6_ARG4;
1175 Register tmp2 = R7_ARG5;
1176 Register tmp3 = R8_ARG6;
1177
1178 address nooverlap_target = aligned ?
1179 STUB_ENTRY(arrayof_jbyte_disjoint_arraycopy()) :
1180 STUB_ENTRY(jbyte_disjoint_arraycopy());
1181
1182 array_overlap_test(nooverlap_target, 0);
1183 // Do reverse copy. We assume the case of actual overlap is rare enough
1184 // that we don't have to optimize it.
1185 Label l_1, l_2;
1186 {
1187 // UnsafeMemoryAccess page error: continue at UnsafeMemoryAccess common_error_exit
1188 UnsafeMemoryAccessMark umam(this, !aligned, false);
1189 __ b(l_2);
1190 __ bind(l_1);
1191 __ stbx(tmp1, R4_ARG2, R5_ARG3);
1192 __ bind(l_2);
1193 __ addic_(R5_ARG3, R5_ARG3, -1);
1194 __ lbzx(tmp1, R3_ARG1, R5_ARG3);
1195 __ bge(CR0, l_1);
1196 }
1197 __ li(R3_RET, 0); // return 0
1198 __ blr();
1199
1200 return start;
1201 }
1202
1203 // Generate stub for disjoint short copy. If "aligned" is true, the
1204 // "from" and "to" addresses are assumed to be heapword aligned.
1205 //
1206 // Arguments for generated stub:
1207 // from: R3_ARG1
1208 // to: R4_ARG2
1209 // elm.count: R5_ARG3 treated as signed
1210 //
1211 // Strategy for aligned==true:
1212 //
1213 // If length <= 9:
1214 // 1. copy 2 elements at a time (l_6)
1215 // 2. copy last element if original element count was odd (l_1)
1216 //
1217 // If length > 9:
1218 // 1. copy 4 elements at a time until less than 4 elements are left (l_7)
1219 // 2. copy 2 elements at a time until less than 2 elements are left (l_6)
1220 // 3. copy last element if one was left in step 2. (l_1)
1221 //
1222 //
1223 // Strategy for aligned==false:
1224 //
1225 // If length <= 9: same as aligned==true case, but NOTE: load/stores
1226 // can be unaligned (see comment below)
1227 //
1228 // If length > 9:
1229 // 1. continue with step 6. if the alignment of from and to mod 4
1230 // is different.
1231 // 2. align from and to to 4 bytes by copying 1 element if necessary
1232 // 3. at l_2 from and to are 4 byte aligned; continue with
1233 // 5. if they cannot be aligned to 8 bytes because they have
1234 // got different alignment mod 8.
1235 // 4. at this point we know that both, from and to, have the same
1236 // alignment mod 8, now copy one element if necessary to get
1237 // 8 byte alignment of from and to.
1238 // 5. copy 4 elements at a time until less than 4 elements are
1239 // left; depending on step 3. all load/stores are aligned or
1240 // either all loads or all stores are unaligned.
1241 // 6. copy 2 elements at a time until less than 2 elements are
1242 // left (l_6); arriving here from step 1., there is a chance
1243 // that all accesses are unaligned.
1244 // 7. copy last element if one was left in step 6. (l_1)
1245 //
1246 // There are unaligned data accesses using integer load/store
1247 // instructions in this stub. POWER allows such accesses.
1248 //
1249 // According to the manuals (PowerISA_V2.06_PUBLIC, Book II,
1250 // Chapter 2: Effect of Operand Placement on Performance) unaligned
1251 // integer load/stores have good performance. Only unaligned
1252 // floating point load/stores can have poor performance.
1253 //
1254 // TODO:
1255 //
1256 // 1. check if aligning the backbranch target of loops is beneficial
1257 //
1258 address generate_disjoint_short_copy(StubId stub_id) {
1259 bool aligned;
1260 switch (stub_id) {
1261 case StubId::stubgen_jshort_disjoint_arraycopy_id:
1262 aligned = false;
1263 break;
1264 case StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id:
1265 aligned = true;
1266 break;
1267 default:
1268 ShouldNotReachHere();
1269 }
1270
1271 StubCodeMark mark(this, stub_id);
1272
1273 Register tmp1 = R6_ARG4;
1274 Register tmp2 = R7_ARG5;
1275 Register tmp3 = R8_ARG6;
1276 Register tmp4 = R9_ARG7;
1277
1278 VectorSRegister tmp_vsr1 = VSR1;
1279 VectorSRegister tmp_vsr2 = VSR2;
1280
1281 address start = __ function_entry();
1282 assert_positive_int(R5_ARG3);
1283
1284 Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9;
1285 {
1286 // UnsafeMemoryAccess page error: continue at UnsafeMemoryAccess common_error_exit
1287 UnsafeMemoryAccessMark umam(this, !aligned, false);
1288 // don't try anything fancy if arrays don't have many elements
1289 __ li(tmp3, 0);
1290 __ cmpwi(CR0, R5_ARG3, 9);
1291 __ ble(CR0, l_6); // copy 2 at a time
1292
1293 if (!aligned) {
1294 __ xorr(tmp1, R3_ARG1, R4_ARG2);
1295 __ andi_(tmp1, tmp1, 3);
1296 __ bne(CR0, l_6); // if arrays don't have the same alignment mod 4, do 2 element copy
1297
1298 // At this point it is guaranteed that both, from and to have the same alignment mod 4.
1299
1300 // Copy 1 element if necessary to align to 4 bytes.
1301 __ andi_(tmp1, R3_ARG1, 3);
1302 __ beq(CR0, l_2);
1303
1304 __ lhz(tmp2, 0, R3_ARG1);
1305 __ addi(R3_ARG1, R3_ARG1, 2);
1306 __ sth(tmp2, 0, R4_ARG2);
1307 __ addi(R4_ARG2, R4_ARG2, 2);
1308 __ addi(R5_ARG3, R5_ARG3, -1);
1309 __ bind(l_2);
1310
1311 // At this point the positions of both, from and to, are at least 4 byte aligned.
1312
1313 // Copy 4 elements at a time.
1314 // Align to 8 bytes, but only if both, from and to, have same alignment mod 8.
1315 __ xorr(tmp2, R3_ARG1, R4_ARG2);
1316 __ andi_(tmp1, tmp2, 7);
1317 __ bne(CR0, l_7); // not same alignment mod 8 -> copy 4, either from or to will be unaligned
1318
1319 // Copy a 2-element word if necessary to align to 8 bytes.
1320 __ andi_(R0, R3_ARG1, 7);
1321 __ beq(CR0, l_7);
1322
1323 __ lwzx(tmp2, R3_ARG1, tmp3);
1324 __ addi(R5_ARG3, R5_ARG3, -2);
1325 __ stwx(tmp2, R4_ARG2, tmp3);
1326 { // FasterArrayCopy
1327 __ addi(R3_ARG1, R3_ARG1, 4);
1328 __ addi(R4_ARG2, R4_ARG2, 4);
1329 }
1330 }
1331
1332 __ bind(l_7);
1333
1334 // Copy 4 elements at a time; either the loads or the stores can
1335 // be unaligned if aligned == false.
1336
1337 { // FasterArrayCopy
1338 __ cmpwi(CR0, R5_ARG3, 15);
1339 __ ble(CR0, l_6); // copy 2 at a time if less than 16 elements remain
1340
1341 __ srdi(tmp1, R5_ARG3, 4);
1342 __ andi_(R5_ARG3, R5_ARG3, 15);
1343 __ mtctr(tmp1);
1344
1345
1346 // Processor supports VSX, so use it to mass copy.
1347
1348 // Prefetch src data into L2 cache.
1349 __ dcbt(R3_ARG1, 0);
1350
1351 // If supported set DSCR pre-fetch to deepest.
1352 if (VM_Version::has_mfdscr()) {
1353 __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
1354 __ mtdscr(tmp2);
1355 }
1356 __ li(tmp1, 16);
1357
1358 // Backbranch target aligned to 32-byte. It's not aligned 16-byte
1359 // as loop contains < 8 instructions that fit inside a single
1360 // i-cache sector.
1361 __ align(32);
1362
1363 __ bind(l_9);
1364 // Use loop with VSX load/store instructions to
1365 // copy 16 elements a time.
1366 __ lxvd2x(tmp_vsr1, R3_ARG1); // Load from src.
1367 __ stxvd2x(tmp_vsr1, R4_ARG2); // Store to dst.
1368 __ lxvd2x(tmp_vsr2, R3_ARG1, tmp1); // Load from src + 16.
1369 __ stxvd2x(tmp_vsr2, R4_ARG2, tmp1); // Store to dst + 16.
1370 __ addi(R3_ARG1, R3_ARG1, 32); // Update src+=32.
1371 __ addi(R4_ARG2, R4_ARG2, 32); // Update dsc+=32.
1372 __ bdnz(l_9); // Dec CTR and loop if not zero.
1373
1374 // Restore DSCR pre-fetch value.
1375 if (VM_Version::has_mfdscr()) {
1376 __ load_const_optimized(tmp2, VM_Version::_dscr_val);
1377 __ mtdscr(tmp2);
1378 }
1379
1380 } // FasterArrayCopy
1381 __ bind(l_6);
1382
1383 // copy 2 elements at a time
1384 { // FasterArrayCopy
1385 __ cmpwi(CR0, R5_ARG3, 2);
1386 __ blt(CR0, l_1);
1387 __ srdi(tmp1, R5_ARG3, 1);
1388 __ andi_(R5_ARG3, R5_ARG3, 1);
1389
1390 __ addi(R3_ARG1, R3_ARG1, -4);
1391 __ addi(R4_ARG2, R4_ARG2, -4);
1392 __ mtctr(tmp1);
1393
1394 __ bind(l_3);
1395 __ lwzu(tmp2, 4, R3_ARG1);
1396 __ stwu(tmp2, 4, R4_ARG2);
1397 __ bdnz(l_3);
1398
1399 __ addi(R3_ARG1, R3_ARG1, 4);
1400 __ addi(R4_ARG2, R4_ARG2, 4);
1401 }
1402
1403 // do single element copy
1404 __ bind(l_1);
1405 __ cmpwi(CR0, R5_ARG3, 0);
1406 __ beq(CR0, l_4);
1407
1408 { // FasterArrayCopy
1409 __ mtctr(R5_ARG3);
1410 __ addi(R3_ARG1, R3_ARG1, -2);
1411 __ addi(R4_ARG2, R4_ARG2, -2);
1412
1413 __ bind(l_5);
1414 __ lhzu(tmp2, 2, R3_ARG1);
1415 __ sthu(tmp2, 2, R4_ARG2);
1416 __ bdnz(l_5);
1417 }
1418 }
1419
1420 __ bind(l_4);
1421 __ li(R3_RET, 0); // return 0
1422 __ blr();
1423
1424 return start;
1425 }
1426
1427 // Generate stub for conjoint short copy. If "aligned" is true, the
1428 // "from" and "to" addresses are assumed to be heapword aligned.
1429 //
1430 // Arguments for generated stub:
1431 // from: R3_ARG1
1432 // to: R4_ARG2
1433 // count: R5_ARG3 treated as signed
1434 //
1435 address generate_conjoint_short_copy(StubId stub_id) {
1436 bool aligned;
1437 switch (stub_id) {
1438 case StubId::stubgen_jshort_arraycopy_id:
1439 aligned = false;
1440 break;
1441 case StubId::stubgen_arrayof_jshort_arraycopy_id:
1442 aligned = true;
1443 break;
1444 default:
1445 ShouldNotReachHere();
1446 }
1447
1448 StubCodeMark mark(this, stub_id);
1449 address start = __ function_entry();
1450 assert_positive_int(R5_ARG3);
1451
1452 Register tmp1 = R6_ARG4;
1453 Register tmp2 = R7_ARG5;
1454 Register tmp3 = R8_ARG6;
1455
1456 address nooverlap_target = aligned ?
1457 STUB_ENTRY(arrayof_jshort_disjoint_arraycopy()) :
1458 STUB_ENTRY(jshort_disjoint_arraycopy());
1459
1460 array_overlap_test(nooverlap_target, 1);
1461
1462 Label l_1, l_2;
1463 {
1464 // UnsafeMemoryAccess page error: continue at UnsafeMemoryAccess common_error_exit
1465 UnsafeMemoryAccessMark umam(this, !aligned, false);
1466 __ sldi(tmp1, R5_ARG3, 1);
1467 __ b(l_2);
1468 __ bind(l_1);
1469 __ sthx(tmp2, R4_ARG2, tmp1);
1470 __ bind(l_2);
1471 __ addic_(tmp1, tmp1, -2);
1472 __ lhzx(tmp2, R3_ARG1, tmp1);
1473 __ bge(CR0, l_1);
1474 }
1475 __ li(R3_RET, 0); // return 0
1476 __ blr();
1477
1478 return start;
1479 }
1480
1481 // Generate core code for disjoint int copy (and oop copy on 32-bit). If "aligned"
1482 // is true, the "from" and "to" addresses are assumed to be heapword aligned.
1483 //
1484 // Arguments:
1485 // from: R3_ARG1
1486 // to: R4_ARG2
1487 // count: R5_ARG3 treated as signed
1488 //
1489 void generate_disjoint_int_copy_core(bool aligned) {
1490 Register tmp1 = R6_ARG4;
1491 Register tmp2 = R7_ARG5;
1492 Register tmp3 = R8_ARG6;
1493 Register tmp4 = R0;
1494
1495 VectorSRegister tmp_vsr1 = VSR1;
1496 VectorSRegister tmp_vsr2 = VSR2;
1497
1498 Label l_1, l_2, l_3, l_4, l_5, l_6, l_7;
1499
1500 // for short arrays, just do single element copy
1501 __ li(tmp3, 0);
1502 __ cmpwi(CR0, R5_ARG3, 5);
1503 __ ble(CR0, l_2);
1504
1505 if (!aligned) {
1506 // check if arrays have same alignment mod 8.
1507 __ xorr(tmp1, R3_ARG1, R4_ARG2);
1508 __ andi_(R0, tmp1, 7);
1509 // Not the same alignment, but ld and std just need to be 4 byte aligned.
1510 __ bne(CR0, l_4); // to OR from is 8 byte aligned -> copy 2 at a time
1511
1512 // copy 1 element to align to and from on an 8 byte boundary
1513 __ andi_(R0, R3_ARG1, 7);
1514 __ beq(CR0, l_4);
1515
1516 __ lwzx(tmp2, R3_ARG1, tmp3);
1517 __ addi(R5_ARG3, R5_ARG3, -1);
1518 __ stwx(tmp2, R4_ARG2, tmp3);
1519 { // FasterArrayCopy
1520 __ addi(R3_ARG1, R3_ARG1, 4);
1521 __ addi(R4_ARG2, R4_ARG2, 4);
1522 }
1523 __ bind(l_4);
1524 }
1525
1526 { // FasterArrayCopy
1527 __ cmpwi(CR0, R5_ARG3, 7);
1528 __ ble(CR0, l_2); // copy 1 at a time if less than 8 elements remain
1529
1530 __ srdi(tmp1, R5_ARG3, 3);
1531 __ andi_(R5_ARG3, R5_ARG3, 7);
1532 __ mtctr(tmp1);
1533
1534 // Processor supports VSX, so use it to mass copy.
1535
1536 // Prefetch the data into the L2 cache.
1537 __ dcbt(R3_ARG1, 0);
1538
1539 // Set DSCR pre-fetch to deepest.
1540 if (VM_Version::has_mfdscr()) {
1541 __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
1542 __ mtdscr(tmp2);
1543 }
1544 __ li(tmp1, 16);
1545
1546 // Backbranch target aligned to 32-byte. Not 16-byte align as
1547 // loop contains < 8 instructions that fit inside a single
1548 // i-cache sector.
1549 __ align(32);
1550
1551 __ bind(l_7);
1552 // Use loop with VSX load/store instructions to
1553 // copy 8 elements a time.
1554 __ lxvd2x(tmp_vsr1, R3_ARG1); // Load src
1555 __ stxvd2x(tmp_vsr1, R4_ARG2); // Store to dst
1556 __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1); // Load src + 16
1557 __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
1558 __ addi(R3_ARG1, R3_ARG1, 32); // Update src+=32
1559 __ addi(R4_ARG2, R4_ARG2, 32); // Update dsc+=32
1560 __ bdnz(l_7); // Dec CTR and loop if not zero.
1561
1562 // Restore DSCR pre-fetch value.
1563 if (VM_Version::has_mfdscr()) {
1564 __ load_const_optimized(tmp2, VM_Version::_dscr_val);
1565 __ mtdscr(tmp2);
1566 }
1567
1568 } // FasterArrayCopy
1569
1570 // copy 1 element at a time
1571 __ bind(l_2);
1572 __ cmpwi(CR0, R5_ARG3, 0);
1573 __ beq(CR0, l_1);
1574
1575 { // FasterArrayCopy
1576 __ mtctr(R5_ARG3);
1577 __ addi(R3_ARG1, R3_ARG1, -4);
1578 __ addi(R4_ARG2, R4_ARG2, -4);
1579
1580 __ bind(l_3);
1581 __ lwzu(tmp2, 4, R3_ARG1);
1582 __ stwu(tmp2, 4, R4_ARG2);
1583 __ bdnz(l_3);
1584 }
1585
1586 __ bind(l_1);
1587 return;
1588 }
1589
1590 // Generate stub for disjoint int copy. If "aligned" is true, the
1591 // "from" and "to" addresses are assumed to be heapword aligned.
1592 //
1593 // Arguments for generated stub:
1594 // from: R3_ARG1
1595 // to: R4_ARG2
1596 // count: R5_ARG3 treated as signed
1597 //
1598 address generate_disjoint_int_copy(StubId stub_id) {
1599 bool aligned;
1600 switch (stub_id) {
1601 case StubId::stubgen_jint_disjoint_arraycopy_id:
1602 aligned = false;
1603 break;
1604 case StubId::stubgen_arrayof_jint_disjoint_arraycopy_id:
1605 aligned = true;
1606 break;
1607 default:
1608 ShouldNotReachHere();
1609 }
1610
1611 StubCodeMark mark(this, stub_id);
1612 address start = __ function_entry();
1613 assert_positive_int(R5_ARG3);
1614 {
1615 // UnsafeMemoryAccess page error: continue at UnsafeMemoryAccess common_error_exit
1616 UnsafeMemoryAccessMark umam(this, !aligned, false);
1617 generate_disjoint_int_copy_core(aligned);
1618 }
1619 __ li(R3_RET, 0); // return 0
1620 __ blr();
1621 return start;
1622 }
1623
1624 // Generate core code for conjoint int copy (and oop copy on
1625 // 32-bit). If "aligned" is true, the "from" and "to" addresses
1626 // are assumed to be heapword aligned.
1627 //
1628 // Arguments:
1629 // from: R3_ARG1
1630 // to: R4_ARG2
1631 // count: R5_ARG3 treated as signed
1632 //
1633 void generate_conjoint_int_copy_core(bool aligned) {
1634 // Do reverse copy. We assume the case of actual overlap is rare enough
1635 // that we don't have to optimize it.
1636
1637 Label l_1, l_2, l_3, l_4, l_5, l_6, l_7;
1638
1639 Register tmp1 = R6_ARG4;
1640 Register tmp2 = R7_ARG5;
1641 Register tmp3 = R8_ARG6;
1642 Register tmp4 = R0;
1643
1644 VectorSRegister tmp_vsr1 = VSR1;
1645 VectorSRegister tmp_vsr2 = VSR2;
1646
1647 { // FasterArrayCopy
1648 __ cmpwi(CR0, R5_ARG3, 0);
1649 __ beq(CR0, l_6);
1650
1651 __ sldi(R5_ARG3, R5_ARG3, 2);
1652 __ add(R3_ARG1, R3_ARG1, R5_ARG3);
1653 __ add(R4_ARG2, R4_ARG2, R5_ARG3);
1654 __ srdi(R5_ARG3, R5_ARG3, 2);
1655
1656 if (!aligned) {
1657 // check if arrays have same alignment mod 8.
1658 __ xorr(tmp1, R3_ARG1, R4_ARG2);
1659 __ andi_(R0, tmp1, 7);
1660 // Not the same alignment, but ld and std just need to be 4 byte aligned.
1661 __ bne(CR0, l_7); // to OR from is 8 byte aligned -> copy 2 at a time
1662
1663 // copy 1 element to align to and from on an 8 byte boundary
1664 __ andi_(R0, R3_ARG1, 7);
1665 __ beq(CR0, l_7);
1666
1667 __ addi(R3_ARG1, R3_ARG1, -4);
1668 __ addi(R4_ARG2, R4_ARG2, -4);
1669 __ addi(R5_ARG3, R5_ARG3, -1);
1670 __ lwzx(tmp2, R3_ARG1);
1671 __ stwx(tmp2, R4_ARG2);
1672 __ bind(l_7);
1673 }
1674
1675 __ cmpwi(CR0, R5_ARG3, 7);
1676 __ ble(CR0, l_5); // copy 1 at a time if less than 8 elements remain
1677
1678 __ srdi(tmp1, R5_ARG3, 3);
1679 __ andi(R5_ARG3, R5_ARG3, 7);
1680 __ mtctr(tmp1);
1681
1682 // Processor supports VSX, so use it to mass copy.
1683 // Prefetch the data into the L2 cache.
1684 __ dcbt(R3_ARG1, 0);
1685
1686 // Set DSCR pre-fetch to deepest.
1687 if (VM_Version::has_mfdscr()) {
1688 __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
1689 __ mtdscr(tmp2);
1690 }
1691 __ li(tmp1, 16);
1692
1693 // Backbranch target aligned to 32-byte. Not 16-byte align as
1694 // loop contains < 8 instructions that fit inside a single
1695 // i-cache sector.
1696 __ align(32);
1697
1698 __ bind(l_4);
1699 // Use loop with VSX load/store instructions to
1700 // copy 8 elements a time.
1701 __ addi(R3_ARG1, R3_ARG1, -32); // Update src-=32
1702 __ addi(R4_ARG2, R4_ARG2, -32); // Update dsc-=32
1703 __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1); // Load src+16
1704 __ lxvd2x(tmp_vsr1, R3_ARG1); // Load src
1705 __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst+16
1706 __ stxvd2x(tmp_vsr1, R4_ARG2); // Store to dst
1707 __ bdnz(l_4);
1708
1709 // Restore DSCR pre-fetch value.
1710 if (VM_Version::has_mfdscr()) {
1711 __ load_const_optimized(tmp2, VM_Version::_dscr_val);
1712 __ mtdscr(tmp2);
1713 }
1714
1715 __ cmpwi(CR0, R5_ARG3, 0);
1716 __ beq(CR0, l_6);
1717
1718 __ bind(l_5);
1719 __ mtctr(R5_ARG3);
1720 __ bind(l_3);
1721 __ lwz(R0, -4, R3_ARG1);
1722 __ stw(R0, -4, R4_ARG2);
1723 __ addi(R3_ARG1, R3_ARG1, -4);
1724 __ addi(R4_ARG2, R4_ARG2, -4);
1725 __ bdnz(l_3);
1726
1727 __ bind(l_6);
1728 }
1729 }
1730
1731 // Generate stub for conjoint int copy. If "aligned" is true, the
1732 // "from" and "to" addresses are assumed to be heapword aligned.
1733 //
1734 // Arguments for generated stub:
1735 // from: R3_ARG1
1736 // to: R4_ARG2
1737 // count: R5_ARG3 treated as signed
1738 //
1739 address generate_conjoint_int_copy(StubId stub_id) {
1740 bool aligned;
1741 switch (stub_id) {
1742 case StubId::stubgen_jint_arraycopy_id:
1743 aligned = false;
1744 break;
1745 case StubId::stubgen_arrayof_jint_arraycopy_id:
1746 aligned = true;
1747 break;
1748 default:
1749 ShouldNotReachHere();
1750 }
1751
1752 StubCodeMark mark(this, stub_id);
1753 address start = __ function_entry();
1754 assert_positive_int(R5_ARG3);
1755 address nooverlap_target = aligned ?
1756 STUB_ENTRY(arrayof_jint_disjoint_arraycopy()) :
1757 STUB_ENTRY(jint_disjoint_arraycopy());
1758
1759 array_overlap_test(nooverlap_target, 2);
1760 {
1761 // UnsafeMemoryAccess page error: continue at UnsafeMemoryAccess common_error_exit
1762 UnsafeMemoryAccessMark umam(this, !aligned, false);
1763 generate_conjoint_int_copy_core(aligned);
1764 }
1765
1766 __ li(R3_RET, 0); // return 0
1767 __ blr();
1768
1769 return start;
1770 }
1771
1772 // Generate core code for disjoint long copy (and oop copy on
1773 // 64-bit). If "aligned" is true, the "from" and "to" addresses
1774 // are assumed to be heapword aligned.
1775 //
1776 // Arguments:
1777 // from: R3_ARG1
1778 // to: R4_ARG2
1779 // count: R5_ARG3 treated as signed
1780 //
1781 void generate_disjoint_long_copy_core(bool aligned) {
1782 Register tmp1 = R6_ARG4;
1783 Register tmp2 = R7_ARG5;
1784 Register tmp3 = R8_ARG6;
1785 Register tmp4 = R0;
1786
1787 Label l_1, l_2, l_3, l_4, l_5;
1788
1789 VectorSRegister tmp_vsr1 = VSR1;
1790 VectorSRegister tmp_vsr2 = VSR2;
1791
1792 { // FasterArrayCopy
1793 __ cmpwi(CR0, R5_ARG3, 3);
1794 __ ble(CR0, l_3); // copy 1 at a time if less than 4 elements remain
1795
1796 __ srdi(tmp1, R5_ARG3, 2);
1797 __ andi_(R5_ARG3, R5_ARG3, 3);
1798 __ mtctr(tmp1);
1799
1800 // Processor supports VSX, so use it to mass copy.
1801
1802 // Prefetch the data into the L2 cache.
1803 __ dcbt(R3_ARG1, 0);
1804
1805 // Set DSCR pre-fetch to deepest.
1806 if (VM_Version::has_mfdscr()) {
1807 __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
1808 __ mtdscr(tmp2);
1809 }
1810 __ li(tmp1, 16);
1811
1812 // Backbranch target aligned to 32-byte. Not 16-byte align as
1813 // loop contains < 8 instructions that fit inside a single
1814 // i-cache sector.
1815 __ align(32);
1816
1817 __ bind(l_5);
1818 // Use loop with VSX load/store instructions to
1819 // copy 4 elements a time.
1820 __ lxvd2x(tmp_vsr1, R3_ARG1); // Load src
1821 __ stxvd2x(tmp_vsr1, R4_ARG2); // Store to dst
1822 __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1); // Load src + 16
1823 __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
1824 __ addi(R3_ARG1, R3_ARG1, 32); // Update src+=32
1825 __ addi(R4_ARG2, R4_ARG2, 32); // Update dsc+=32
1826 __ bdnz(l_5); // Dec CTR and loop if not zero.
1827
1828 // Restore DSCR pre-fetch value.
1829 if (VM_Version::has_mfdscr()) {
1830 __ load_const_optimized(tmp2, VM_Version::_dscr_val);
1831 __ mtdscr(tmp2);
1832 }
1833
1834 } // FasterArrayCopy
1835
1836 // copy 1 element at a time
1837 __ bind(l_3);
1838 __ cmpwi(CR0, R5_ARG3, 0);
1839 __ beq(CR0, l_1);
1840
1841 { // FasterArrayCopy
1842 __ mtctr(R5_ARG3);
1843 __ addi(R3_ARG1, R3_ARG1, -8);
1844 __ addi(R4_ARG2, R4_ARG2, -8);
1845
1846 __ bind(l_2);
1847 __ ldu(R0, 8, R3_ARG1);
1848 __ stdu(R0, 8, R4_ARG2);
1849 __ bdnz(l_2);
1850
1851 }
1852 __ bind(l_1);
1853 }
1854
1855 // Generate stub for disjoint long copy. If "aligned" is true, the
1856 // "from" and "to" addresses are assumed to be heapword aligned.
1857 //
1858 // Arguments for generated stub:
1859 // from: R3_ARG1
1860 // to: R4_ARG2
1861 // count: R5_ARG3 treated as signed
1862 //
1863 address generate_disjoint_long_copy(StubId stub_id) {
1864 bool aligned;
1865 switch (stub_id) {
1866 case StubId::stubgen_jlong_disjoint_arraycopy_id:
1867 aligned = false;
1868 break;
1869 case StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id:
1870 aligned = true;
1871 break;
1872 default:
1873 ShouldNotReachHere();
1874 }
1875
1876 StubCodeMark mark(this, stub_id);
1877 address start = __ function_entry();
1878 assert_positive_int(R5_ARG3);
1879 {
1880 // UnsafeMemoryAccess page error: continue at UnsafeMemoryAccess common_error_exit
1881 UnsafeMemoryAccessMark umam(this, !aligned, false);
1882 generate_disjoint_long_copy_core(aligned);
1883 }
1884 __ li(R3_RET, 0); // return 0
1885 __ blr();
1886
1887 return start;
1888 }
1889
1890 // Generate core code for conjoint long copy (and oop copy on
1891 // 64-bit). If "aligned" is true, the "from" and "to" addresses
1892 // are assumed to be heapword aligned.
1893 //
1894 // Arguments:
1895 // from: R3_ARG1
1896 // to: R4_ARG2
1897 // count: R5_ARG3 treated as signed
1898 //
1899 void generate_conjoint_long_copy_core(bool aligned) {
1900 Register tmp1 = R6_ARG4;
1901 Register tmp2 = R7_ARG5;
1902 Register tmp3 = R8_ARG6;
1903 Register tmp4 = R0;
1904
1905 VectorSRegister tmp_vsr1 = VSR1;
1906 VectorSRegister tmp_vsr2 = VSR2;
1907
1908 Label l_1, l_2, l_3, l_4, l_5;
1909
1910 __ cmpwi(CR0, R5_ARG3, 0);
1911 __ beq(CR0, l_1);
1912
1913 { // FasterArrayCopy
1914 __ sldi(R5_ARG3, R5_ARG3, 3);
1915 __ add(R3_ARG1, R3_ARG1, R5_ARG3);
1916 __ add(R4_ARG2, R4_ARG2, R5_ARG3);
1917 __ srdi(R5_ARG3, R5_ARG3, 3);
1918
1919 __ cmpwi(CR0, R5_ARG3, 3);
1920 __ ble(CR0, l_5); // copy 1 at a time if less than 4 elements remain
1921
1922 __ srdi(tmp1, R5_ARG3, 2);
1923 __ andi(R5_ARG3, R5_ARG3, 3);
1924 __ mtctr(tmp1);
1925
1926 // Processor supports VSX, so use it to mass copy.
1927 // Prefetch the data into the L2 cache.
1928 __ dcbt(R3_ARG1, 0);
1929
1930 // Set DSCR pre-fetch to deepest.
1931 if (VM_Version::has_mfdscr()) {
1932 __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
1933 __ mtdscr(tmp2);
1934 }
1935 __ li(tmp1, 16);
1936
1937 // Backbranch target aligned to 32-byte. Not 16-byte align as
1938 // loop contains < 8 instructions that fit inside a single
1939 // i-cache sector.
1940 __ align(32);
1941
1942 __ bind(l_4);
1943 // Use loop with VSX load/store instructions to
1944 // copy 4 elements a time.
1945 __ addi(R3_ARG1, R3_ARG1, -32); // Update src-=32
1946 __ addi(R4_ARG2, R4_ARG2, -32); // Update dsc-=32
1947 __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1); // Load src+16
1948 __ lxvd2x(tmp_vsr1, R3_ARG1); // Load src
1949 __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst+16
1950 __ stxvd2x(tmp_vsr1, R4_ARG2); // Store to dst
1951 __ bdnz(l_4);
1952
1953 // Restore DSCR pre-fetch value.
1954 if (VM_Version::has_mfdscr()) {
1955 __ load_const_optimized(tmp2, VM_Version::_dscr_val);
1956 __ mtdscr(tmp2);
1957 }
1958
1959 __ cmpwi(CR0, R5_ARG3, 0);
1960 __ beq(CR0, l_1);
1961
1962 __ bind(l_5);
1963 __ mtctr(R5_ARG3);
1964 __ bind(l_3);
1965 __ ld(R0, -8, R3_ARG1);
1966 __ std(R0, -8, R4_ARG2);
1967 __ addi(R3_ARG1, R3_ARG1, -8);
1968 __ addi(R4_ARG2, R4_ARG2, -8);
1969 __ bdnz(l_3);
1970
1971 }
1972 __ bind(l_1);
1973 }
1974
1975 // Generate stub for conjoint long copy. If "aligned" is true, the
1976 // "from" and "to" addresses are assumed to be heapword aligned.
1977 //
1978 // Arguments for generated stub:
1979 // from: R3_ARG1
1980 // to: R4_ARG2
1981 // count: R5_ARG3 treated as signed
1982 //
1983 address generate_conjoint_long_copy(StubId stub_id) {
1984 bool aligned;
1985 switch (stub_id) {
1986 case StubId::stubgen_jlong_arraycopy_id:
1987 aligned = false;
1988 break;
1989 case StubId::stubgen_arrayof_jlong_arraycopy_id:
1990 aligned = true;
1991 break;
1992 default:
1993 ShouldNotReachHere();
1994 }
1995
1996 StubCodeMark mark(this, stub_id);
1997 address start = __ function_entry();
1998 assert_positive_int(R5_ARG3);
1999 address nooverlap_target = aligned ?
2000 STUB_ENTRY(arrayof_jlong_disjoint_arraycopy()) :
2001 STUB_ENTRY(jlong_disjoint_arraycopy());
2002
2003 array_overlap_test(nooverlap_target, 3);
2004 {
2005 // UnsafeMemoryAccess page error: continue at UnsafeMemoryAccess common_error_exit
2006 UnsafeMemoryAccessMark umam(this, !aligned, false);
2007 generate_conjoint_long_copy_core(aligned);
2008 }
2009 __ li(R3_RET, 0); // return 0
2010 __ blr();
2011
2012 return start;
2013 }
2014
2015 // Generate stub for conjoint oop copy. If "aligned" is true, the
2016 // "from" and "to" addresses are assumed to be heapword aligned.
2017 //
2018 // Arguments for generated stub:
2019 // from: R3_ARG1
2020 // to: R4_ARG2
2021 // count: R5_ARG3 treated as signed
2022 // dest_uninitialized: G1 support
2023 //
2024 address generate_conjoint_oop_copy(StubId stub_id) {
2025 bool aligned;
2026 bool dest_uninitialized;
2027 switch (stub_id) {
2028 case StubId::stubgen_oop_arraycopy_id:
2029 aligned = false;
2030 dest_uninitialized = false;
2031 break;
2032 case StubId::stubgen_arrayof_oop_arraycopy_id:
2033 aligned = true;
2034 dest_uninitialized = false;
2035 break;
2036 case StubId::stubgen_oop_arraycopy_uninit_id:
2037 aligned = false;
2038 dest_uninitialized = true;
2039 break;
2040 case StubId::stubgen_arrayof_oop_arraycopy_uninit_id:
2041 aligned = true;
2042 dest_uninitialized = true;
2043 break;
2044 default:
2045 ShouldNotReachHere();
2046 }
2047
2048 StubCodeMark mark(this, stub_id);
2049 address start = __ function_entry();
2050 assert_positive_int(R5_ARG3);
2051 address nooverlap_target = aligned ?
2052 STUB_ENTRY(arrayof_oop_disjoint_arraycopy(dest_uninitialized)) :
2053 STUB_ENTRY(oop_disjoint_arraycopy(dest_uninitialized));
2054
2055 array_overlap_test(nooverlap_target, UseCompressedOops ? 2 : 3);
2056
2057 DecoratorSet decorators = IN_HEAP | IS_ARRAY;
2058 if (dest_uninitialized) {
2059 decorators |= IS_DEST_UNINITIALIZED;
2060 }
2061 if (aligned) {
2062 decorators |= ARRAYCOPY_ALIGNED;
2063 }
2064
2065 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2066 bs->arraycopy_prologue(_masm, decorators, T_OBJECT, R3_ARG1, R4_ARG2, R5_ARG3, noreg, noreg);
2067
2068 if (UseCompressedOops) {
2069 generate_conjoint_int_copy_core(aligned);
2070 } else {
2071 #if INCLUDE_ZGC
2072 if (UseZGC) {
2073 ZBarrierSetAssembler *zbs = (ZBarrierSetAssembler*)bs;
2074 zbs->generate_conjoint_oop_copy(_masm, dest_uninitialized);
2075 } else
2076 #endif
2077 generate_conjoint_long_copy_core(aligned);
2078 }
2079
2080 bs->arraycopy_epilogue(_masm, decorators, T_OBJECT, R4_ARG2, R5_ARG3, noreg);
2081 __ li(R3_RET, 0); // return 0
2082 __ blr();
2083 return start;
2084 }
2085
2086 // Generate stub for disjoint oop copy. If "aligned" is true, the
2087 // "from" and "to" addresses are assumed to be heapword aligned.
2088 //
2089 // Arguments for generated stub:
2090 // from: R3_ARG1
2091 // to: R4_ARG2
2092 // count: R5_ARG3 treated as signed
2093 // dest_uninitialized: G1 support
2094 //
2095 address generate_disjoint_oop_copy(StubId stub_id) {
2096 bool aligned;
2097 bool dest_uninitialized;
2098 switch (stub_id) {
2099 case StubId::stubgen_oop_disjoint_arraycopy_id:
2100 aligned = false;
2101 dest_uninitialized = false;
2102 break;
2103 case StubId::stubgen_arrayof_oop_disjoint_arraycopy_id:
2104 aligned = true;
2105 dest_uninitialized = false;
2106 break;
2107 case StubId::stubgen_oop_disjoint_arraycopy_uninit_id:
2108 aligned = false;
2109 dest_uninitialized = true;
2110 break;
2111 case StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id:
2112 aligned = true;
2113 dest_uninitialized = true;
2114 break;
2115 default:
2116 ShouldNotReachHere();
2117 }
2118
2119 StubCodeMark mark(this, stub_id);
2120 address start = __ function_entry();
2121 assert_positive_int(R5_ARG3);
2122
2123 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
2124 if (dest_uninitialized) {
2125 decorators |= IS_DEST_UNINITIALIZED;
2126 }
2127 if (aligned) {
2128 decorators |= ARRAYCOPY_ALIGNED;
2129 }
2130
2131 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2132 bs->arraycopy_prologue(_masm, decorators, T_OBJECT, R3_ARG1, R4_ARG2, R5_ARG3, noreg, noreg);
2133
2134 if (UseCompressedOops) {
2135 generate_disjoint_int_copy_core(aligned);
2136 } else {
2137 #if INCLUDE_ZGC
2138 if (UseZGC) {
2139 ZBarrierSetAssembler *zbs = (ZBarrierSetAssembler*)bs;
2140 zbs->generate_disjoint_oop_copy(_masm, dest_uninitialized);
2141 } else
2142 #endif
2143 generate_disjoint_long_copy_core(aligned);
2144 }
2145
2146 bs->arraycopy_epilogue(_masm, decorators, T_OBJECT, R4_ARG2, R5_ARG3, noreg);
2147 __ li(R3_RET, 0); // return 0
2148 __ blr();
2149
2150 return start;
2151 }
2152
2153
2154 // Helper for generating a dynamic type check.
2155 // Smashes only the given temp registers.
2156 void generate_type_check(Register sub_klass,
2157 Register super_check_offset,
2158 Register super_klass,
2159 Register temp1,
2160 Register temp2,
2161 Label& L_success) {
2162 assert_different_registers(sub_klass, super_check_offset, super_klass);
2163
2164 BLOCK_COMMENT("type_check:");
2165
2166 Label L_miss;
2167
2168 __ check_klass_subtype_fast_path(sub_klass, super_klass, temp1, temp2, &L_success, &L_miss, nullptr,
2169 super_check_offset);
2170 __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success);
2171
2172 // Fall through on failure!
2173 __ bind(L_miss);
2174 }
2175
2176
2177 // Generate stub for checked oop copy.
2178 //
2179 // Arguments for generated stub:
2180 // from: R3
2181 // to: R4
2182 // count: R5 treated as signed
2183 // ckoff: R6 (super_check_offset)
2184 // ckval: R7 (super_klass)
2185 // ret: R3 zero for success; (-1^K) where K is partial transfer count
2186 //
2187 address generate_checkcast_copy(StubId stub_id) {
2188 const Register R3_from = R3_ARG1; // source array address
2189 const Register R4_to = R4_ARG2; // destination array address
2190 const Register R5_count = R5_ARG3; // elements count
2191 const Register R6_ckoff = R6_ARG4; // super_check_offset
2192 const Register R7_ckval = R7_ARG5; // super_klass
2193
2194 const Register R8_offset = R8_ARG6; // loop var, with stride wordSize
2195 const Register R9_remain = R9_ARG7; // loop var, with stride -1
2196 const Register R10_oop = R10_ARG8; // actual oop copied
2197 const Register R11_klass = R11_scratch1; // oop._klass
2198 const Register R12_tmp = R12_scratch2;
2199 const Register R2_tmp = R2;
2200
2201 bool dest_uninitialized;
2202 switch (stub_id) {
2203 case StubId::stubgen_checkcast_arraycopy_id:
2204 dest_uninitialized = false;
2205 break;
2206 case StubId::stubgen_checkcast_arraycopy_uninit_id:
2207 dest_uninitialized = true;
2208 break;
2209 default:
2210 ShouldNotReachHere();
2211 }
2212 //__ align(CodeEntryAlignment);
2213 StubCodeMark mark(this, stub_id);
2214 address start = __ function_entry();
2215
2216 // Assert that int is 64 bit sign extended and arrays are not conjoint.
2217 #ifdef ASSERT
2218 {
2219 assert_positive_int(R5_ARG3);
2220 const Register tmp1 = R11_scratch1, tmp2 = R12_scratch2;
2221 Label no_overlap;
2222 __ subf(tmp1, R3_ARG1, R4_ARG2); // distance in bytes
2223 __ sldi(tmp2, R5_ARG3, LogBytesPerHeapOop); // size in bytes
2224 __ cmpld(CR0, R3_ARG1, R4_ARG2); // Use unsigned comparison!
2225 __ cmpld(CR1, tmp1, tmp2);
2226 __ crnand(CR0, Assembler::less, CR1, Assembler::less);
2227 // Overlaps if Src before dst and distance smaller than size.
2228 // Branch to forward copy routine otherwise.
2229 __ blt(CR0, no_overlap);
2230 __ stop("overlap in checkcast_copy");
2231 __ bind(no_overlap);
2232 }
2233 #endif
2234
2235 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST;
2236 if (dest_uninitialized) {
2237 decorators |= IS_DEST_UNINITIALIZED;
2238 }
2239
2240 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2241 bs->arraycopy_prologue(_masm, decorators, T_OBJECT, R3_from, R4_to, R5_count, /* preserve: */ R6_ckoff, R7_ckval);
2242
2243 //inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr, R12_tmp, R3_RET);
2244
2245 Label load_element, store_element, store_null, success, do_epilogue;
2246 __ or_(R9_remain, R5_count, R5_count); // Initialize loop index, and test it.
2247 __ li(R8_offset, 0); // Offset from start of arrays.
2248 __ bne(CR0, load_element);
2249
2250 // Empty array: Nothing to do.
2251 __ li(R3_RET, 0); // Return 0 on (trivial) success.
2252 __ blr();
2253
2254 // ======== begin loop ========
2255 // (Entry is load_element.)
2256 __ align(OptoLoopAlignment);
2257 __ bind(store_element);
2258 if (UseCompressedOops) {
2259 __ encode_heap_oop_not_null(R10_oop);
2260 __ bind(store_null);
2261 __ stw(R10_oop, R8_offset, R4_to);
2262 } else {
2263 __ bind(store_null);
2264 #if INCLUDE_ZGC
2265 if (UseZGC) {
2266 __ store_heap_oop(R10_oop, R8_offset, R4_to, R11_scratch1, R12_tmp, noreg,
2267 MacroAssembler::PRESERVATION_FRAME_LR_GP_REGS,
2268 dest_uninitialized ? IS_DEST_UNINITIALIZED : 0);
2269 } else
2270 #endif
2271 __ std(R10_oop, R8_offset, R4_to);
2272 }
2273
2274 __ addi(R8_offset, R8_offset, heapOopSize); // Step to next offset.
2275 __ addic_(R9_remain, R9_remain, -1); // Decrement the count.
2276 __ beq(CR0, success);
2277
2278 // ======== loop entry is here ========
2279 __ bind(load_element);
2280 #if INCLUDE_ZGC
2281 if (UseZGC) {
2282 __ load_heap_oop(R10_oop, R8_offset, R3_from,
2283 R11_scratch1, R12_tmp,
2284 MacroAssembler::PRESERVATION_FRAME_LR_GP_REGS,
2285 0, &store_null);
2286 } else
2287 #endif
2288 __ load_heap_oop(R10_oop, R8_offset, R3_from,
2289 R11_scratch1, R12_tmp,
2290 MacroAssembler::PRESERVATION_FRAME_LR_GP_REGS,
2291 AS_RAW, &store_null);
2292
2293 __ load_klass(R11_klass, R10_oop); // Query the object klass.
2294
2295 generate_type_check(R11_klass, R6_ckoff, R7_ckval, R12_tmp, R2_tmp,
2296 // Branch to this on success:
2297 store_element);
2298 // ======== end loop ========
2299
2300 // It was a real error; we must depend on the caller to finish the job.
2301 // Register R9_remain has number of *remaining* oops, R5_count number of *total* oops.
2302 // Emit GC store barriers for the oops we have copied (R5_count minus R9_remain),
2303 // and report their number to the caller.
2304 __ subf_(R5_count, R9_remain, R5_count);
2305 __ nand(R3_RET, R5_count, R5_count); // report (-1^K) to caller
2306 __ bne(CR0, do_epilogue);
2307 __ blr();
2308
2309 __ bind(success);
2310 __ li(R3_RET, 0);
2311
2312 __ bind(do_epilogue);
2313 bs->arraycopy_epilogue(_masm, decorators, T_OBJECT, R4_to, R5_count, /* preserve */ R3_RET);
2314
2315 __ blr();
2316 return start;
2317 }
2318
2319
2320 // Generate 'unsafe' array copy stub.
2321 // Though just as safe as the other stubs, it takes an unscaled
2322 // size_t argument instead of an element count.
2323 //
2324 // Arguments for generated stub:
2325 // from: R3
2326 // to: R4
2327 // count: R5 byte count, treated as ssize_t, can be zero
2328 //
2329 // Examines the alignment of the operands and dispatches
2330 // to a long, int, short, or byte copy loop.
2331 //
2332 address generate_unsafe_copy(address byte_copy_entry,
2333 address short_copy_entry,
2334 address int_copy_entry,
2335 address long_copy_entry) {
2336
2337 const Register R3_from = R3_ARG1; // source array address
2338 const Register R4_to = R4_ARG2; // destination array address
2339 const Register R5_count = R5_ARG3; // elements count (as long on PPC64)
2340
2341 const Register R6_bits = R6_ARG4; // test copy of low bits
2342 const Register R7_tmp = R7_ARG5;
2343
2344 //__ align(CodeEntryAlignment);
2345 StubId stub_id = StubId::stubgen_unsafe_arraycopy_id;
2346 StubCodeMark mark(this, stub_id);
2347 address start = __ function_entry();
2348
2349 // Bump this on entry, not on exit:
2350 //inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr, R6_bits, R7_tmp);
2351
2352 Label short_copy, int_copy, long_copy;
2353
2354 __ orr(R6_bits, R3_from, R4_to);
2355 __ orr(R6_bits, R6_bits, R5_count);
2356 __ andi_(R0, R6_bits, (BytesPerLong-1));
2357 __ beq(CR0, long_copy);
2358
2359 __ andi_(R0, R6_bits, (BytesPerInt-1));
2360 __ beq(CR0, int_copy);
2361
2362 __ andi_(R0, R6_bits, (BytesPerShort-1));
2363 __ beq(CR0, short_copy);
2364
2365 // byte_copy:
2366 __ b(byte_copy_entry);
2367
2368 __ bind(short_copy);
2369 __ srwi(R5_count, R5_count, LogBytesPerShort);
2370 __ b(short_copy_entry);
2371
2372 __ bind(int_copy);
2373 __ srwi(R5_count, R5_count, LogBytesPerInt);
2374 __ b(int_copy_entry);
2375
2376 __ bind(long_copy);
2377 __ srwi(R5_count, R5_count, LogBytesPerLong);
2378 __ b(long_copy_entry);
2379
2380 return start;
2381 }
2382
2383
2384 // Perform range checks on the proposed arraycopy.
2385 // Kills the two temps, but nothing else.
// Leaves src_pos, dst_pos and length unmodified. They are added and compared
// as 64-bit values, so the caller must pass them sign-extended.
2387 void arraycopy_range_checks(Register src, // source array oop
2388 Register src_pos, // source position
2389 Register dst, // destination array oop
2390 Register dst_pos, // destination position
2391 Register length, // length of copy
2392 Register temp1, Register temp2,
2393 Label& L_failed) {
2394 BLOCK_COMMENT("arraycopy_range_checks:");
2395
2396 const Register array_length = temp1; // scratch
2397 const Register end_pos = temp2; // scratch
2398
2399 // if (src_pos + length > arrayOop(src)->length() ) FAIL;
2400 __ lwa(array_length, arrayOopDesc::length_offset_in_bytes(), src);
2401 __ add(end_pos, src_pos, length); // src_pos + length
2402 __ cmpd(CR0, end_pos, array_length);
2403 __ bgt(CR0, L_failed);
2404
2405 // if (dst_pos + length > arrayOop(dst)->length() ) FAIL;
2406 __ lwa(array_length, arrayOopDesc::length_offset_in_bytes(), dst);
2407 __ add(end_pos, dst_pos, length); // src_pos + length
2408 __ cmpd(CR0, end_pos, array_length);
2409 __ bgt(CR0, L_failed);
2410
2411 BLOCK_COMMENT("arraycopy_range_checks done");
2412 }
2413
2414
2415 // Helper for generate_unsafe_setmemory
2416 //
2417 // Atomically fill an array of memory using 1-, 2-, 4-, or 8-byte chunks and return.
2418 static void do_setmemory_atomic_loop(int elem_size, Register dest, Register size, Register byteVal,
2419 MacroAssembler *_masm) {
2420
2421 Label L_Loop, L_Tail; // 2x unrolled loop
2422
2423 // Propagate byte to required width
2424 if (elem_size > 1) __ rldimi(byteVal, byteVal, 8, 64 - 2 * 8);
2425 if (elem_size > 2) __ rldimi(byteVal, byteVal, 16, 64 - 2 * 16);
2426 if (elem_size > 4) __ rldimi(byteVal, byteVal, 32, 64 - 2 * 32);
2427
2428 __ srwi_(R0, size, exact_log2(2 * elem_size)); // size is a 32 bit value
2429 __ beq(CR0, L_Tail);
2430 __ mtctr(R0);
2431
2432 __ align(32); // loop alignment
2433 __ bind(L_Loop);
2434 __ store_sized_value(byteVal, 0, dest, elem_size);
2435 __ store_sized_value(byteVal, elem_size, dest, elem_size);
2436 __ addi(dest, dest, 2 * elem_size);
2437 __ bdnz(L_Loop);
2438
2439 __ bind(L_Tail);
2440 __ andi_(R0, size, elem_size);
2441 __ bclr(Assembler::bcondCRbiIs1, Assembler::bi0(CR0, Assembler::equal), Assembler::bhintbhBCLRisReturn);
2442 __ store_sized_value(byteVal, 0, dest, elem_size);
2443 __ blr();
2444 }
2445
2446 //
2447 // Generate 'unsafe' set memory stub
2448 // Though just as safe as the other stubs, it takes an unscaled
2449 // size_t (# bytes) argument instead of an element count.
2450 //
2451 // Input:
2452 // R3_ARG1 - destination array address
2453 // R4_ARG2 - byte count (size_t)
2454 // R5_ARG3 - byte value
2455 //
2456 address generate_unsafe_setmemory(address unsafe_byte_fill) {
2457 __ align(CodeEntryAlignment);
2458 StubCodeMark mark(this, StubId::stubgen_unsafe_setmemory_id);
2459 address start = __ function_entry();
2460
2461 // bump this on entry, not on exit:
2462 // inc_counter_np(SharedRuntime::_unsafe_set_memory_ctr);
2463
2464 {
2465 Label L_fill8Bytes, L_fill4Bytes, L_fillBytes;
2466
2467 const Register dest = R3_ARG1;
2468 const Register size = R4_ARG2;
2469 const Register byteVal = R5_ARG3;
2470 const Register rScratch1 = R6;
2471
2472 // fill_to_memory_atomic(unsigned char*, unsigned long, unsigned char)
2473
2474 // Check for pointer & size alignment
2475 __ orr(rScratch1, dest, size);
2476
2477 __ andi_(R0, rScratch1, 7);
2478 __ beq(CR0, L_fill8Bytes);
2479
2480 __ andi_(R0, rScratch1, 3);
2481 __ beq(CR0, L_fill4Bytes);
2482
2483 __ andi_(R0, rScratch1, 1);
2484 __ bne(CR0, L_fillBytes);
2485
2486 // Mark remaining code as such which performs Unsafe accesses.
2487 UnsafeMemoryAccessMark umam(this, true, false);
2488
2489 // At this point, we know the lower bit of size is zero and a
2490 // multiple of 2
2491 do_setmemory_atomic_loop(2, dest, size, byteVal, _masm);
2492
2493 __ align(32);
2494 __ bind(L_fill8Bytes);
2495 // At this point, we know the lower 3 bits of size are zero and a
2496 // multiple of 8
2497 do_setmemory_atomic_loop(8, dest, size, byteVal, _masm);
2498
2499 __ align(32);
2500 __ bind(L_fill4Bytes);
2501 // At this point, we know the lower 2 bits of size are zero and a
2502 // multiple of 4
2503 do_setmemory_atomic_loop(4, dest, size, byteVal, _masm);
2504
2505 __ align(32);
2506 __ bind(L_fillBytes);
2507 do_setmemory_atomic_loop(1, dest, size, byteVal, _masm);
2508 }
2509
2510 return start;
2511 }
2512
2513
2514 //
2515 // Generate generic array copy stubs
2516 //
2517 // Input:
2518 // R3 - src oop
2519 // R4 - src_pos
2520 // R5 - dst oop
2521 // R6 - dst_pos
2522 // R7 - element count
2523 //
2524 // Output:
2525 // R3 == 0 - success
2526 // R3 == -1 - need to call System.arraycopy
2527 //
2528 address generate_generic_copy(address entry_jbyte_arraycopy,
2529 address entry_jshort_arraycopy,
2530 address entry_jint_arraycopy,
2531 address entry_oop_arraycopy,
2532 address entry_disjoint_oop_arraycopy,
2533 address entry_jlong_arraycopy,
2534 address entry_checkcast_arraycopy) {
2535 Label L_failed, L_objArray;
2536
2537 // Input registers
2538 const Register src = R3_ARG1; // source array oop
2539 const Register src_pos = R4_ARG2; // source position
2540 const Register dst = R5_ARG3; // destination array oop
2541 const Register dst_pos = R6_ARG4; // destination position
2542 const Register length = R7_ARG5; // elements count
2543
2544 // registers used as temp
2545 const Register src_klass = R8_ARG6; // source array klass
2546 const Register dst_klass = R9_ARG7; // destination array klass
2547 const Register lh = R10_ARG8; // layout handler
2548 const Register temp = R2;
2549
2550 //__ align(CodeEntryAlignment);
2551 StubId stub_id = StubId::stubgen_generic_arraycopy_id;
2552 StubCodeMark mark(this, stub_id);
2553 address start = __ function_entry();
2554
2555 // Bump this on entry, not on exit:
2556 //inc_counter_np(SharedRuntime::_generic_array_copy_ctr, lh, temp);
2557
2558 // In principle, the int arguments could be dirty.
2559
2560 //-----------------------------------------------------------------------
2561 // Assembler stubs will be used for this call to arraycopy
2562 // if the following conditions are met:
2563 //
2564 // (1) src and dst must not be null.
2565 // (2) src_pos must not be negative.
2566 // (3) dst_pos must not be negative.
2567 // (4) length must not be negative.
2568 // (5) src klass and dst klass should be the same and not null.
2569 // (6) src and dst should be arrays.
2570 // (7) src_pos + length must not exceed length of src.
2571 // (8) dst_pos + length must not exceed length of dst.
2572 BLOCK_COMMENT("arraycopy initial argument checks");
2573
2574 __ cmpdi(CR1, src, 0); // if (src == nullptr) return -1;
2575 __ extsw_(src_pos, src_pos); // if (src_pos < 0) return -1;
2576 __ cmpdi(CR5, dst, 0); // if (dst == nullptr) return -1;
2577 __ cror(CR1, Assembler::equal, CR0, Assembler::less);
2578 __ extsw_(dst_pos, dst_pos); // if (src_pos < 0) return -1;
2579 __ cror(CR5, Assembler::equal, CR0, Assembler::less);
2580 __ extsw_(length, length); // if (length < 0) return -1;
2581 __ cror(CR1, Assembler::equal, CR5, Assembler::equal);
2582 __ cror(CR1, Assembler::equal, CR0, Assembler::less);
2583 __ beq(CR1, L_failed);
2584
2585 BLOCK_COMMENT("arraycopy argument klass checks");
2586 __ load_klass(src_klass, src);
2587 __ load_klass(dst_klass, dst);
2588
2589 // Load layout helper
2590 //
2591 // |array_tag| | header_size | element_type | |log2_element_size|
2592 // 32 30 24 16 8 2 0
2593 //
2594 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2595 //
2596
2597 int lh_offset = in_bytes(Klass::layout_helper_offset());
2598
2599 // Load 32-bits signed value. Use br() instruction with it to check icc.
2600 __ lwz(lh, lh_offset, src_klass);
2601
2602 // Handle objArrays completely differently...
2603 jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2604 __ load_const_optimized(temp, objArray_lh, R0);
2605 __ cmpw(CR0, lh, temp);
2606 __ beq(CR0, L_objArray);
2607
2608 __ cmpd(CR5, src_klass, dst_klass); // if (src->klass() != dst->klass()) return -1;
2609 __ cmpwi(CR6, lh, Klass::_lh_neutral_value); // if (!src->is_Array()) return -1;
2610
2611 __ crnand(CR5, Assembler::equal, CR6, Assembler::less);
2612 __ beq(CR5, L_failed);
2613
2614 // At this point, it is known to be a typeArray (array_tag 0x3).
2615 #ifdef ASSERT
2616 { Label L;
2617 jint lh_prim_tag_in_place = (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2618 __ load_const_optimized(temp, lh_prim_tag_in_place, R0);
2619 __ cmpw(CR0, lh, temp);
2620 __ bge(CR0, L);
2621 __ stop("must be a primitive array");
2622 __ bind(L);
2623 }
2624 #endif
2625
2626 arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
2627 temp, dst_klass, L_failed);
2628
2629 // TypeArrayKlass
2630 //
2631 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2632 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2633 //
2634
2635 const Register offset = dst_klass; // array offset
2636 const Register elsize = src_klass; // log2 element size
2637
2638 __ rldicl(offset, lh, 64 - Klass::_lh_header_size_shift, 64 - exact_log2(Klass::_lh_header_size_mask + 1));
2639 __ andi(elsize, lh, Klass::_lh_log2_element_size_mask);
2640 __ add(src, offset, src); // src array offset
2641 __ add(dst, offset, dst); // dst array offset
2642
2643 // Next registers should be set before the jump to corresponding stub.
2644 const Register from = R3_ARG1; // source array address
2645 const Register to = R4_ARG2; // destination array address
2646 const Register count = R5_ARG3; // elements count
2647
2648 // 'from', 'to', 'count' registers should be set in this order
2649 // since they are the same as 'src', 'src_pos', 'dst'.
2650
2651 BLOCK_COMMENT("scale indexes to element size");
2652 __ sld(src_pos, src_pos, elsize);
2653 __ sld(dst_pos, dst_pos, elsize);
2654 __ add(from, src_pos, src); // src_addr
2655 __ add(to, dst_pos, dst); // dst_addr
2656 __ mr(count, length); // length
2657
2658 BLOCK_COMMENT("choose copy loop based on element size");
2659 // Using conditional branches with range 32kB.
2660 const int bo = Assembler::bcondCRbiIs1, bi = Assembler::bi0(CR0, Assembler::equal);
2661 __ cmpwi(CR0, elsize, 0);
2662 __ bc(bo, bi, entry_jbyte_arraycopy);
2663 __ cmpwi(CR0, elsize, LogBytesPerShort);
2664 __ bc(bo, bi, entry_jshort_arraycopy);
2665 __ cmpwi(CR0, elsize, LogBytesPerInt);
2666 __ bc(bo, bi, entry_jint_arraycopy);
2667 #ifdef ASSERT
2668 { Label L;
2669 __ cmpwi(CR0, elsize, LogBytesPerLong);
2670 __ beq(CR0, L);
2671 __ stop("must be long copy, but elsize is wrong");
2672 __ bind(L);
2673 }
2674 #endif
2675 __ b(entry_jlong_arraycopy);
2676
2677 // ObjArrayKlass
2678 __ bind(L_objArray);
2679 // live at this point: src_klass, dst_klass, src[_pos], dst[_pos], length
2680
2681 Label L_disjoint_plain_copy, L_checkcast_copy;
2682 // test array classes for subtyping
2683 __ cmpd(CR0, src_klass, dst_klass); // usual case is exact equality
2684 __ bne(CR0, L_checkcast_copy);
2685
2686 // Identically typed arrays can be copied without element-wise checks.
2687 arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
2688 temp, lh, L_failed);
2689
2690 __ addi(src, src, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //src offset
2691 __ addi(dst, dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //dst offset
2692 __ sldi(src_pos, src_pos, LogBytesPerHeapOop);
2693 __ sldi(dst_pos, dst_pos, LogBytesPerHeapOop);
2694 __ add(from, src_pos, src); // src_addr
2695 __ add(to, dst_pos, dst); // dst_addr
2696 __ mr(count, length); // length
2697 __ b(entry_oop_arraycopy);
2698
2699 __ bind(L_checkcast_copy);
2700 // live at this point: src_klass, dst_klass
2701 {
2702 // Before looking at dst.length, make sure dst is also an objArray.
2703 __ lwz(temp, lh_offset, dst_klass);
2704 __ cmpw(CR0, lh, temp);
2705 __ bne(CR0, L_failed);
2706
2707 // It is safe to examine both src.length and dst.length.
2708 arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
2709 temp, lh, L_failed);
2710
2711 // Marshal the base address arguments now, freeing registers.
2712 __ addi(src, src, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //src offset
2713 __ addi(dst, dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //dst offset
2714 __ sldi(src_pos, src_pos, LogBytesPerHeapOop);
2715 __ sldi(dst_pos, dst_pos, LogBytesPerHeapOop);
2716 __ add(from, src_pos, src); // src_addr
2717 __ add(to, dst_pos, dst); // dst_addr
2718 __ mr(count, length); // length
2719
2720 Register sco_temp = R6_ARG4; // This register is free now.
2721 assert_different_registers(from, to, count, sco_temp,
2722 dst_klass, src_klass);
2723
2724 // Generate the type check.
2725 int sco_offset = in_bytes(Klass::super_check_offset_offset());
2726 __ lwz(sco_temp, sco_offset, dst_klass);
2727 generate_type_check(src_klass, sco_temp, dst_klass,
2728 temp, /* temp */ R10_ARG8, L_disjoint_plain_copy);
2729
2730 // Fetch destination element klass from the ObjArrayKlass header.
2731 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2732
2733 // The checkcast_copy loop needs two extra arguments:
2734 __ ld(R7_ARG5, ek_offset, dst_klass); // dest elem klass
2735 __ lwz(R6_ARG4, sco_offset, R7_ARG5); // sco of elem klass
2736 __ b(entry_checkcast_arraycopy);
2737 }
2738
2739 __ bind(L_disjoint_plain_copy);
2740 __ b(entry_disjoint_oop_arraycopy);
2741
2742 __ bind(L_failed);
2743 __ li(R3_RET, -1); // return -1
2744 __ blr();
2745 return start;
2746 }
2747
2748 // Arguments for generated stub:
2749 // R3_ARG1 - source byte array address
2750 // R4_ARG2 - destination byte array address
2751 // R5_ARG3 - round key array
2752 address generate_aescrypt_encryptBlock() {
2753 assert(UseAES, "need AES instructions and misaligned SSE support");
2754 StubId stub_id = StubId::stubgen_aescrypt_encryptBlock_id;
2755 StubCodeMark mark(this, stub_id);
2756
2757 address start = __ function_entry();
2758
2759 Label L_doLast, L_error;
2760
2761 Register from = R3_ARG1; // source array address
2762 Register to = R4_ARG2; // destination array address
2763 Register key = R5_ARG3; // round key array
2764
2765 Register keylen = R8;
2766 Register temp = R9;
2767 Register keypos = R10;
2768 Register fifteen = R12;
2769
2770 VectorRegister vRet = VR0;
2771
2772 VectorRegister vKey1 = VR1;
2773 VectorRegister vKey2 = VR2;
2774 VectorRegister vKey3 = VR3;
2775 VectorRegister vKey4 = VR4;
2776
2777 VectorRegister fromPerm = VR5;
2778 VectorRegister keyPerm = VR6;
2779 VectorRegister toPerm = VR7;
2780 VectorRegister fSplt = VR8;
2781
2782 VectorRegister vTmp1 = VR9;
2783 VectorRegister vTmp2 = VR10;
2784 VectorRegister vTmp3 = VR11;
2785 VectorRegister vTmp4 = VR12;
2786
2787 __ li (fifteen, 15);
2788
2789 // load unaligned from[0-15] to vRet
2790 __ lvx (vRet, from);
2791 __ lvx (vTmp1, fifteen, from);
2792 __ lvsl (fromPerm, from);
2793 #ifdef VM_LITTLE_ENDIAN
2794 __ vspltisb (fSplt, 0x0f);
2795 __ vxor (fromPerm, fromPerm, fSplt);
2796 #endif
2797 __ vperm (vRet, vRet, vTmp1, fromPerm);
2798
2799 // load keylen (44 or 52 or 60)
2800 __ lwz (keylen, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT), key);
2801
2802 // to load keys
2803 __ load_perm (keyPerm, key);
2804 #ifdef VM_LITTLE_ENDIAN
2805 __ vspltisb (vTmp2, -16);
2806 __ vrld (keyPerm, keyPerm, vTmp2);
2807 __ vrld (keyPerm, keyPerm, vTmp2);
2808 __ vsldoi (keyPerm, keyPerm, keyPerm, 8);
2809 #endif
2810
2811 // load the 1st round key to vTmp1
2812 __ lvx (vTmp1, key);
2813 __ li (keypos, 16);
2814 __ lvx (vKey1, keypos, key);
2815 __ vec_perm (vTmp1, vKey1, keyPerm);
2816
2817 // 1st round
2818 __ vxor (vRet, vRet, vTmp1);
2819
2820 // load the 2nd round key to vKey1
2821 __ li (keypos, 32);
2822 __ lvx (vKey2, keypos, key);
2823 __ vec_perm (vKey1, vKey2, keyPerm);
2824
2825 // load the 3rd round key to vKey2
2826 __ li (keypos, 48);
2827 __ lvx (vKey3, keypos, key);
2828 __ vec_perm (vKey2, vKey3, keyPerm);
2829
2830 // load the 4th round key to vKey3
2831 __ li (keypos, 64);
2832 __ lvx (vKey4, keypos, key);
2833 __ vec_perm (vKey3, vKey4, keyPerm);
2834
2835 // load the 5th round key to vKey4
2836 __ li (keypos, 80);
2837 __ lvx (vTmp1, keypos, key);
2838 __ vec_perm (vKey4, vTmp1, keyPerm);
2839
2840 // 2nd - 5th rounds
2841 __ vcipher (vRet, vRet, vKey1);
2842 __ vcipher (vRet, vRet, vKey2);
2843 __ vcipher (vRet, vRet, vKey3);
2844 __ vcipher (vRet, vRet, vKey4);
2845
2846 // load the 6th round key to vKey1
2847 __ li (keypos, 96);
2848 __ lvx (vKey2, keypos, key);
2849 __ vec_perm (vKey1, vTmp1, vKey2, keyPerm);
2850
2851 // load the 7th round key to vKey2
2852 __ li (keypos, 112);
2853 __ lvx (vKey3, keypos, key);
2854 __ vec_perm (vKey2, vKey3, keyPerm);
2855
2856 // load the 8th round key to vKey3
2857 __ li (keypos, 128);
2858 __ lvx (vKey4, keypos, key);
2859 __ vec_perm (vKey3, vKey4, keyPerm);
2860
2861 // load the 9th round key to vKey4
2862 __ li (keypos, 144);
2863 __ lvx (vTmp1, keypos, key);
2864 __ vec_perm (vKey4, vTmp1, keyPerm);
2865
2866 // 6th - 9th rounds
2867 __ vcipher (vRet, vRet, vKey1);
2868 __ vcipher (vRet, vRet, vKey2);
2869 __ vcipher (vRet, vRet, vKey3);
2870 __ vcipher (vRet, vRet, vKey4);
2871
2872 // load the 10th round key to vKey1
2873 __ li (keypos, 160);
2874 __ lvx (vKey2, keypos, key);
2875 __ vec_perm (vKey1, vTmp1, vKey2, keyPerm);
2876
2877 // load the 11th round key to vKey2
2878 __ li (keypos, 176);
2879 __ lvx (vTmp1, keypos, key);
2880 __ vec_perm (vKey2, vTmp1, keyPerm);
2881
2882 // if all round keys are loaded, skip next 4 rounds
2883 __ cmpwi (CR0, keylen, 44);
2884 __ beq (CR0, L_doLast);
2885
2886 // 10th - 11th rounds
2887 __ vcipher (vRet, vRet, vKey1);
2888 __ vcipher (vRet, vRet, vKey2);
2889
2890 // load the 12th round key to vKey1
2891 __ li (keypos, 192);
2892 __ lvx (vKey2, keypos, key);
2893 __ vec_perm (vKey1, vTmp1, vKey2, keyPerm);
2894
2895 // load the 13th round key to vKey2
2896 __ li (keypos, 208);
2897 __ lvx (vTmp1, keypos, key);
2898 __ vec_perm (vKey2, vTmp1, keyPerm);
2899
2900 // if all round keys are loaded, skip next 2 rounds
2901 __ cmpwi (CR0, keylen, 52);
2902 __ beq (CR0, L_doLast);
2903
2904 #ifdef ASSERT
2905 __ cmpwi (CR0, keylen, 60);
2906 __ bne (CR0, L_error);
2907 #endif
2908
2909 // 12th - 13th rounds
2910 __ vcipher (vRet, vRet, vKey1);
2911 __ vcipher (vRet, vRet, vKey2);
2912
2913 // load the 14th round key to vKey1
2914 __ li (keypos, 224);
2915 __ lvx (vKey2, keypos, key);
2916 __ vec_perm (vKey1, vTmp1, vKey2, keyPerm);
2917
2918 // load the 15th round key to vKey2
2919 __ li (keypos, 240);
2920 __ lvx (vTmp1, keypos, key);
2921 __ vec_perm (vKey2, vTmp1, keyPerm);
2922
2923 __ bind(L_doLast);
2924
2925 // last two rounds
2926 __ vcipher (vRet, vRet, vKey1);
2927 __ vcipherlast (vRet, vRet, vKey2);
2928
2929 #ifdef VM_LITTLE_ENDIAN
2930 // toPerm = 0x0F0E0D0C0B0A09080706050403020100
2931 __ lvsl (toPerm, keypos); // keypos is a multiple of 16
2932 __ vxor (toPerm, toPerm, fSplt);
2933
2934 // Swap Bytes
2935 __ vperm (vRet, vRet, vRet, toPerm);
2936 #endif
2937
2938 // store result (unaligned)
2939 // Note: We can't use a read-modify-write sequence which touches additional Bytes.
2940 Register lo = temp, hi = fifteen; // Reuse
2941 __ vsldoi (vTmp1, vRet, vRet, 8);
2942 __ mfvrd (hi, vRet);
2943 __ mfvrd (lo, vTmp1);
2944 __ std (hi, 0 LITTLE_ENDIAN_ONLY(+ 8), to);
2945 __ std (lo, 0 BIG_ENDIAN_ONLY(+ 8), to);
2946
2947 __ blr();
2948
2949 #ifdef ASSERT
2950 __ bind(L_error);
2951 __ stop("aescrypt_encryptBlock: invalid key length");
2952 #endif
2953 return start;
2954 }
2955
// Generates the stub that decrypts one 16-byte AES block.
//
// Arguments for generated stub:
//   R3_ARG1 - source byte array address
//   R4_ARG2 - destination byte array address
//   R5_ARG3 - sessionKe (key) in little endian int array
//
// Returns the entry point of the generated code.
//
// The stub reads 16 bytes from the source, applies the round keys in
// descending order (highest offset in the key array first, offset 0 last)
// and writes 16 bytes to the destination. The length of the key array
// (44, 52 or 60 ints, i.e. 11, 13 or 15 round keys of 16 bytes) selects how
// many rounds run before the common tail at L_doLast.
address generate_aescrypt_decryptBlock() {
  assert(UseAES, "need AES instructions and misaligned SSE support");
  StubId stub_id = StubId::stubgen_aescrypt_decryptBlock_id;
  StubCodeMark mark(this, stub_id);

  address start = __ function_entry();

  Label L_doLast, L_do44, L_do52, L_error;

  Register from = R3_ARG1; // source array address
  Register to = R4_ARG2; // destination array address
  Register key = R5_ARG3; // round key array

  Register keylen = R8;
  Register temp = R9;
  Register keypos = R10;
  Register fifteen = R12;

  VectorRegister vRet = VR0; // running cipher state / result

  VectorRegister vKey1 = VR1;
  VectorRegister vKey2 = VR2;
  VectorRegister vKey3 = VR3;
  VectorRegister vKey4 = VR4;
  VectorRegister vKey5 = VR5;

  VectorRegister fromPerm = VR6;
  VectorRegister keyPerm = VR7;
  VectorRegister toPerm = VR8;
  VectorRegister fSplt = VR9; // only initialized and used in little endian builds

  VectorRegister vTmp1 = VR10;
  VectorRegister vTmp2 = VR11; // only used in little endian builds
  VectorRegister vTmp3 = VR12; // not referenced in this stub
  VectorRegister vTmp4 = VR13; // not referenced in this stub

  __ li (fifteen, 15);

  // load unaligned from[0-15] to vRet
  // (two vector loads combined by a permute derived from the source address)
  __ lvx (vRet, from);
  __ lvx (vTmp1, fifteen, from);
  __ lvsl (fromPerm, from);
#ifdef VM_LITTLE_ENDIAN
  __ vspltisb (fSplt, 0x0f);
  __ vxor (fromPerm, fromPerm, fSplt);
#endif
  __ vperm (vRet, vRet, vTmp1, fromPerm); // align [and byte swap in LE]

  // load keylen (44 or 52 or 60)
  // key points at element 0 of the int array, so the length field is found
  // at a negative offset relative to it.
  __ lwz (keylen, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT), key);

  // to load keys
  __ load_perm (keyPerm, key);
#ifdef VM_LITTLE_ENDIAN
  __ vxor (vTmp2, vTmp2, vTmp2);
  __ vspltisb (vTmp2, -16);
  __ vrld (keyPerm, keyPerm, vTmp2);
  __ vrld (keyPerm, keyPerm, vTmp2);
  __ vsldoi (keyPerm, keyPerm, keyPerm, 8);
#endif

  // Dispatch on the key length; the 60-int case falls through.
  __ cmpwi (CR0, keylen, 44);
  __ beq (CR0, L_do44);

  __ cmpwi (CR0, keylen, 52);
  __ beq (CR0, L_do52);

#ifdef ASSERT
  // Debug builds only: any length other than 44, 52 or 60 stops the VM.
  __ cmpwi (CR0, keylen, 60);
  __ bne (CR0, L_error);
#endif

  // --- keylen == 60 ---
  // Each round key is assembled from two neighbouring vector loads; the
  // lower load is kept and reused as the upper half for the next key.

  // load the 15th round key to vKey1
  __ li (keypos, 240);
  __ lvx (vKey1, keypos, key);
  __ li (keypos, 224);
  __ lvx (vKey2, keypos, key);
  __ vec_perm (vKey1, vKey2, vKey1, keyPerm);

  // load the 14th round key to vKey2
  __ li (keypos, 208);
  __ lvx (vKey3, keypos, key);
  __ vec_perm (vKey2, vKey3, vKey2, keyPerm);

  // load the 13th round key to vKey3
  __ li (keypos, 192);
  __ lvx (vKey4, keypos, key);
  __ vec_perm (vKey3, vKey4, vKey3, keyPerm);

  // load the 12th round key to vKey4
  __ li (keypos, 176);
  __ lvx (vKey5, keypos, key);
  __ vec_perm (vKey4, vKey5, vKey4, keyPerm);

  // load the 11th round key to vKey5
  __ li (keypos, 160);
  __ lvx (vTmp1, keypos, key);
  __ vec_perm (vKey5, vTmp1, vKey5, keyPerm);

  // 1st - 5th rounds
  __ vxor (vRet, vRet, vKey1);
  __ vncipher (vRet, vRet, vKey2);
  __ vncipher (vRet, vRet, vKey3);
  __ vncipher (vRet, vRet, vKey4);
  __ vncipher (vRet, vRet, vKey5);

  __ b (L_doLast);

  // --- keylen == 52 ---
  __ align(32);
  __ bind (L_do52);

  // load the 13th round key to vKey1
  __ li (keypos, 208);
  __ lvx (vKey1, keypos, key);
  __ li (keypos, 192);
  __ lvx (vKey2, keypos, key);
  __ vec_perm (vKey1, vKey2, vKey1, keyPerm);

  // load the 12th round key to vKey2
  __ li (keypos, 176);
  __ lvx (vKey3, keypos, key);
  __ vec_perm (vKey2, vKey3, vKey2, keyPerm);

  // load the 11th round key to vKey3
  __ li (keypos, 160);
  __ lvx (vTmp1, keypos, key);
  __ vec_perm (vKey3, vTmp1, vKey3, keyPerm);

  // 1st - 3rd rounds
  __ vxor (vRet, vRet, vKey1);
  __ vncipher (vRet, vRet, vKey2);
  __ vncipher (vRet, vRet, vKey3);

  __ b (L_doLast);

  // --- keylen == 44 ---
  __ align(32);
  __ bind (L_do44);

  // load the 11th round key to vKey1
  __ li (keypos, 176);
  __ lvx (vKey1, keypos, key);
  __ li (keypos, 160);
  __ lvx (vTmp1, keypos, key);
  __ vec_perm (vKey1, vTmp1, vKey1, keyPerm);

  // 1st round
  __ vxor (vRet, vRet, vKey1);

  // --- common tail: the remaining ten round keys ---
  // All three paths arrive here with the vector loaded from offset 160 in vTmp1.
  __ bind (L_doLast);

  // load the 10th round key to vKey1
  __ li (keypos, 144);
  __ lvx (vKey2, keypos, key);
  __ vec_perm (vKey1, vKey2, vTmp1, keyPerm);

  // load the 9th round key to vKey2
  __ li (keypos, 128);
  __ lvx (vKey3, keypos, key);
  __ vec_perm (vKey2, vKey3, vKey2, keyPerm);

  // load the 8th round key to vKey3
  __ li (keypos, 112);
  __ lvx (vKey4, keypos, key);
  __ vec_perm (vKey3, vKey4, vKey3, keyPerm);

  // load the 7th round key to vKey4
  __ li (keypos, 96);
  __ lvx (vKey5, keypos, key);
  __ vec_perm (vKey4, vKey5, vKey4, keyPerm);

  // load the 6th round key to vKey5
  __ li (keypos, 80);
  __ lvx (vTmp1, keypos, key);
  __ vec_perm (vKey5, vTmp1, vKey5, keyPerm);

  // last 10th - 6th rounds
  __ vncipher (vRet, vRet, vKey1);
  __ vncipher (vRet, vRet, vKey2);
  __ vncipher (vRet, vRet, vKey3);
  __ vncipher (vRet, vRet, vKey4);
  __ vncipher (vRet, vRet, vKey5);

  // load the 5th round key to vKey1
  __ li (keypos, 64);
  __ lvx (vKey2, keypos, key);
  __ vec_perm (vKey1, vKey2, vTmp1, keyPerm);

  // load the 4th round key to vKey2
  __ li (keypos, 48);
  __ lvx (vKey3, keypos, key);
  __ vec_perm (vKey2, vKey3, vKey2, keyPerm);

  // load the 3rd round key to vKey3
  __ li (keypos, 32);
  __ lvx (vKey4, keypos, key);
  __ vec_perm (vKey3, vKey4, vKey3, keyPerm);

  // load the 2nd round key to vKey4
  __ li (keypos, 16);
  __ lvx (vKey5, keypos, key);
  __ vec_perm (vKey4, vKey5, vKey4, keyPerm);

  // load the 1st round key to vKey5
  __ lvx (vTmp1, key);
  __ vec_perm (vKey5, vTmp1, vKey5, keyPerm);

  // last 5th - 1st rounds
  __ vncipher (vRet, vRet, vKey1);
  __ vncipher (vRet, vRet, vKey2);
  __ vncipher (vRet, vRet, vKey3);
  __ vncipher (vRet, vRet, vKey4);
  __ vncipherlast (vRet, vRet, vKey5);

#ifdef VM_LITTLE_ENDIAN
  // toPerm = 0x0F0E0D0C0B0A09080706050403020100
  __ lvsl (toPerm, keypos); // keypos is a multiple of 16
  __ vxor (toPerm, toPerm, fSplt);

  // Swap Bytes
  __ vperm (vRet, vRet, vRet, toPerm);
#endif

  // store result (unaligned)
  // Note: We can't use a read-modify-write sequence which touches additional Bytes.
  // The result is therefore moved to two GPRs and written with two 8-byte stores.
  Register lo = temp, hi = fifteen; // Reuse
  __ vsldoi (vTmp1, vRet, vRet, 8);
  __ mfvrd (hi, vRet);
  __ mfvrd (lo, vTmp1);
  __ std (hi, 0 LITTLE_ENDIAN_ONLY(+ 8), to);
  __ std (lo, 0 BIG_ENDIAN_ONLY(+ 8), to);

  __ blr();

#ifdef ASSERT
  // Out-of-line target for the key length check above.
  __ bind(L_error);
  __ stop("aescrypt_decryptBlock: invalid key length");
#endif
  return start;
}
3199
3200 address generate_sha256_implCompress(StubId stub_id) {
3201 assert(UseSHA, "need SHA instructions");
3202 bool multi_block;
3203 switch (stub_id) {
3204 case StubId::stubgen_sha256_implCompress_id:
3205 multi_block = false;
3206 break;
3207 case StubId::stubgen_sha256_implCompressMB_id:
3208 multi_block = true;
3209 break;
3210 default:
3211 ShouldNotReachHere();
3212 }
3213 StubCodeMark mark(this, stub_id);
3214 address start = __ function_entry();
3215
3216 __ sha256 (multi_block);
3217 __ blr();
3218
3219 return start;
3220 }
3221
3222 address generate_sha512_implCompress(StubId stub_id) {
3223 assert(UseSHA, "need SHA instructions");
3224 bool multi_block;
3225 switch (stub_id) {
3226 case StubId::stubgen_sha512_implCompress_id:
3227 multi_block = false;
3228 break;
3229 case StubId::stubgen_sha512_implCompressMB_id:
3230 multi_block = true;
3231 break;
3232 default:
3233 ShouldNotReachHere();
3234 }
3235 StubCodeMark mark(this, stub_id);
3236 address start = __ function_entry();
3237
3238 __ sha512 (multi_block);
3239 __ blr();
3240
3241 return start;
3242 }
3243
3244 address generate_data_cache_writeback() {
3245 const Register cacheline = R3_ARG1;
3246 StubId stub_id = StubId::stubgen_data_cache_writeback_id;
3247 StubCodeMark mark(this, stub_id);
3248 address start = __ pc();
3249
3250 __ cache_wb(Address(cacheline));
3251 __ blr();
3252
3253 return start;
3254 }
3255
3256 address generate_data_cache_writeback_sync() {
3257 const Register is_presync = R3_ARG1;
3258 Register temp = R4;
3259 Label SKIP;
3260 StubId stub_id = StubId::stubgen_data_cache_writeback_sync_id;
3261 StubCodeMark mark(this, stub_id);
3262 address start = __ pc();
3263
3264 __ andi_(temp, is_presync, 1);
3265 __ bne(CR0, SKIP);
3266 __ cache_wbsync(false); // post sync => emit 'sync'
3267 __ bind(SKIP); // pre sync => emit nothing
3268 __ blr();
3269
3270 return start;
3271 }
3272
// Generates all arraycopy related stubs (plus the fill / setmemory stubs when
// COMPILER2 is built in) and publishes their entry points in StubRoutines.
// The generation order below is significant, see the notes in the body.
void generate_arraycopy_stubs() {
  // generate the common exit first so later stubs can rely on it if
  // they want an UnsafeMemoryAccess exit non-local to the stub
  StubRoutines::_unsafecopy_common_exit = generate_unsafecopy_common_error_exit();
  // register the stub as the default exit with class UnsafeMemoryAccess
  UnsafeMemoryAccess::set_common_exit_stub_pc(StubRoutines::_unsafecopy_common_exit);

  // Note: the disjoint stubs must be generated first, some of the
  // conjoint stubs use them.

  // Note: chaining of stubs does not rely on branching to an
  // auxiliary post-push entry because none of the stubs
  // push/pop a frame.

  // non-aligned disjoint versions
  StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(StubId::stubgen_jbyte_disjoint_arraycopy_id);
  StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(StubId::stubgen_jshort_disjoint_arraycopy_id);
  StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(StubId::stubgen_jint_disjoint_arraycopy_id);
  StubRoutines::_jlong_disjoint_arraycopy = generate_disjoint_long_copy(StubId::stubgen_jlong_disjoint_arraycopy_id);
  StubRoutines::_oop_disjoint_arraycopy = generate_disjoint_oop_copy(StubId::stubgen_oop_disjoint_arraycopy_id);
  StubRoutines::_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy(StubId::stubgen_oop_disjoint_arraycopy_uninit_id);

  // aligned disjoint versions
  StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id);
  StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id);
  StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(StubId::stubgen_arrayof_jint_disjoint_arraycopy_id);
  StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id);
  StubRoutines::_arrayof_oop_disjoint_arraycopy = generate_disjoint_oop_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_id);
  // NOTE(review): this reuses the non-arrayof "uninit" stub id that was already
  // generated above instead of an arrayof-specific id - confirm this is intended.
  StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy(StubId::stubgen_oop_disjoint_arraycopy_uninit_id);

  // non-aligned conjoint versions
  StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(StubId::stubgen_jbyte_arraycopy_id);
  StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(StubId::stubgen_jshort_arraycopy_id);
  StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(StubId::stubgen_jint_arraycopy_id);
  StubRoutines::_jlong_arraycopy = generate_conjoint_long_copy(StubId::stubgen_jlong_arraycopy_id);
  StubRoutines::_oop_arraycopy = generate_conjoint_oop_copy(StubId::stubgen_oop_arraycopy_id);
  StubRoutines::_oop_arraycopy_uninit = generate_conjoint_oop_copy(StubId::stubgen_oop_arraycopy_uninit_id);

  // aligned conjoint versions
  StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(StubId::stubgen_arrayof_jbyte_arraycopy_id);
  StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(StubId::stubgen_arrayof_jshort_arraycopy_id);
  StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(StubId::stubgen_arrayof_jint_arraycopy_id);
  StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(StubId::stubgen_arrayof_jlong_arraycopy_id);
  StubRoutines::_arrayof_oop_arraycopy = generate_conjoint_oop_copy(StubId::stubgen_arrayof_oop_arraycopy_id);
  // NOTE(review): the "uninit" entry is generated with the same stub id as the
  // line above (no "_uninit" id is used) - confirm this is intended.
  StubRoutines::_arrayof_oop_arraycopy_uninit = generate_conjoint_oop_copy(StubId::stubgen_arrayof_oop_arraycopy_id);

  // special/generic versions
  StubRoutines::_checkcast_arraycopy = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_id);
  StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_uninit_id);

  // The unsafe and generic stubs receive the entry points of the conjoint
  // stubs generated above.
  StubRoutines::_unsafe_arraycopy = generate_unsafe_copy(STUB_ENTRY(jbyte_arraycopy()),
                                                         STUB_ENTRY(jshort_arraycopy()),
                                                         STUB_ENTRY(jint_arraycopy()),
                                                         STUB_ENTRY(jlong_arraycopy()));
  StubRoutines::_generic_arraycopy = generate_generic_copy(STUB_ENTRY(jbyte_arraycopy()),
                                                           STUB_ENTRY(jshort_arraycopy()),
                                                           STUB_ENTRY(jint_arraycopy()),
                                                           STUB_ENTRY(oop_arraycopy()),
                                                           STUB_ENTRY(oop_disjoint_arraycopy()),
                                                           STUB_ENTRY(jlong_arraycopy()),
                                                           STUB_ENTRY(checkcast_arraycopy()));

  // fill routines
#ifdef COMPILER2
  if (OptimizeFill) {
    StubRoutines::_jbyte_fill = generate_fill(StubId::stubgen_jbyte_fill_id);
    StubRoutines::_jshort_fill = generate_fill(StubId::stubgen_jshort_fill_id);
    StubRoutines::_jint_fill = generate_fill(StubId::stubgen_jint_fill_id);
    StubRoutines::_arrayof_jbyte_fill = generate_fill(StubId::stubgen_arrayof_jbyte_fill_id);
    StubRoutines::_arrayof_jshort_fill = generate_fill(StubId::stubgen_arrayof_jshort_fill_id);
    StubRoutines::_arrayof_jint_fill = generate_fill(StubId::stubgen_arrayof_jint_fill_id);
  }
  // NOTE(review): _jbyte_fill is only assigned above when OptimizeFill is set;
  // verify that generate_unsafe_setmemory copes with the value it has otherwise.
  StubRoutines::_unsafe_setmemory = generate_unsafe_setmemory(StubRoutines::_jbyte_fill);
#endif
}
3348
3349 // Stub for BigInteger::multiplyToLen()
3350 //
3351 // Arguments:
3352 //
3353 // Input:
3354 // R3 - x address
3355 // R4 - x length
3356 // R5 - y address
3357 // R6 - y length
3358 // R7 - z address
3359 //
3360 address generate_multiplyToLen() {
3361
3362 StubId stub_id = StubId::stubgen_multiplyToLen_id;
3363 StubCodeMark mark(this, stub_id);
3364
3365 address start = __ function_entry();
3366
3367 const Register x = R3;
3368 const Register xlen = R4;
3369 const Register y = R5;
3370 const Register ylen = R6;
3371 const Register z = R7;
3372
3373 const Register tmp1 = R2; // TOC not used.
3374 const Register tmp2 = R9;
3375 const Register tmp3 = R10;
3376 const Register tmp4 = R11;
3377 const Register tmp5 = R12;
3378
3379 // non-volatile regs
3380 const Register tmp6 = R31;
3381 const Register tmp7 = R30;
3382 const Register tmp8 = R29;
3383 const Register tmp9 = R28;
3384 const Register tmp10 = R27;
3385 const Register tmp11 = R26;
3386 const Register tmp12 = R25;
3387 const Register tmp13 = R24;
3388
3389 BLOCK_COMMENT("Entry:");
3390
3391 // C2 does not respect int to long conversion for stub calls.
3392 __ clrldi(xlen, xlen, 32);
3393 __ clrldi(ylen, ylen, 32);
3394
3395 // Save non-volatile regs (frameless).
3396 int current_offs = 8;
3397 __ std(R24, -current_offs, R1_SP); current_offs += 8;
3398 __ std(R25, -current_offs, R1_SP); current_offs += 8;
3399 __ std(R26, -current_offs, R1_SP); current_offs += 8;
3400 __ std(R27, -current_offs, R1_SP); current_offs += 8;
3401 __ std(R28, -current_offs, R1_SP); current_offs += 8;
3402 __ std(R29, -current_offs, R1_SP); current_offs += 8;
3403 __ std(R30, -current_offs, R1_SP); current_offs += 8;
3404 __ std(R31, -current_offs, R1_SP);
3405
3406 __ multiply_to_len(x, xlen, y, ylen, z, tmp1, tmp2, tmp3, tmp4, tmp5,
3407 tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13);
3408
3409 // Restore non-volatile regs.
3410 current_offs = 8;
3411 __ ld(R24, -current_offs, R1_SP); current_offs += 8;
3412 __ ld(R25, -current_offs, R1_SP); current_offs += 8;
3413 __ ld(R26, -current_offs, R1_SP); current_offs += 8;
3414 __ ld(R27, -current_offs, R1_SP); current_offs += 8;
3415 __ ld(R28, -current_offs, R1_SP); current_offs += 8;
3416 __ ld(R29, -current_offs, R1_SP); current_offs += 8;
3417 __ ld(R30, -current_offs, R1_SP); current_offs += 8;
3418 __ ld(R31, -current_offs, R1_SP);
3419
3420 __ blr(); // Return to caller.
3421
3422 return start;
3423 }
3424
3425 /**
3426 * Arguments:
3427 *
3428 * Input:
3429 * R3_ARG1 - out address
3430 * R4_ARG2 - in address
3431 * R5_ARG3 - offset
3432 * R6_ARG4 - len
3433 * R7_ARG5 - k
3434 * Output:
3435 * R3_RET - carry
3436 */
3437 address generate_mulAdd() {
3438 __ align(CodeEntryAlignment);
3439 StubId stub_id = StubId::stubgen_mulAdd_id;
3440 StubCodeMark mark(this, stub_id);
3441
3442 address start = __ function_entry();
3443
3444 // C2 does not sign extend signed parameters to full 64 bits registers:
3445 __ rldic (R5_ARG3, R5_ARG3, 2, 32); // always positive
3446 __ clrldi(R6_ARG4, R6_ARG4, 32); // force zero bits on higher word
3447 __ clrldi(R7_ARG5, R7_ARG5, 32); // force zero bits on higher word
3448
3449 __ muladd(R3_ARG1, R4_ARG2, R5_ARG3, R6_ARG4, R7_ARG5, R8, R9, R10);
3450
3451 // Moves output carry to return register
3452 __ mr (R3_RET, R10);
3453
3454 __ blr();
3455
3456 return start;
3457 }
3458
/**
 * Generates the squareToLen stub.
 *
 * Arguments:
 *
 * Input:
 *   R3_ARG1 - in address
 *   R4_ARG2 - in length
 *   R5_ARG3 - out address
 *   R6_ARG4 - out length
 * Output:
 *   R3_RET  - out address (copied from R5_ARG3 before returning)
 *
 * The generated code works in three phases, marked in the body:
 *   1. store the squares of the input words, shifted right by one bit,
 *   2. add in the off-diagonal sums (using muladd plus an inlined addOne),
 *   3. shift the result left by one bit and set the low bit from the
 *      last input word.
 *
 * Returns the entry point of the generated code.
 */
address generate_squareToLen() {
  __ align(CodeEntryAlignment);
  StubId stub_id = StubId::stubgen_squareToLen_id;
  StubCodeMark mark(this, stub_id);

  address start = __ function_entry();

  // args - higher word is cleaned (unsignedly) due to int to long casting
  const Register in = R3_ARG1;
  const Register in_len = R4_ARG2;
  __ clrldi(in_len, in_len, 32);
  const Register out = R5_ARG3;
  const Register out_len = R6_ARG4;
  __ clrldi(out_len, out_len, 32);

  // output
  const Register ret = R3_RET;

  // temporaries
  const Register lplw_s = R7;
  const Register in_aux = R8;
  const Register out_aux = R9;
  const Register piece = R10;
  const Register product = R14;
  const Register lplw = R15;
  const Register i_minus1 = R16;
  const Register carry = R17;
  const Register offset = R18;
  const Register off_aux = R19;
  const Register t = R20;
  const Register mlen = R21;
  const Register len = R22;
  const Register a = R23;
  const Register b = R24;
  const Register i = R25;
  const Register c = R26;
  const Register cs = R27;

  // Labels
  Label SKIP_LSHIFT, SKIP_DIAGONAL_SUM, SKIP_ADDONE, SKIP_LOOP_SQUARE;
  Label LOOP_LSHIFT, LOOP_DIAGONAL_SUM, LOOP_ADDONE, LOOP_SQUARE;

  // Save non-volatile regs (frameless).
  // R14..R28 go to the 15 slots directly below SP. R28 is saved and restored
  // although none of the temporaries declared above maps to it.
  int current_offs = -8;
  __ std(R28, current_offs, R1_SP); current_offs -= 8;
  __ std(R27, current_offs, R1_SP); current_offs -= 8;
  __ std(R26, current_offs, R1_SP); current_offs -= 8;
  __ std(R25, current_offs, R1_SP); current_offs -= 8;
  __ std(R24, current_offs, R1_SP); current_offs -= 8;
  __ std(R23, current_offs, R1_SP); current_offs -= 8;
  __ std(R22, current_offs, R1_SP); current_offs -= 8;
  __ std(R21, current_offs, R1_SP); current_offs -= 8;
  __ std(R20, current_offs, R1_SP); current_offs -= 8;
  __ std(R19, current_offs, R1_SP); current_offs -= 8;
  __ std(R18, current_offs, R1_SP); current_offs -= 8;
  __ std(R17, current_offs, R1_SP); current_offs -= 8;
  __ std(R16, current_offs, R1_SP); current_offs -= 8;
  __ std(R15, current_offs, R1_SP); current_offs -= 8;
  __ std(R14, current_offs, R1_SP);

  // Store the squares, right shifted one bit (i.e., divided by 2)
  // in_aux / out_aux start one element before the arrays because the loop
  // uses the update forms lwzu / stdu, which advance the pointer first.
  __ subi (out_aux, out, 8);
  __ subi (in_aux, in, 4);
  __ cmpwi (CR0, in_len, 0);
  // Initialize lplw outside of the loop
  __ xorr (lplw, lplw, lplw);
  __ ble (CR0, SKIP_LOOP_SQUARE); // in_len <= 0
  __ mtctr (in_len);

  __ bind(LOOP_SQUARE);
  __ lwzu (piece, 4, in_aux);
  __ mulld (product, piece, piece);
  // shift left 63 bits and only keep the MSB
  __ rldic (lplw_s, lplw, 63, 0);
  // remember the unshifted product; its lowest bit is carried into the next iteration
  __ mr (lplw, product);
  // shift right 1 bit without sign extension
  __ srdi (product, product, 1);
  // join them to the same register and store it
  __ orr (product, lplw_s, product);
#ifdef VM_LITTLE_ENDIAN
  // Swap low and high words for little endian
  __ rldicl (product, product, 32, 0);
#endif
  __ stdu (product, 8, out_aux);
  __ bdnz (LOOP_SQUARE);

  __ bind(SKIP_LOOP_SQUARE);

  // Add in off-diagonal sums
  __ cmpwi (CR0, in_len, 0);
  __ ble (CR0, SKIP_DIAGONAL_SUM);
  // Avoid CTR usage here in order to use it at mulAdd
  __ subi (i_minus1, in_len, 1);
  __ li (offset, 4);

  __ bind(LOOP_DIAGONAL_SUM);

  // off_aux = out_len * 4 - offset
  __ sldi (off_aux, out_len, 2);
  __ sub (off_aux, off_aux, offset);

  // len = i_minus1 (word count), mlen = i_minus1 * 4, t = word loaded from in + mlen
  __ mr (len, i_minus1);
  __ sldi (mlen, i_minus1, 2);
  __ lwzx (t, in, mlen);

  __ muladd (out, in, off_aux, len, t, a, b, carry);

  // begin<addOne>
  // off_aux = out_len*4 - 4 - mlen - offset*4 - 4;
  __ addi (mlen, mlen, 4);
  __ sldi (a, out_len, 2);
  __ subi (a, a, 4);
  __ sub (a, a, mlen);
  __ subi (off_aux, offset, 4);
  __ sub (off_aux, a, off_aux);

  // add the carry produced by muladd to the word at out + off_aux
  __ lwzx (b, off_aux, out);
  __ add (b, b, carry);
  __ stwx (b, off_aux, out);

  // if (((uint64_t)s >> 32) != 0) {
  __ srdi_ (a, b, 32);
  __ beq (CR0, SKIP_ADDONE);

  // while (--mlen >= 0) {
  __ bind(LOOP_ADDONE);
  __ subi (mlen, mlen, 4);
  __ cmpwi (CR0, mlen, 0);
  __ beq (CR0, SKIP_ADDONE);

  // if (--offset_aux < 0) { // Carry out of number
  __ subi (off_aux, off_aux, 4);
  __ cmpwi (CR0, off_aux, 0);
  __ blt (CR0, SKIP_ADDONE);

  // } else {
  // increment the next word; continue only while it wraps around to zero
  __ lwzx (b, off_aux, out);
  __ addi (b, b, 1);
  __ stwx (b, off_aux, out);
  __ cmpwi (CR0, b, 0);
  __ bne (CR0, SKIP_ADDONE);
  __ b (LOOP_ADDONE);

  __ bind(SKIP_ADDONE);
  // } } } end<addOne>

  // next diagonal: offset += 8, i_minus1 -= 1; loop while i_minus1 >= 0
  __ addi (offset, offset, 8);
  __ subi (i_minus1, i_minus1, 1);
  __ cmpwi (CR0, i_minus1, 0);
  __ bge (CR0, LOOP_DIAGONAL_SUM);

  __ bind(SKIP_DIAGONAL_SUM);

  // Shift back up and set low bit
  // Shifts 1 bit left up to len positions. Assumes no leading zeros
  // begin<primitiveLeftShift>
  __ cmpwi (CR0, out_len, 0);
  __ ble (CR0, SKIP_LSHIFT);
  __ li (i, 0);
  __ lwz (c, 0, out);
  // the loop runs out_len - 1 times; the last word is handled after the loop
  __ subi (b, out_len, 1);
  __ mtctr (b);

  __ bind(LOOP_LSHIFT);
  // b = current word, c = following word (loaded from out + i + 4)
  __ mr (b, c);
  __ addi (cs, i, 4);
  __ lwzx (c, out, cs);

  // out[i] = (b << 1) | (top bit of the following word)
  __ sldi (b, b, 1);
  __ srwi (cs, c, 31);
  __ orr (b, b, cs);
  __ stwx (b, i, out);

  __ addi (i, i, 4);
  __ bdnz (LOOP_LSHIFT);

  // last word: shifted left by one with nothing shifted in
  __ sldi (c, out_len, 2);
  __ subi (c, c, 4);
  __ lwzx (b, out, c);
  __ sldi (b, b, 1);
  __ stwx (b, out, c);

  __ bind(SKIP_LSHIFT);
  // end<primitiveLeftShift>

  // Set low bit
  // i = last input word, b = last output word; out[last] = b | (i & 1)
  __ sldi (i, in_len, 2);
  __ subi (i, i, 4);
  __ lwzx (i, in, i);
  __ sldi (c, out_len, 2);
  __ subi (c, c, 4);
  __ lwzx (b, out, c);

  __ andi (i, i, 1);
  __ orr (i, b, i);

  __ stwx (i, out, c);

  // Restore non-volatile regs.
  current_offs = -8;
  __ ld(R28, current_offs, R1_SP); current_offs -= 8;
  __ ld(R27, current_offs, R1_SP); current_offs -= 8;
  __ ld(R26, current_offs, R1_SP); current_offs -= 8;
  __ ld(R25, current_offs, R1_SP); current_offs -= 8;
  __ ld(R24, current_offs, R1_SP); current_offs -= 8;
  __ ld(R23, current_offs, R1_SP); current_offs -= 8;
  __ ld(R22, current_offs, R1_SP); current_offs -= 8;
  __ ld(R21, current_offs, R1_SP); current_offs -= 8;
  __ ld(R20, current_offs, R1_SP); current_offs -= 8;
  __ ld(R19, current_offs, R1_SP); current_offs -= 8;
  __ ld(R18, current_offs, R1_SP); current_offs -= 8;
  __ ld(R17, current_offs, R1_SP); current_offs -= 8;
  __ ld(R16, current_offs, R1_SP); current_offs -= 8;
  __ ld(R15, current_offs, R1_SP); current_offs -= 8;
  __ ld(R14, current_offs, R1_SP);

  // return the out address
  __ mr(ret, out);
  __ blr();

  return start;
}
3688
3689 /**
3690 * Arguments:
3691 *
3692 * Inputs:
3693 * R3_ARG1 - int crc
3694 * R4_ARG2 - byte* buf
3695 * R5_ARG3 - int length (of buffer)
3696 *
3697 * scratch:
3698 * R2, R6-R12
3699 *
3700 * Output:
3701 * R3_RET - int crc result
3702 */
3703 // Compute CRC32 function.
3704 address generate_CRC32_updateBytes(StubId stub_id) {
3705 bool is_crc32c;
3706 switch (stub_id) {
3707 case StubId::stubgen_updateBytesCRC32_id:
3708 is_crc32c = false;
3709 break;
3710 case StubId::stubgen_updateBytesCRC32C_id:
3711 is_crc32c = true;
3712 break;
3713 default:
3714 ShouldNotReachHere();
3715 }
3716 __ align(CodeEntryAlignment);
3717 StubCodeMark mark(this, stub_id);
3718 address start = __ function_entry(); // Remember stub start address (is rtn value).
3719 __ crc32(R3_ARG1, R4_ARG2, R5_ARG3, R2, R6, R7, R8, R9, R10, R11, R12, is_crc32c);
3720 __ blr();
3721 return start;
3722 }
3723
3724 address generate_floatToFloat16() {
3725 __ align(CodeEntryAlignment);
3726 StubCodeMark mark(this, "StubRoutines", "floatToFloat16");
3727 address start = __ function_entry();
3728 __ f2hf(R3_RET, F1_ARG1, F0);
3729 __ blr();
3730 return start;
3731 }
3732
3733 address generate_float16ToFloat() {
3734 __ align(CodeEntryAlignment);
3735 StubCodeMark mark(this, "StubRoutines", "float16ToFloat");
3736 address start = __ function_entry();
3737 __ hf2f(F1_RET, R3_ARG1);
3738 __ blr();
3739 return start;
3740 }
3741
3742 address generate_method_entry_barrier() {
3743 __ align(CodeEntryAlignment);
3744 StubId stub_id = StubId::stubgen_method_entry_barrier_id;
3745 StubCodeMark mark(this, stub_id);
3746
3747 address stub_address = __ pc();
3748
3749 int nbytes_save = MacroAssembler::num_volatile_regs * BytesPerWord;
3750 __ save_volatile_gprs(R1_SP, -nbytes_save, true);
3751
3752 // Link register points to instruction in prologue of the guarded nmethod.
3753 // As the stub requires one layer of indirection (argument is of type address* and not address),
3754 // passing the link register's value directly doesn't work.
3755 // Since we have to save the link register on the stack anyway, we calculate the corresponding stack address
3756 // and pass that one instead.
3757 __ addi(R3_ARG1, R1_SP, _abi0(lr));
3758
3759 __ save_LR(R0);
3760 __ push_frame_reg_args(nbytes_save, R0);
3761
3762 __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSetNMethod::nmethod_stub_entry_barrier));
3763 __ mr(R0, R3_RET);
3764
3765 __ pop_frame();
3766 __ restore_LR(R3_RET /* used as tmp register */);
3767 __ restore_volatile_gprs(R1_SP, -nbytes_save, true);
3768
3769 __ cmpdi(CR0, R0, 0);
3770
3771 // Return to prologue if no deoptimization is required (bnelr)
3772 __ bclr(Assembler::bcondCRbiIs1, Assembler::bi0(CR0, Assembler::equal), Assembler::bhintIsTaken);
3773
3774 // Deoptimization required.
3775 // For actually handling the deoptimization, the 'wrong method stub' is invoked.
3776 __ load_const_optimized(R0, SharedRuntime::get_handle_wrong_method_stub());
3777 __ mtctr(R0);
3778
3779 // Pop the frame built in the prologue.
3780 __ pop_frame();
3781
3782 // Restore link register. Required as the 'wrong method stub' needs the caller's frame
3783 // to properly deoptimize this method (e.g. by re-resolving the call site for compiled methods).
3784 // This method's prologue is aborted.
3785 __ restore_LR(R0);
3786
3787 __ bctr();
3788 return stub_address;
3789 }
3790
3791 #ifdef VM_LITTLE_ENDIAN
3792 // The following Base64 decode intrinsic is based on an algorithm outlined
3793 // in here:
3794 // http://0x80.pl/notesen/2016-01-17-sse-base64-decoding.html
3795 // in the section titled "Vector lookup (pshufb with bitmask)"
3796 //
3797 // This implementation differs in the following ways:
// * Power AltiVec VMX and VSX instructions are used instead of Intel SSE
//   instructions. It turns out that some of the vector operations
//   needed in the algorithm require fewer AltiVec instructions.
3801 // * The algorithm in the above mentioned paper doesn't handle the
3802 // Base64-URL variant in RFC 4648. Adjustments to both the code and to two
3803 // lookup tables are needed for this.
3804 // * The "Pack" section of the code is a complete rewrite for Power because we
3805 // can utilize better instructions for this step.
3806 //
3807
// Offsets per group of Base64 characters.
// Each value is (decoded value of the group's first character) minus that
// character's code, reduced to 8 bits and expressed as a signed char.
// Uppercase
#define UC (signed char)((-'A' + 0) & 0xff)
// Lowercase
#define LC (signed char)((-'a' + 26) & 0xff)
// Digits
#define DIG (signed char)((-'0' + 52) & 0xff)
// Plus sign (URL = 0)
#define PLS (signed char)((-'+' + 62) & 0xff)
// Hyphen (URL = 1)
#define HYP (signed char)((-'-' + 62) & 0xff)
// Slash (URL = 0)
#define SLS (signed char)((-'/' + 63) & 0xff)
// Underscore (URL = 1)
#define US (signed char)((-'_' + 63) & 0xff)

// For P10 (or later) only
// VB64(x) tags the 6-bit value x with the VALID_B64 marker bit. The argument
// is parenthesized so that expressions containing operators of lower
// precedence than '|' (e.g. '?:') expand as intended.
#define VALID_B64 0x80
#define VB64(x) (VALID_B64 | (x))

// Byte offset of member x within the constant_block struct.
#define BLK_OFFSETOF(x) (offsetof(constant_block, x))

// In little-endian mode, the lxv instruction loads the element at EA into
// element 15 of the vector register, EA+1 goes into element 14, and so
// on.
//
// To make a look-up table easier to read, ARRAY_TO_LXV_ORDER reverses the
// order of the elements in a vector initialization.
#define ARRAY_TO_LXV_ORDER(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15) e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0
3837
3838 //
3839 // Base64 decodeBlock intrinsic
//
// Emits the decodeBlock stub and returns its entry address (the value
// obtained from __ function_entry()).
//
// Register arguments of the generated stub (see the Register declarations
// in the body):
//   R3_ARG1 (s)      source starting address of Base64 characters
//   R4_ARG2 (sp)     source offset
//   R5_ARG3 (sl)     source length
//   R6_ARG4 (d)      destination address
//   R7_ARG5 (dp)     destination offset
//   R8_ARG6 (isURL)  non-zero selects the RFC 4648 base64url alphabet
//   R9_ARG7 (isMIME) not used
//
// The stub decodes 16 Base64 characters into 12 bytes per loop pass and
// returns in R3_RET the number of bytes produced (out - d - dp). It returns
// 0 when (sl - sp - 12) >> 4 is not positive, and it leaves the loop early,
// without storing the current block, as soon as a block contains a
// character that fails the Base64 validity check.
3840 address generate_base64_decodeBlock() {
3841 __ align(CodeEntryAlignment);
3842 StubId stub_id = StubId::stubgen_base64_decodeBlock_id;
3843 StubCodeMark mark(this, stub_id);
3844 address start = __ function_entry();
3845
// Every table below is 16 bytes wide so that a single lxv loads a whole
// table; the displacement of each table from const_ptr is taken with
// BLK_OFFSETOF().
3846 typedef struct {
3847 signed char offsetLUT_val[16];
3848 signed char offsetLUT_URL_val[16];
3849 unsigned char maskLUT_val[16];
3850 unsigned char maskLUT_URL_val[16];
3851 unsigned char bitposLUT_val[16];
3852 unsigned char table_32_47_val[16];
3853 unsigned char table_32_47_URL_val[16];
3854 unsigned char table_48_63_val[16];
3855 unsigned char table_64_79_val[16];
3856 unsigned char table_80_95_val[16];
3857 unsigned char table_80_95_URL_val[16];
3858 unsigned char table_96_111_val[16];
3859 unsigned char table_112_127_val[16];
3860 unsigned char pack_lshift_val[16];
3861 unsigned char pack_rshift_val[16];
3862 unsigned char pack_permute_val[16];
3863 } constant_block;
3864
3865 alignas(16) static const constant_block const_block = {
3866
// Indexed by the upper nibble of the character: the value to add to the
// character to obtain its 6-bit Base64 value.
3867 .offsetLUT_val = {
3868 ARRAY_TO_LXV_ORDER(
3869 0, 0, PLS, DIG, UC, UC, LC, LC,
3870 0, 0, 0, 0, 0, 0, 0, 0 ) },
3871
3872 .offsetLUT_URL_val = {
3873 ARRAY_TO_LXV_ORDER(
3874 0, 0, HYP, DIG, UC, UC, LC, LC,
3875 0, 0, 0, 0, 0, 0, 0, 0 ) },
3876
// Indexed by the lower nibble of the character; the bit selected through
// bitposLUT (by upper nibble) is non-zero for valid Base64 characters.
3877 .maskLUT_val = {
3878 ARRAY_TO_LXV_ORDER(
3879 /* 0 */ (unsigned char)0b10101000,
3880 /* 1 .. 9 */ (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000,
3881 (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000,
3882 (unsigned char)0b11111000,
3883 /* 10 */ (unsigned char)0b11110000,
3884 /* 11 */ (unsigned char)0b01010100,
3885 /* 12 .. 14 */ (unsigned char)0b01010000, (unsigned char)0b01010000, (unsigned char)0b01010000,
3886 /* 15 */ (unsigned char)0b01010100 ) },
3887
3888 .maskLUT_URL_val = {
3889 ARRAY_TO_LXV_ORDER(
3890 /* 0 */ (unsigned char)0b10101000,
3891 /* 1 .. 9 */ (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000,
3892 (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000,
3893 (unsigned char)0b11111000,
3894 /* 10 */ (unsigned char)0b11110000,
3895 /* 11 .. 12 */ (unsigned char)0b01010000, (unsigned char)0b01010000,
3896 /* 13 */ (unsigned char)0b01010100,
3897 /* 14 */ (unsigned char)0b01010000,
3898 /* 15 */ (unsigned char)0b01110000 ) },
3899
3900 .bitposLUT_val = {
3901 ARRAY_TO_LXV_ORDER(
3902 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, (unsigned char)0x80,
3903 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 ) },
3904
3905 // In the following table_*_val constants, a 0 value means the
3906 // character is not in the Base64 character set
3907 .table_32_47_val = {
3908 ARRAY_TO_LXV_ORDER (
3909 /* space .. '*' = 0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* '+' = 62 */ VB64(62), /* ',' .. '.' = 0 */ 0, 0, 0, /* '/' = 63 */ VB64(63) ) },
3910
3911 .table_32_47_URL_val = {
3912 ARRAY_TO_LXV_ORDER(
3913 /* space .. ',' = 0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* '-' = 62 */ VB64(62), /* '.' .. '/' */ 0, 0 ) },
3914
3915 .table_48_63_val = {
3916 ARRAY_TO_LXV_ORDER(
3917 /* '0' .. '9' = 52 .. 61 */ VB64(52), VB64(53), VB64(54), VB64(55), VB64(56), VB64(57), VB64(58), VB64(59), VB64(60), VB64(61),
3918 /* ':' .. '?' = 0 */ 0, 0, 0, 0, 0, 0 ) },
3919
3920 .table_64_79_val = {
3921 ARRAY_TO_LXV_ORDER(
3922 /* '@' = 0 */ 0, /* 'A' .. 'O' = 0 .. 14 */ VB64(0), VB64(1), VB64(2), VB64(3), VB64(4), VB64(5), VB64(6), VB64(7), VB64(8),
3923 VB64(9), VB64(10), VB64(11), VB64(12), VB64(13), VB64(14) ) },
3924
3925 .table_80_95_val = {
3926 ARRAY_TO_LXV_ORDER(/* 'P' .. 'Z' = 15 .. 25 */ VB64(15), VB64(16), VB64(17), VB64(18), VB64(19), VB64(20), VB64(21), VB64(22),
3927 VB64(23), VB64(24), VB64(25), /* '[' .. '_' = 0 */ 0, 0, 0, 0, 0 ) },
3928
3929 .table_80_95_URL_val = {
3930 ARRAY_TO_LXV_ORDER(/* 'P' .. 'Z' = 15 .. 25 */ VB64(15), VB64(16), VB64(17), VB64(18), VB64(19), VB64(20), VB64(21), VB64(22),
3931 VB64(23), VB64(24), VB64(25), /* '[' .. '^' = 0 */ 0, 0, 0, 0, /* '_' = 63 */ VB64(63) ) },
3932
3933 .table_96_111_val = {
3934 ARRAY_TO_LXV_ORDER(/* '`' = 0 */ 0, /* 'a' .. 'o' = 26 .. 40 */ VB64(26), VB64(27), VB64(28), VB64(29), VB64(30), VB64(31),
3935 VB64(32), VB64(33), VB64(34), VB64(35), VB64(36), VB64(37), VB64(38), VB64(39), VB64(40) ) },
3936
3937 .table_112_127_val = {
3938 ARRAY_TO_LXV_ORDER(/* 'p' .. 'z' = 41 .. 51 */ VB64(41), VB64(42), VB64(43), VB64(44), VB64(45), VB64(46), VB64(47), VB64(48),
3939 VB64(49), VB64(50), VB64(51), /* '{' .. DEL = 0 */ 0, 0, 0, 0, 0 ) },
3940
3941 .pack_lshift_val = {
3942 ARRAY_TO_LXV_ORDER(
3943 0, 6, 4, 2, 0, 6, 4, 2, 0, 6, 4, 2, 0, 6, 4, 2 ) },
3944
3945 .pack_rshift_val = {
3946 ARRAY_TO_LXV_ORDER(
3947 0, 2, 4, 0, 0, 2, 4, 0, 0, 2, 4, 0, 0, 2, 4, 0 ) },
3948
3949 // The first 4 index values are "don't care" because
3950 // we only use the first 12 bytes of the vector,
3951 // which are decoded from 16 bytes of Base64 characters.
3952 .pack_permute_val = {
3953 ARRAY_TO_LXV_ORDER(
3954 0, 0, 0, 0,
3955 0, 1, 2,
3956 4, 5, 6,
3957 8, 9, 10,
3958 12, 13, 14 ) }
3959 };
3960
3961 const unsigned block_size = 16; // number of bytes to process in each pass through the loop
// NOTE: block_size itself is not referenced below; only block_size_shift is
// used, and the loop advances the input pointer by the literal 16.
3962 const unsigned block_size_shift = 4;
3963
3964 // According to the ELF V2 ABI, registers r3-r12 are volatile and available for use without save/restore
3965 Register s = R3_ARG1; // source starting address of Base64 characters
3966 Register sp = R4_ARG2; // source offset
3967 Register sl = R5_ARG3; // source length = # of Base64 characters to be processed
3968 Register d = R6_ARG4; // destination address
3969 Register dp = R7_ARG5; // destination offset
3970 Register isURL = R8_ARG6; // boolean, if non-zero indicates use of RFC 4648 base64url encoding
3971 Register isMIME = R9_ARG7; // boolean, if non-zero indicates use of RFC 2045 MIME encoding - not used
3972
3973 // Local variables
3974 Register const_ptr = R9; // used for loading constants
3975 Register tmp_reg = R10; // used for speeding up load_constant_optimized()
3976
3977 // Re-use R9 and R10 to avoid using non-volatile registers (requires save/restore)
// (out and in are first written after the last load through const_ptr.)
3978 Register out = R9; // moving out (destination) pointer
3979 Register in = R10; // moving in (source) pointer
3980
3981 // Volatile VSRS are 0..13, 32..51 (VR0..VR13)
// NOTE(review): VSR32..51 would correspond to VR0..VR19 rather than
// VR0..VR13 - confirm against the ABI document.
3982 // VR Constants
3983 VectorRegister vec_0s = VR0;
3984 VectorRegister vec_4s = VR1;
3985 VectorRegister vec_8s = VR2;
3986 VectorRegister vec_special_case_char = VR3;
3987 VectorRegister pack_rshift = VR4;
3988 VectorRegister pack_lshift = VR5;
3989
3990 // VSR Constants
3991 VectorSRegister offsetLUT = VSR0;
3992 VectorSRegister maskLUT = VSR1;
3993 VectorSRegister bitposLUT = VSR2;
3994 VectorSRegister vec_0xfs = VSR3;
3995 VectorSRegister vec_special_case_offset = VSR4;
3996 VectorSRegister pack_permute = VSR5;
3997
3998 // P10 (or later) VSR lookup constants
// These share VSR0..VSR4 with the constants above; which set is live
// depends on the PowerArchitecturePPC64 checks below. table_112_127 skips
// VSR5 because pack_permute is needed on both paths.
3999 VectorSRegister table_32_47 = VSR0;
4000 VectorSRegister table_48_63 = VSR1;
4001 VectorSRegister table_64_79 = VSR2;
4002 VectorSRegister table_80_95 = VSR3;
4003 VectorSRegister table_96_111 = VSR4;
4004 VectorSRegister table_112_127 = VSR6;
4005
4006 // Data read in and later converted
4007 VectorRegister input = VR6;
4008 // Variable for testing Base64 validity
4009 VectorRegister non_match = VR10;
4010
4011 // P9 VR Variables for lookup
4012 VectorRegister higher_nibble = VR7;
4013 VectorRegister eq_special_case_char = VR8;
4014 VectorRegister offsets = VR9;
4015
4016 // P9 VSR lookup variables
4017 VectorSRegister bit = VSR6;
4018 VectorSRegister lower_nibble = VSR7;
4019 VectorSRegister M = VSR8;
4020
4021 // P10 (or later) VSR lookup variables
4022 VectorSRegister xlate_a = VSR7;
4023 VectorSRegister xlate_b = VSR8;
4024
4025 // Variables for pack
4026 // VR
4027 VectorRegister l = VR7; // reuse higher_nibble's register
4028 VectorRegister r = VR8; // reuse eq_special_case_char's register
4029 VectorRegister gathered = VR10; // reuse non_match's register
4030
4031 Label not_URL, calculate_size, loop_start, loop_exit, return_zero;
4032
4033 // The upper 32 bits of the non-pointer parameter registers are not
4034 // guaranteed to be zero, so mask off those upper bits.
4035 __ clrldi(sp, sp, 32);
4036 __ clrldi(sl, sl, 32);
4037
4038 // Don't handle the last 4 characters of the source, because this
4039 // VSX-based algorithm doesn't handle padding characters. Also the
4040 // vector code will always write 16 bytes of decoded data on each pass,
4041 // but only the first 12 of those 16 bytes are valid data (16 base64
4042 // characters become 12 bytes of binary data), so for this reason we
4043 // need to subtract an additional 8 bytes from the source length, in
4044 // order not to write past the end of the destination buffer. The
4045 // result of this subtraction implies that a Java function in the
4046 // Base64 class will be used to process the last 12 characters.
4047 __ sub(sl, sl, sp);
4048 __ subi(sl, sl, 12);
4049
4050 // Load CTR with the number of passes through the loop
4051 // = sl >> block_size_shift. After the shift, if sl <= 0, there's too
4052 // little data to be processed by this intrinsic.
4053 __ srawi_(sl, sl, block_size_shift);
4054 __ ble(CR0, return_zero);
4055 __ mtctr(sl);
4056
4057 // Clear the other two parameter registers upper 32 bits.
4058 __ clrldi(isURL, isURL, 32);
4059 __ clrldi(dp, dp, 32);
4060
4061 // Load constant vec registers that need to be loaded from memory
4062 __ load_const_optimized(const_ptr, (address)&const_block, tmp_reg);
// bitposLUT shares VSR2 with table_64_79: on the P10 path this value is
// overwritten by the table load below, and on the other path the same
// load is issued a second time.
4063 __ lxv(bitposLUT, BLK_OFFSETOF(bitposLUT_val), const_ptr);
4064 __ lxv(pack_rshift->to_vsr(), BLK_OFFSETOF(pack_rshift_val), const_ptr);
4065 __ lxv(pack_lshift->to_vsr(), BLK_OFFSETOF(pack_lshift_val), const_ptr);
4066 __ lxv(pack_permute, BLK_OFFSETOF(pack_permute_val), const_ptr);
4067
4068 // Splat the constants that can use xxspltib
4069 __ xxspltib(vec_0s->to_vsr(), 0);
4070 __ xxspltib(vec_8s->to_vsr(), 8);
4071 if (PowerArchitecturePPC64 >= 10) {
4072 // Using VALID_B64 for the offsets effectively strips the upper bit
4073 // of each byte that was selected from the table. Setting the upper
4074 // bit gives us a way to distinguish between the 6-bit value of 0
4075 // from an error code of 0, which will happen if the character is
4076 // outside the range of the lookup, or is an illegal Base64
4077 // character, such as %.
4078 __ xxspltib(offsets->to_vsr(), VALID_B64);
4079
4080 __ lxv(table_48_63, BLK_OFFSETOF(table_48_63_val), const_ptr);
4081 __ lxv(table_64_79, BLK_OFFSETOF(table_64_79_val), const_ptr);
4082 __ lxv(table_80_95, BLK_OFFSETOF(table_80_95_val), const_ptr);
4083 __ lxv(table_96_111, BLK_OFFSETOF(table_96_111_val), const_ptr);
4084 __ lxv(table_112_127, BLK_OFFSETOF(table_112_127_val), const_ptr);
4085 } else {
4086 __ xxspltib(vec_4s->to_vsr(), 4);
4087 __ xxspltib(vec_0xfs, 0xf);
4088 __ lxv(bitposLUT, BLK_OFFSETOF(bitposLUT_val), const_ptr); // same value as the load above
4089 }
4090
4091 // The rest of the constants use different values depending on the
4092 // setting of isURL
4093 __ cmpwi(CR0, isURL, 0);
4094 __ beq(CR0, not_URL);
4095
4096 // isURL != 0 (true)
4097 if (PowerArchitecturePPC64 >= 10) {
4098 __ lxv(table_32_47, BLK_OFFSETOF(table_32_47_URL_val), const_ptr);
4099 __ lxv(table_80_95, BLK_OFFSETOF(table_80_95_URL_val), const_ptr);
4100 } else {
4101 __ lxv(offsetLUT, BLK_OFFSETOF(offsetLUT_URL_val), const_ptr);
4102 __ lxv(maskLUT, BLK_OFFSETOF(maskLUT_URL_val), const_ptr);
4103 __ xxspltib(vec_special_case_char->to_vsr(), '_');
4104 __ xxspltib(vec_special_case_offset, (unsigned char)US);
4105 }
4106 __ b(calculate_size);
4107
4108 // isURL = 0 (false)
4109 __ bind(not_URL);
4110 if (PowerArchitecturePPC64 >= 10) {
4111 __ lxv(table_32_47, BLK_OFFSETOF(table_32_47_val), const_ptr);
4112 __ lxv(table_80_95, BLK_OFFSETOF(table_80_95_val), const_ptr);
4113 } else {
4114 __ lxv(offsetLUT, BLK_OFFSETOF(offsetLUT_val), const_ptr);
4115 __ lxv(maskLUT, BLK_OFFSETOF(maskLUT_val), const_ptr);
4116 __ xxspltib(vec_special_case_char->to_vsr(), '/');
4117 __ xxspltib(vec_special_case_offset, (unsigned char)SLS);
4118 }
4119
4120 __ bind(calculate_size);
4121
4122 // out starts at d + dp
4123 __ add(out, d, dp);
4124
4125 // in starts at s + sp
4126 __ add(in, s, sp);
4127
4128 __ align(32);
4129 __ bind(loop_start);
4130 __ lxv(input->to_vsr(), 0, in); // offset=0
4131
4132 //
4133 // Lookup
4134 //
4135 if (PowerArchitecturePPC64 >= 10) {
4136 // Use xxpermx to do a lookup of each Base64 character in the
4137 // input vector and translate it to a 6-bit value + 0x80.
4138 // Characters which are not valid Base64 characters will result
4139 // in a zero in the corresponding byte.
4140 //
4141 // Note that due to align(32) call above, the xxpermx instructions do
4142 // not require align_prefix() calls, since the final xxpermx
4143 // prefix+opcode is at byte 24.
4144 __ xxpermx(xlate_a, table_32_47, table_48_63, input->to_vsr(), 1); // offset=4
4145 __ xxpermx(xlate_b, table_64_79, table_80_95, input->to_vsr(), 2); // offset=12
4146 __ xxlor(xlate_b, xlate_a, xlate_b); // offset=20
4147 __ xxpermx(xlate_a, table_96_111, table_112_127, input->to_vsr(), 3); // offset=24
4148 __ xxlor(input->to_vsr(), xlate_a, xlate_b);
4149 // Check for non-Base64 characters by comparing each byte to zero.
4150 __ vcmpequb_(non_match, input, vec_0s);
4151 } else {
4152 // Isolate the upper 4 bits of each character by shifting it right 4 bits
4153 __ vsrb(higher_nibble, input, vec_4s);
4154 // Isolate the lower 4 bits by masking
4155 __ xxland(lower_nibble, input->to_vsr(), vec_0xfs);
4156
4157 // Get the offset (the value to subtract from the byte) by using
4158 // a lookup table indexed by the upper 4 bits of the character
4159 __ xxperm(offsets->to_vsr(), offsetLUT, higher_nibble->to_vsr());
4160
4161 // Find out which elements are the special case character (isURL ? '_' : '/')
4162 __ vcmpequb(eq_special_case_char, input, vec_special_case_char);
4163
4164 // For each character in the input which is a special case
4165 // character, replace its offset with one that is special for that
4166 // character.
4167 __ xxsel(offsets->to_vsr(), offsets->to_vsr(), vec_special_case_offset, eq_special_case_char->to_vsr());
4168
4169 // Use the lower_nibble to select a mask "M" from the lookup table.
4170 __ xxperm(M, maskLUT, lower_nibble);
4171
4172 // "bit" is used to isolate which of the bits in M is relevant.
4173 __ xxperm(bit, bitposLUT, higher_nibble->to_vsr());
4174
4175 // Each element of non_match corresponds to one of the 16 input
4176 // characters. Those elements that become 0x00 after the xxland
4177 // instruction are invalid Base64 characters.
4178 __ xxland(non_match->to_vsr(), M, bit);
4179
4180 // Compare each element to zero
4181 //
4182 __ vcmpequb_(non_match, non_match, vec_0s);
4183 }
4184 // vcmpequb_ sets the EQ bit of CR6 if no elements compare equal.
4185 // Any element comparing equal to zero means there is an error in
4186 // that element. Note that the comparison result register
4187 // non_match is not referenced again. Only CR6-EQ matters.
// The branch is taken before anything is stored for this block, so the
// return value only counts the bytes written by earlier passes.
4188 __ bne_predict_not_taken(CR6, loop_exit);
4189
4190 // The Base64 characters had no errors, so add the offsets, which in
4191 // the case of Power10 is a constant vector of all 0x80's (see earlier
4192 // comment where the offsets register is loaded).
4193 __ vaddubm(input, input, offsets);
4194
4195 // Pack
4196 //
4197 // In the tables below, b0, b1, .. b15 are the bytes of decoded
4198 // binary data, the first line of each of the cells (except for
4199 // the constants) uses the bit-field nomenclature from the
4200 // above-linked paper, whereas the second line is more specific
4201 // about which exact bits are present, and is constructed using the
4202 // Power ISA 3.x document style, where:
4203 //
4204 // * The specifier after the colon depicts which bits are there.
4205 // * The bit numbering is big endian style (bit 0 is the most
4206 // significant).
4207 // * || is a concatenate operator.
4208 // * Strings of 0's are a field of zeros with the shown length, and
4209 // likewise for strings of 1's.
4210
4211 // Note that only e12..e15 are shown here because the shifting
4212 // and OR'ing pattern replicates for e8..e11, e4..7, and
4213 // e0..e3.
4214 //
4215 // +======================+=================+======================+======================+=============+
4216 // | Vector | e12 | e13 | e14 | e15 |
4217 // | Element | | | | |
4218 // +======================+=================+======================+======================+=============+
4219 // | after vaddubm | 00dddddd | 00cccccc | 00bbbbbb | 00aaaaaa |
4220 // | | 00||b2:2..7 | 00||b1:4..7||b2:0..1 | 00||b0:6..7||b1:0..3 | 00||b0:0..5 |
4221 // +----------------------+-----------------+----------------------+----------------------+-------------+
4222 // | pack_lshift | | << 6 | << 4 | << 2 |
4223 // +----------------------+-----------------+----------------------+----------------------+-------------+
4224 // | l after vslb | 00dddddd | cc000000 | bbbb0000 | aaaaaa00 |
4225 // | | 00||b2:2..7 | b2:0..1||000000 | b1:0..3||0000 | b0:0..5||00 |
4226 // +----------------------+-----------------+----------------------+----------------------+-------------+
4227 // | l after vslo | cc000000 | bbbb0000 | aaaaaa00 | 00000000 |
4228 // | | b2:0..1||000000 | b1:0..3||0000 | b0:0..5||00 | 00000000 |
4229 // +----------------------+-----------------+----------------------+----------------------+-------------+
4230 // | pack_rshift | | >> 2 | >> 4 | |
4231 // +----------------------+-----------------+----------------------+----------------------+-------------+
4232 // | r after vsrb | 00dddddd | 0000cccc | 000000bb | 00aaaaaa |
4233 // | | 00||b2:2..7 | 0000||b1:4..7 | 000000||b0:6..7 | 00||b0:0..5 |
4234 // +----------------------+-----------------+----------------------+----------------------+-------------+
4235 // | gathered after xxlor | ccdddddd | bbbbcccc | aaaaaabb | 00aaaaaa |
4236 // | | b2:0..7 | b1:0..7 | b0:0..7 | 00||b0:0..5 |
4237 // +======================+=================+======================+======================+=============+
4238 //
4239 // Note: there is a typo in the above-linked paper that shows the result of the gathering process is:
4240 // [ddddddcc|bbbbcccc|aaaaaabb]
4241 // but should be:
4242 // [ccdddddd|bbbbcccc|aaaaaabb]
4243 //
4244 __ vslb(l, input, pack_lshift);
4245 // vslo of vec_8s shifts the vector by one octet toward lower
4246 // element numbers, discarding element 0. This means it actually
4247 // shifts to the right (not left) according to the order of the
4248 // table above.
4249 __ vslo(l, l, vec_8s);
4250 __ vsrb(r, input, pack_rshift);
4251 __ xxlor(gathered->to_vsr(), l->to_vsr(), r->to_vsr());
4252
4253 // Final rearrangement of bytes into their correct positions.
4254 // +==============+======+======+======+======+=====+=====+====+====+====+====+=====+=====+=====+=====+=====+=====+
4255 // | Vector | e0 | e1 | e2 | e3 | e4 | e5 | e6 | e7 | e8 | e9 | e10 | e11 | e12 | e13 | e14 | e15 |
4256 // | Elements | | | | | | | | | | | | | | | | |
4257 // +==============+======+======+======+======+=====+=====+====+====+====+====+=====+=====+=====+=====+=====+=====+
4258 // | after xxlor | b11 | b10 | b9 | xx | b8 | b7 | b6 | xx | b5 | b4 | b3 | xx | b2 | b1 | b0 | xx |
4259 // +--------------+------+------+------+------+-----+-----+----+----+----+----+-----+-----+-----+-----+-----+-----+
4260 // | pack_permute | 0 | 0 | 0 | 0 | 0 | 1 | 2 | 4 | 5 | 6 | 8 | 9 | 10 | 12 | 13 | 14 |
4261 // +--------------+------+------+------+------+-----+-----+----+----+----+----+-----+-----+-----+-----+-----+-----+
4262 // | after xxperm | b11* | b11* | b11* | b11* | b11 | b10 | b9 | b8 | b7 | b6 | b5 | b4 | b3 | b2 | b1 | b0 |
4263 // +==============+======+======+======+======+=====+=====+====+====+====+====+=====+=====+=====+=====+=====+=====+
4264 // xx bytes are not used to form the final data
4265 // b0..b15 are the decoded and reassembled 8-bit bytes of data
4266 // b11 with asterisk is a "don't care", because these bytes will be
4267 // overwritten on the next iteration.
4268 __ xxperm(gathered->to_vsr(), gathered->to_vsr(), pack_permute);
4269
4270 // We cannot use a static displacement on the store, since it's a
4271 // multiple of 12, not 16. Note that this stxv instruction actually
4272 // writes 16 bytes, even though only the first 12 are valid data.
4273 __ stxv(gathered->to_vsr(), 0, out);
4274 __ addi(out, out, 12);
4275 __ addi(in, in, 16);
4276 __ bdnz(loop_start);
4277
4278 __ bind(loop_exit);
4279
4280 // Return the number of out bytes produced, which is (out - (d + dp)) == out - d - dp;
4281 __ sub(R3_RET, out, d);
4282 __ sub(R3_RET, R3_RET, dp);
4283
4284 __ blr();
4285
// Too little input for even one pass: report that nothing was decoded.
4286 __ bind(return_zero);
4287 __ li(R3_RET, 0);
4288 __ blr();
4289
4290 return start;
4291 }
4292
// Drop the character-offset macros used by the decodeBlock lookup tables so
// that these short names are not visible to the code that follows.
4293 #undef UC
4294 #undef LC
4295 #undef DIG
4296 #undef PLS
4297 #undef HYP
4298 #undef SLS
4299 #undef US
4300
4301 // This algorithm is based on the methods described in this paper:
4302 // http://0x80.pl/notesen/2016-01-12-sse-base64-encoding.html
4303 //
4304 // The details of this implementation vary from the paper due to the
4305 // difference in the ISA between SSE and AltiVec, especially in the
4306 // splitting bytes section where there is no need on Power to mask after
4307 // the shift because the shift is byte-wise rather than across an entire
4308 // 128-bit word.
4309 //
4310 // For the lookup part of the algorithm, different logic is used than
4311 // described in the paper because of the availability of vperm, which can
4312 // do a 64-byte table lookup in four instructions, while preserving the
4313 // branchless nature.
4314 //
4315 // Description of the ENCODE_CORE macro
4316 //
4317 // Expand first 12 x 8-bit data bytes into 16 x 6-bit bytes (upper 2
4318 // bits of each byte are zeros)
4319 //
4320 // (Note: e7..e0 are not shown because they follow the same pattern as
4321 // e8..e15)
4322 //
4323 // In the table below, b0, b1, .. b15 are the bytes of unencoded
4324 // binary data, the first line of each of the cells (except for
4325 // the constants) uses the bit-field nomenclature from the
4326 // above-linked paper, whereas the second line is more specific
4327 // about which exact bits are present, and is constructed using the
4328 // Power ISA 3.x document style, where:
4329 //
4330 // * The specifier after the colon depicts which bits are there.
4331 // * The bit numbering is big endian style (bit 0 is the most
4332 // significant).
4333 // * || is a concatenate operator.
4334 // * Strings of 0's are a field of zeros with the shown length, and
4335 // likewise for strings of 1's.
4336 //
4337 // +==========================+=============+======================+======================+=============+=============+======================+======================+=============+
4338 // | Vector | e8 | e9 | e10 | e11 | e12 | e13 | e14 | e15 |
4339 // | Element | | | | | | | | |
4340 // +==========================+=============+======================+======================+=============+=============+======================+======================+=============+
4341 // | after lxv | jjjjkkkk | iiiiiijj | gghhhhhh | ffffgggg | eeeeeeff | ccdddddd | bbbbcccc | aaaaaabb |
4342 // | | b7 | b6 | b5 | b4 | b3 | b2 | b1 | b0 |
4343 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
4344 // | xxperm indexes | 0 | 10 | 11 | 12 | 0 | 13 | 14 | 15 |
4345 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
4346 // | (1) after xxperm | | gghhhhhh | ffffgggg | eeeeeeff | | ccdddddd | bbbbcccc | aaaaaabb |
4347 // | | (b15) | b5 | b4 | b3 | (b15) | b2 | b1 | b0 |
4348 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
4349 // | rshift_amount | 0 | 6 | 4 | 2 | 0 | 6 | 4 | 2 |
4350 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
4351 // | after vsrb | | 000000gg | 0000ffff | 00eeeeee | | 000000cc | 0000bbbb | 00aaaaaa |
4352 // | | (b15) | 000000||b5:0..1 | 0000||b4:0..3 | 00||b3:0..5 | (b15) | 000000||b2:0..1 | 0000||b1:0..3 | 00||b0:0..5 |
4353 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
4354 // | rshift_mask | 00000000 | 000000||11 | 0000||1111 | 00||111111 | 00000000 | 000000||11 | 0000||1111 | 00||111111 |
4355 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
4356 // | rshift after vand | 00000000 | 000000gg | 0000ffff | 00eeeeee | 00000000 | 000000cc | 0000bbbb | 00aaaaaa |
4357 // | | 00000000 | 000000||b5:0..1 | 0000||b4:0..3 | 00||b3:0..5 | 00000000 | 000000||b2:0..1 | 0000||b1:0..3 | 00||b0:0..5 |
4358 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
4359 // | 1 octet lshift (1) | gghhhhhh | ffffgggg | eeeeeeff | | ccdddddd | bbbbcccc | aaaaaabb | 00000000 |
4360 // | | b5 | b4 | b3 | (b15) | b2 | b1 | b0 | 00000000 |
4361 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
4362 // | lshift_amount | 0 | 2 | 4 | 0 | 0 | 2 | 4 | 0 |
4363 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
4364 // | after vslb | gghhhhhh | ffgggg00 | eeff0000 | | ccdddddd | bbcccc00 | aabb0000 | 00000000 |
4365 // | | b5 | b4:2..7||00 | b3:4..7||0000 | (b15) | b2:0..7 | b1:2..7||00 | b0:4..7||0000 | 00000000 |
4366 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
4367 // | lshift_mask | 00||111111 | 00||1111||00 | 00||11||0000 | 00000000 | 00||111111 | 00||1111||00 | 00||11||0000 | 00000000 |
4368 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
4369 // | lshift after vand | 00hhhhhh | 00gggg00 | 00ff0000 | 00000000 | 00dddddd | 00cccc00 | 00bb0000 | 00000000 |
4370 // | | 00||b5:2..7 | 00||b4:4..7||00 | 00||b3:6..7||0000 | 00000000 | 00||b2:2..7 | 00||b1:4..7||00 | 00||b0:6..7||0000 | 00000000 |
4371 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
4372 // | after vor lshift, rshift | 00hhhhhh | 00gggggg | 00ffffff | 00eeeeee | 00dddddd | 00cccccc | 00bbbbbb | 00aaaaaa |
4373 // | | 00||b5:2..7 | 00||b4:4..7||b5:0..1 | 00||b3:6..7||b4:0..3 | 00||b3:0..5 | 00||b2:2..7 | 00||b1:4..7||b2:0..1 | 00||b0:6..7||b1:0..3 | 00||b0:0..5 |
4374 // +==========================+=============+======================+======================+=============+=============+======================+======================+=============+
4375 //
4376 // Expand the first 12 bytes into 16 bytes, leaving every 4th byte
4377 // blank for now.
4378 // __ xxperm(input->to_vsr(), input->to_vsr(), expand_permute);
4379 //
4380 // Generate two bit-shifted pieces - rshift and lshift - that will
4381 // later be OR'd together.
4382 //
4383 // First the right-shifted piece
4384 // __ vsrb(rshift, input, expand_rshift);
4385 // __ vand(rshift, rshift, expand_rshift_mask);
4386 //
4387 // Now the left-shifted piece, which is done by octet shifting
4388 // the input one byte to the left, then doing a variable shift,
4389 // followed by a mask operation.
4390 //
4391 // __ vslo(lshift, input, vec_8s);
4392 // __ vslb(lshift, lshift, expand_lshift);
4393 // __ vand(lshift, lshift, expand_lshift_mask);
4394 //
4395 // Combine the two pieces by OR'ing
4396 // __ vor(expanded, rshift, lshift);
4397 //
4398 // At this point, expanded is a vector containing a 6-bit value in each
4399 // byte. These values are used as indexes into a 64-byte lookup table that
4400 // is contained in four vector registers. The lookup operation is done
4401 // using vperm instructions with the same indexes for the lower 32 and
4402 // upper 32 bytes. To figure out which of the two looked-up bytes to use
4403 // at each location, all values in expanded are compared to 31. Using
4404 // vsel, values higher than 31 use the results from the upper 32 bytes of
4405 // the lookup operation, while values less than or equal to 31 use the
4406 // lower 32 bytes of the lookup operation.
4407 //
4408 // Note: it's tempting to use a xxpermx,xxpermx,vor sequence here on
4409 // Power10 (or later), but experiments doing so on Power10 yielded a slight
4410 // performance drop, perhaps due to the need for xxpermx instruction
4411 // prefixes.
4412
// ENCODE_CORE is a purely textual macro: it expands to eleven assembler
// calls that read the vector named `input` and leave the result in the
// vector named `expanded`. The expansion site must have all of the names
// used below in scope (input, rshift, lshift, expanded, gt_31,
// encoded_00_31, encoded_32_63, vec_8s, vec_31s, the expand_* constants
// and the four vec_base64_* tables).
// Do not add // comments inside the body: the line continuations would
// splice the following line into the comment.
4413 #define ENCODE_CORE \
4414 __ xxperm(input->to_vsr(), input->to_vsr(), expand_permute); \
4415 __ vsrb(rshift, input, expand_rshift); \
4416 __ vand(rshift, rshift, expand_rshift_mask); \
4417 __ vslo(lshift, input, vec_8s); \
4418 __ vslb(lshift, lshift, expand_lshift); \
4419 __ vand(lshift, lshift, expand_lshift_mask); \
4420 __ vor(expanded, rshift, lshift); \
4421 __ vperm(encoded_00_31, vec_base64_00_15, vec_base64_16_31, expanded); \
4422 __ vperm(encoded_32_63, vec_base64_32_47, vec_base64_48_63, expanded); \
4423 __ vcmpgtub(gt_31, expanded, vec_31s); \
4424 __ vsel(expanded, encoded_00_31, encoded_32_63, gt_31);
4425
4426 // Intrinsic function prototype in Base64.java:
4427 // private void encodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL) {
4428
4429 address generate_base64_encodeBlock() {
4430 __ align(CodeEntryAlignment);
4431 StubId stub_id = StubId::stubgen_base64_encodeBlock_id;
4432 StubCodeMark mark(this, stub_id);
4433 address start = __ function_entry();
4434
4435 typedef struct {
4436 unsigned char expand_permute_val[16];
4437 unsigned char expand_rshift_val[16];
4438 unsigned char expand_rshift_mask_val[16];
4439 unsigned char expand_lshift_val[16];
4440 unsigned char expand_lshift_mask_val[16];
4441 unsigned char base64_00_15_val[16];
4442 unsigned char base64_16_31_val[16];
4443 unsigned char base64_32_47_val[16];
4444 unsigned char base64_48_63_val[16];
4445 unsigned char base64_48_63_URL_val[16];
4446 } constant_block;
4447
4448 alignas(16) static const constant_block const_block = {
4449 .expand_permute_val = {
4450 ARRAY_TO_LXV_ORDER(
4451 0, 4, 5, 6,
4452 0, 7, 8, 9,
4453 0, 10, 11, 12,
4454 0, 13, 14, 15 ) },
4455
4456 .expand_rshift_val = {
4457 ARRAY_TO_LXV_ORDER(
4458 0, 6, 4, 2,
4459 0, 6, 4, 2,
4460 0, 6, 4, 2,
4461 0, 6, 4, 2 ) },
4462
4463 .expand_rshift_mask_val = {
4464 ARRAY_TO_LXV_ORDER(
4465 0b00000000, 0b00000011, 0b00001111, 0b00111111,
4466 0b00000000, 0b00000011, 0b00001111, 0b00111111,
4467 0b00000000, 0b00000011, 0b00001111, 0b00111111,
4468 0b00000000, 0b00000011, 0b00001111, 0b00111111 ) },
4469
4470 .expand_lshift_val = {
4471 ARRAY_TO_LXV_ORDER(
4472 0, 2, 4, 0,
4473 0, 2, 4, 0,
4474 0, 2, 4, 0,
4475 0, 2, 4, 0 ) },
4476
4477 .expand_lshift_mask_val = {
4478 ARRAY_TO_LXV_ORDER(
4479 0b00111111, 0b00111100, 0b00110000, 0b00000000,
4480 0b00111111, 0b00111100, 0b00110000, 0b00000000,
4481 0b00111111, 0b00111100, 0b00110000, 0b00000000,
4482 0b00111111, 0b00111100, 0b00110000, 0b00000000 ) },
4483
4484 .base64_00_15_val = {
4485 ARRAY_TO_LXV_ORDER(
4486 'A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P' ) },
4487
4488 .base64_16_31_val = {
4489 ARRAY_TO_LXV_ORDER(
4490 'Q','R','S','T','U','V','W','X','Y','Z','a','b','c','d','e','f' ) },
4491
4492 .base64_32_47_val = {
4493 ARRAY_TO_LXV_ORDER(
4494 'g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v' ) },
4495
4496 .base64_48_63_val = {
4497 ARRAY_TO_LXV_ORDER(
4498 'w','x','y','z','0','1','2','3','4','5','6','7','8','9','+','/' ) },
4499
4500 .base64_48_63_URL_val = {
4501 ARRAY_TO_LXV_ORDER(
4502 'w','x','y','z','0','1','2','3','4','5','6','7','8','9','-','_' ) }
4503 };
4504
4505 // Number of bytes to process in each pass through the main loop.
4506 // 12 of the 16 bytes from each lxv are encoded to 16 Base64 bytes.
4507 const unsigned block_size = 12;
4508
4509 // According to the ELF V2 ABI, registers r3-r12 are volatile and available for use without save/restore
4510 Register src = R3_ARG1; // source starting address of Base64 characters
4511 Register sp = R4_ARG2; // source starting position
4512 Register sl = R5_ARG3; // total source length of the Base64 characters to be processed
4513 Register dst = R6_ARG4; // destination address
4514 Register dp = R7_ARG5; // destination starting position
4515 Register isURL = R8_ARG6; // boolean, if non-zero indicates use of RFC 4648 base64url encoding
4516
4517 // Local variables
4518 Register const_ptr = R12; // used for loading constants (reuses isURL's register)
4519 Register tmp_reg = R9; // used for speeding up load_constant()
4520
4521 Register size = R9; // number of bytes to process (reuses tmp_reg's register)
4522 Register blocked_size = R10; // number of bytes to process a block at a time
4523 Register block_modulo = R12; // == block_size (reuse const_ptr)
4524 Register remaining = R12; // bytes remaining to process after the blocks are completed (reuse block_modulo's reg)
4525 Register in = R4; // current input (source) pointer (reuse sp's register)
4526 Register num_blocks = R11; // number of blocks to be processed by the loop
4527 Register out = R8; // current output (destination) pointer (reuse const_ptr's register)
4528 Register three = R9; // constant divisor (reuse size's register)
4529 Register bytes_to_write = R10; // number of bytes to write with the stxvl instr (reused blocked_size's register)
4530 Register tmp1 = R7; // temp register for lxvl length (reuse dp's register)
4531 Register modulo_chars = R7; // number of bytes written during the final write % 4 (reuse tmp1's register)
4532 Register pad_char = R6; // literal '=' (reuse dst's register)
4533
4534 // Volatile VSRS are 0..13, 32..51 (VR0..VR13)
4535 // VR Constants
4536 VectorRegister vec_8s = VR0;
4537 VectorRegister vec_31s = VR1;
4538 VectorRegister vec_base64_00_15 = VR2;
4539 VectorRegister vec_base64_16_31 = VR3;
4540 VectorRegister vec_base64_32_47 = VR4;
4541 VectorRegister vec_base64_48_63 = VR5;
4542 VectorRegister expand_rshift = VR6;
4543 VectorRegister expand_rshift_mask = VR7;
4544 VectorRegister expand_lshift = VR8;
4545 VectorRegister expand_lshift_mask = VR9;
4546
4547 // VR variables for expand
4548 VectorRegister input = VR10;
4549 VectorRegister rshift = VR11;
4550 VectorRegister lshift = VR12;
4551 VectorRegister expanded = VR13;
4552
4553 // VR variables for lookup
4554 VectorRegister encoded_00_31 = VR10; // (reuse input)
4555 VectorRegister encoded_32_63 = VR11; // (reuse rshift)
4556 VectorRegister gt_31 = VR12; // (reuse lshift)
4557
4558 // VSR Constants
4559 VectorSRegister expand_permute = VSR0;
4560
4561 Label not_URL, calculate_size, calculate_blocked_size, skip_loop;
4562 Label loop_start, le_16_to_write, no_pad, one_pad_char;
4563
4564 // The upper 32 bits of the non-pointer parameter registers are not
4565 // guaranteed to be zero, so mask off those upper bits.
4566 __ clrldi(sp, sp, 32);
4567 __ clrldi(sl, sl, 32);
4568 __ clrldi(dp, dp, 32);
4569 __ clrldi(isURL, isURL, 32);
4570
4571 // load up the constants
4572 __ load_const_optimized(const_ptr, (address)&const_block, tmp_reg);
4573 __ lxv(expand_permute, BLK_OFFSETOF(expand_permute_val), const_ptr);
4574 __ lxv(expand_rshift->to_vsr(), BLK_OFFSETOF(expand_rshift_val), const_ptr);
4575 __ lxv(expand_rshift_mask->to_vsr(), BLK_OFFSETOF(expand_rshift_mask_val), const_ptr);
4576 __ lxv(expand_lshift->to_vsr(), BLK_OFFSETOF(expand_lshift_val), const_ptr);
4577 __ lxv(expand_lshift_mask->to_vsr(), BLK_OFFSETOF(expand_lshift_mask_val), const_ptr);
4578 __ lxv(vec_base64_00_15->to_vsr(), BLK_OFFSETOF(base64_00_15_val), const_ptr);
4579 __ lxv(vec_base64_16_31->to_vsr(), BLK_OFFSETOF(base64_16_31_val), const_ptr);
4580 __ lxv(vec_base64_32_47->to_vsr(), BLK_OFFSETOF(base64_32_47_val), const_ptr);
4581
4582 // Splat the constants that can use xxspltib
4583 __ xxspltib(vec_8s->to_vsr(), 8);
4584 __ xxspltib(vec_31s->to_vsr(), 31);
4585
4586
4587 // Use a different translation lookup table depending on the
4588 // setting of isURL
4589 __ cmpdi(CR0, isURL, 0);
4590 __ beq(CR0, not_URL);
4591 __ lxv(vec_base64_48_63->to_vsr(), BLK_OFFSETOF(base64_48_63_URL_val), const_ptr);
4592 __ b(calculate_size);
4593
4594 __ bind(not_URL);
4595 __ lxv(vec_base64_48_63->to_vsr(), BLK_OFFSETOF(base64_48_63_val), const_ptr);
4596
4597 __ bind(calculate_size);
4598
4599 // size = sl - sp - 4 (*)
4600 // (*) Don't process the last four bytes in the main loop because
4601 // we don't want the lxv instruction to read past the end of the src
4602 // data, in case those four bytes are on the start of an unmapped or
4603 // otherwise inaccessible page.
4604 //
4605 __ sub(size, sl, sp);
4606 __ subi(size, size, 4);
4607 __ cmpdi(CR7, size, block_size);
4608 __ bgt(CR7, calculate_blocked_size);
4609 __ mr(remaining, size);
4610 // Add the 4 back into remaining again
4611 __ addi(remaining, remaining, 4);
4612 // make "in" point to the beginning of the source data: in = src + sp
4613 __ add(in, src, sp);
4614 // out = dst + dp
4615 __ add(out, dst, dp);
4616 __ b(skip_loop);
4617
4618 __ bind(calculate_blocked_size);
4619 __ li(block_modulo, block_size);
4620 // num_blocks = size / block_modulo
4621 __ divwu(num_blocks, size, block_modulo);
4622 // blocked_size = num_blocks * size
4623 __ mullw(blocked_size, num_blocks, block_modulo);
4624 // remaining = size - blocked_size
4625 __ sub(remaining, size, blocked_size);
4626 __ mtctr(num_blocks);
4627
4628 // Add the 4 back in to remaining again
4629 __ addi(remaining, remaining, 4);
4630
4631 // make "in" point to the beginning of the source data: in = src + sp
4632 __ add(in, src, sp);
4633
4634 // out = dst + dp
4635 __ add(out, dst, dp);
4636
4637 __ align(32);
4638 __ bind(loop_start);
4639
4640 __ lxv(input->to_vsr(), 0, in);
4641
4642 ENCODE_CORE
4643
4644 __ stxv(expanded->to_vsr(), 0, out);
4645 __ addi(in, in, 12);
4646 __ addi(out, out, 16);
4647 __ bdnz(loop_start);
4648
4649 __ bind(skip_loop);
4650
4651 // When there are less than 16 bytes left, we need to be careful not to
4652 // read beyond the end of the src buffer, which might be in an unmapped
4653 // page.
4654 // Load the remaining bytes using lxvl.
4655 __ rldicr(tmp1, remaining, 56, 7);
4656 __ lxvl(input->to_vsr(), in, tmp1);
4657
4658 ENCODE_CORE
4659
4660 // bytes_to_write = ((remaining * 4) + 2) / 3
4661 __ li(three, 3);
4662 __ rlwinm(bytes_to_write, remaining, 2, 0, 29); // remaining * 4
4663 __ addi(bytes_to_write, bytes_to_write, 2);
4664 __ divwu(bytes_to_write, bytes_to_write, three);
4665
4666 __ cmpwi(CR7, bytes_to_write, 16);
4667 __ ble_predict_taken(CR7, le_16_to_write);
4668 __ stxv(expanded->to_vsr(), 0, out);
4669
4670 // We've processed 12 of the 13-15 data bytes, so advance the pointers,
4671 // and do one final pass for the remaining 1-3 bytes.
4672 __ addi(in, in, 12);
4673 __ addi(out, out, 16);
4674 __ subi(remaining, remaining, 12);
4675 __ subi(bytes_to_write, bytes_to_write, 16);
4676 __ rldicr(tmp1, bytes_to_write, 56, 7);
4677 __ lxvl(input->to_vsr(), in, tmp1);
4678
4679 ENCODE_CORE
4680
4681 __ bind(le_16_to_write);
4682 // shift bytes_to_write into the upper 8 bits of t1 for use by stxvl
4683 __ rldicr(tmp1, bytes_to_write, 56, 7);
4684 __ stxvl(expanded->to_vsr(), out, tmp1);
4685 __ add(out, out, bytes_to_write);
4686
4687 __ li(pad_char, '=');
4688 __ rlwinm_(modulo_chars, bytes_to_write, 0, 30, 31); // bytes_to_write % 4, set CR0
4689 // Examples:
4690 // remaining bytes_to_write modulo_chars num pad chars
4691 // 0 0 0 0
4692 // 1 2 2 2
4693 // 2 3 3 1
4694 // 3 4 0 0
4695 // 4 6 2 2
4696 // 5 7 3 1
4697 // ...
4698 // 12 16 0 0
4699 // 13 18 2 2
4700 // 14 19 3 1
4701 // 15 20 0 0
4702 __ beq(CR0, no_pad);
4703 __ cmpwi(CR7, modulo_chars, 3);
4704 __ beq(CR7, one_pad_char);
4705
4706 // two pad chars
4707 __ stb(pad_char, out);
4708 __ addi(out, out, 1);
4709
4710 __ bind(one_pad_char);
4711 __ stb(pad_char, out);
4712
4713 __ bind(no_pad);
4714
4715 __ blr();
4716 return start;
4717 }
4718
4719 #endif // VM_LITTLE_ENDIAN
4720
4721 void generate_lookup_secondary_supers_table_stub() {
4722 StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_id;
4723 StubCodeMark mark(this, stub_id);
4724
4725 const Register
4726 r_super_klass = R4_ARG2,
4727 r_array_base = R3_ARG1,
4728 r_array_length = R7_ARG5,
4729 r_array_index = R6_ARG4,
4730 r_sub_klass = R5_ARG3,
4731 r_bitmap = R11_scratch1,
4732 result = R8_ARG6;
4733
4734 for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
4735 StubRoutines::_lookup_secondary_supers_table_stubs[slot] = __ pc();
4736 __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass,
4737 r_array_base, r_array_length, r_array_index,
4738 r_bitmap, result, slot);
4739 __ blr();
4740 }
4741 }
4742
4743 // Slow path implementation for UseSecondarySupersTable.
4744 address generate_lookup_secondary_supers_table_slow_path_stub() {
4745 StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_slow_path_id;
4746 StubCodeMark mark(this, stub_id);
4747
4748 address start = __ pc();
4749 const Register
4750 r_super_klass = R4_ARG2,
4751 r_array_base = R3_ARG1,
4752 temp1 = R7_ARG5,
4753 r_array_index = R6_ARG4,
4754 r_bitmap = R11_scratch1,
4755 result = R8_ARG6;
4756
4757 __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, result, temp1);
4758 __ blr();
4759
4760 return start;
4761 }
4762
// Generates one of the three continuation thaw stubs, selected by stub_id:
//   stubgen_cont_thaw_id              - Continuation::thaw_top
//   stubgen_cont_returnBarrier_id     - Continuation::thaw_return_barrier
//   stubgen_cont_returnBarrierExc_id  - Continuation::thaw_return_barrier_exception
//
// Emitted code, in order:
//   1. (return barrier only) save R3_RET/F1_RET in nonvolatile registers and
//      reload R1_SP from JavaThread::cont_entry_offset().
//   2. Call Continuation::prepare_thaw; a zero result branches to
//      SharedRuntime::throw_StackOverflowError_entry().
//   3. Resize the frame by the returned size plus frame::native_abi_reg_args_size.
//   4. Call Continuation::thaw_entry() and switch R1_SP to the value it returns.
//   5. Either return through the LR stored in the thawed top frame, or (exception
//      variant) look up the exception handler for that LR and return to it with
//      R3_ARG1 = exception oop and R4_ARG2 = exception pc.
//
// Returns the stub entry address, or nullptr when continuations are disabled.
address generate_cont_thaw(StubId stub_id) {
  if (!Continuations::enabled()) return nullptr;

  Continuation::thaw_kind kind;
  bool return_barrier;
  bool return_barrier_exception;

  // Derive the thaw kind and the two behaviour flags from the stub id.
  switch (stub_id) {
  case StubId::stubgen_cont_thaw_id:
    kind = Continuation::thaw_top;
    return_barrier = false;
    return_barrier_exception = false;
    break;
  case StubId::stubgen_cont_returnBarrier_id:
    kind = Continuation::thaw_return_barrier;
    return_barrier = true;
    return_barrier_exception = false;
    break;
  case StubId::stubgen_cont_returnBarrierExc_id:
    kind = Continuation::thaw_return_barrier_exception;
    return_barrier = true;
    return_barrier_exception = true;
    break;
  default:
    ShouldNotReachHere();
  }
  StubCodeMark mark(this, stub_id);

  Register tmp1 = R10_ARG8;
  Register tmp2 = R9_ARG7;
  // NOTE(review): tmp3 is not referenced anywhere else in this function.
  Register tmp3 = R8_ARG6;
  Register nvtmp = R15_esp; // nonvolatile tmp register
  FloatRegister nvftmp = F20; // nonvolatile fp tmp register

  address start = __ pc();

  if (kind == Continuation::thaw_top) {
    __ clobber_nonvolatile_registers(); // Except R16_thread and R29_TOC
  }

  if (return_barrier) {
    // preserve possible return value from a method returning to the return barrier
    __ mr(nvtmp, R3_RET); __ fmr(nvftmp, F1_RET);
    // Debug builds remember the callers_sp slot of the current frame so it can be
    // compared with the one found after R1_SP has been switched below.
    DEBUG_ONLY(__ ld_ptr(tmp1, _abi0(callers_sp), R1_SP);)
    // Switch R1_SP to the value stored at JavaThread::cont_entry_offset().
    __ ld_ptr(R1_SP, JavaThread::cont_entry_offset(), R16_thread);
#ifdef ASSERT
    __ ld_ptr(tmp2, _abi0(callers_sp), R1_SP);
    __ cmpd(CR0, tmp1, tmp2);
    __ asm_assert_eq(FILE_AND_LINE ": callers sp is corrupt");
#endif
  }
#ifdef ASSERT
  // In all variants R1_SP must equal the thread's continuation entry at this point.
  __ ld_ptr(tmp1, JavaThread::cont_entry_offset(), R16_thread);
  __ cmpd(CR0, R1_SP, tmp1);
  __ asm_assert_eq(FILE_AND_LINE ": incorrect R1_SP");
#endif

  // Second argument of prepare_thaw: 1 for the return barrier variants, else 0.
  __ li(R4_ARG2, return_barrier ? 1 : 0);
  __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), R16_thread, R4_ARG2);

#ifdef ASSERT
  // Re-check R1_SP after the leaf call.
  // NOTE(review): the DEBUG_ONLY wrappers look redundant inside this #ifdef ASSERT
  // region - confirm and consider dropping them.
  DEBUG_ONLY(__ ld_ptr(tmp1, JavaThread::cont_entry_offset(), R16_thread));
  DEBUG_ONLY(__ cmpd(CR0, R1_SP, tmp1));
  __ asm_assert_eq(FILE_AND_LINE ": incorrect R1_SP");
#endif

  // R3_RET contains the size of the frames to thaw, 0 if overflow or no more frames
  Label thaw_success;
  __ cmpdi(CR0, R3_RET, 0);
  __ bne(CR0, thaw_success);
  // Size 0: tail-jump (via CTR) to the StackOverflowError throwing entry.
  __ load_const_optimized(tmp1, (SharedRuntime::throw_StackOverflowError_entry()), R0);
  __ mtctr(tmp1); __ bctr();
  __ bind(thaw_success);

  // Turn the size into a negative, aligned frame delta for resize_frame.
  __ addi(R3_RET, R3_RET, frame::native_abi_reg_args_size); // Large abi required for C++ calls.
  __ neg(R3_RET, R3_RET);
  // align down resulting in a smaller negative offset
  __ clrrdi(R3_RET, R3_RET, exact_log2(frame::alignment_in_bytes));
  // NOTE(review): this debug-only copy of R1_SP into tmp1 does not appear to be
  // read again before tmp1 is reused - confirm whether it is still needed.
  DEBUG_ONLY(__ mr(tmp1, R1_SP);)
  __ resize_frame(R3_RET, tmp2); // make room for the thawed frames

  // Perform the thaw; the thaw kind is passed as the second argument.
  __ li(R4_ARG2, kind);
  __ call_VM_leaf(Continuation::thaw_entry(), R16_thread, R4_ARG2);
  __ mr(R1_SP, R3_RET); // R3_RET contains the SP of the thawed top frame

  if (return_barrier) {
    // we're now in the caller of the frame that returned to the barrier
    // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
    __ mr(R3_RET, nvtmp); __ fmr(F1_RET, nvftmp);
  } else {
    // we're now on the yield frame (which is in an address above us b/c the stack pointer has been pushed down)
    __ li(R3_RET, 0); // return 0 (success) from doYield
  }

  if (return_barrier_exception) {
    Register ex_pc = R17_tos; // nonvolatile register
    __ ld(ex_pc, _abi0(lr), R1_SP); // LR
    __ mr(nvtmp, R3_RET); // save return value containing the exception oop
    // The thawed top frame has got a frame::java_abi. This is not sufficient for the runtime call.
    __ push_frame_reg_args(0, tmp1);
    // Ask the runtime for the handler that belongs to the return address in ex_pc.
    __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), R16_thread, ex_pc);
    __ mtlr(R3_RET); // the exception handler
    __ pop_frame();
    // See OptoRuntime::generate_exception_blob for register arguments
    __ mr(R3_ARG1, nvtmp); // exception oop
    __ mr(R4_ARG2, ex_pc); // exception pc
  } else {
    // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
    __ ld(R0, _abi0(lr), R1_SP); // LR
    __ mtlr(R0);
  }
  // Branch to LR: either the thawed frame's return pc or the exception handler.
  __ blr();

  return start;
}
4876
4877 address generate_cont_thaw() {
4878 return generate_cont_thaw(StubId::stubgen_cont_thaw_id);
4879 }
4880
4881 // TODO: will probably need multiple return barriers depending on return type
4882
4883 address generate_cont_returnBarrier() {
4884 return generate_cont_thaw(StubId::stubgen_cont_returnBarrier_id);
4885 }
4886
4887 address generate_cont_returnBarrier_exception() {
4888 return generate_cont_thaw(StubId::stubgen_cont_returnBarrierExc_id);
4889 }
4890
4891 address generate_cont_preempt_stub() {
4892 if (!Continuations::enabled()) return nullptr;
4893 StubId stub_id = StubId::stubgen_cont_preempt_id;
4894 StubCodeMark mark(this, stub_id);
4895 address start = __ pc();
4896
4897 __ clobber_nonvolatile_registers(); // Except R16_thread and R29_TOC
4898
4899 __ reset_last_Java_frame(false /*check_last_java_sp*/);
4900
4901 // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap.
4902 __ ld_ptr(R1_SP, JavaThread::cont_entry_offset(), R16_thread);
4903
4904 Label preemption_cancelled;
4905 __ lbz(R11_scratch1, in_bytes(JavaThread::preemption_cancelled_offset()), R16_thread);
4906 __ cmpwi(CR0, R11_scratch1, 0);
4907 __ bne(CR0, preemption_cancelled);
4908
4909 // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount.
4910 SharedRuntime::continuation_enter_cleanup(_masm);
4911 __ pop_frame();
4912 __ restore_LR(R11_scratch1);
4913 __ blr();
4914
4915 // We acquired the monitor after freezing the frames so call thaw to continue execution.
4916 __ bind(preemption_cancelled);
4917 __ li(R11_scratch1, 0); // false
4918 __ stb(R11_scratch1, in_bytes(JavaThread::preemption_cancelled_offset()), R16_thread);
4919 int simm16_offs = __ load_const_optimized(R11_scratch1, ContinuationEntry::thaw_call_pc_address(), R0, true);
4920 __ ld(R11_scratch1, simm16_offs, R11_scratch1);
4921 __ mtctr(R11_scratch1);
4922 __ bctr();
4923
4924 return start;
4925 }
4926
4927 // exception handler for upcall stubs
4928 address generate_upcall_stub_exception_handler() {
4929 StubId stub_id = StubId::stubgen_upcall_stub_exception_handler_id;
4930 StubCodeMark mark(this, stub_id);
4931 address start = __ pc();
4932
4933 // Native caller has no idea how to handle exceptions,
4934 // so we just crash here. Up to callee to catch exceptions.
4935 __ verify_oop(R3_ARG1);
4936 __ load_const_optimized(R12_scratch2, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception), R0);
4937 __ call_c(R12_scratch2);
4938 __ should_not_reach_here();
4939
4940 return start;
4941 }
4942
4943 // load Method* target of MethodHandle
4944 // R3_ARG1 = jobject receiver
4945 // R19_method = result Method*
4946 address generate_upcall_stub_load_target() {
4947
4948 StubId stub_id = StubId::stubgen_upcall_stub_load_target_id;
4949 StubCodeMark mark(this, stub_id);
4950 address start = __ pc();
4951
4952 __ resolve_global_jobject(R3_ARG1, R22_tmp2, R23_tmp3, MacroAssembler::PRESERVATION_FRAME_LR_GP_FP_REGS);
4953 // Load target method from receiver
4954 __ load_heap_oop(R19_method, java_lang_invoke_MethodHandle::form_offset(), R3_ARG1,
4955 R22_tmp2, R23_tmp3, MacroAssembler::PRESERVATION_FRAME_LR_GP_FP_REGS, IS_NOT_NULL);
4956 __ load_heap_oop(R19_method, java_lang_invoke_LambdaForm::vmentry_offset(), R19_method,
4957 R22_tmp2, R23_tmp3, MacroAssembler::PRESERVATION_FRAME_LR_GP_FP_REGS, IS_NOT_NULL);
4958 __ load_heap_oop(R19_method, java_lang_invoke_MemberName::method_offset(), R19_method,
4959 R22_tmp2, R23_tmp3, MacroAssembler::PRESERVATION_FRAME_LR_GP_FP_REGS, IS_NOT_NULL);
4960 __ ld(R19_method, java_lang_invoke_ResolvedMethodName::vmtarget_offset(), R19_method);
4961 __ std(R19_method, in_bytes(JavaThread::callee_target_offset()), R16_thread); // just in case callee is deoptimized
4962
4963 __ blr();
4964
4965 return start;
4966 }
4967
4968 // Initialization
// Handles BlobId::stubgen_preuniverse_id (see the StubGenerator constructor).
// Intentionally generates nothing on this platform.
void generate_preuniverse_stubs() {
  // preuniverse stubs are not needed for ppc
}
4972
4973 void generate_initial_stubs() {
4974 // Generates all stubs and initializes the entry points
4975
4976 // Entry points that exist in all platforms.
4977 // Note: This is code that could be shared among different platforms - however the
4978 // benefit seems to be smaller than the disadvantage of having a
4979 // much more complicated generator structure. See also comment in
4980 // stubRoutines.hpp.
4981
4982 StubRoutines::_forward_exception_entry = generate_forward_exception();
4983 StubRoutines::_call_stub_entry = generate_call_stub(StubRoutines::_call_stub_return_address);
4984 StubRoutines::_catch_exception_entry = generate_catch_exception();
4985
4986 if (UnsafeMemoryAccess::_table == nullptr) {
4987 UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
4988 }
4989
4990 // CRC32 Intrinsics.
4991 if (UseCRC32Intrinsics) {
4992 StubRoutines::_updateBytesCRC32 = generate_CRC32_updateBytes(StubId::stubgen_updateBytesCRC32_id);
4993 }
4994
4995 // CRC32C Intrinsics.
4996 if (UseCRC32CIntrinsics) {
4997 StubRoutines::_updateBytesCRC32C = generate_CRC32_updateBytes(StubId::stubgen_updateBytesCRC32C_id);
4998 }
4999
5000 if (VM_Version::supports_float16()) {
5001 // For results consistency both intrinsics should be enabled.
5002 StubRoutines::_hf2f = generate_float16ToFloat();
5003 StubRoutines::_f2hf = generate_floatToFloat16();
5004 }
5005 }
5006
// Generates the continuation stubs (thaw, return barrier, return barrier with
// exception, preempt) and publishes their entry points in StubRoutines.
// Each generator used here returns nullptr when Continuations::enabled() is
// false, so the corresponding entries are nullptr in that case.
void generate_continuation_stubs() {
  // Continuation stubs:
  StubRoutines::_cont_thaw = generate_cont_thaw();
  StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
  StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
  StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub();
}
5014
// Generates the stubs of the final blob and publishes their entry points in
// StubRoutines: verify_oop support, the nmethod entry barrier, the arraycopy
// stubs, the secondary supers table lookup stubs (COMPILER2 builds only) and
// the upcall support stubs.
void generate_final_stubs() {
  // Generates all stubs and initializes the entry points

  // support for verify_oop (must happen after universe_init)
  StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();

  // nmethod entry barriers for concurrent class unloading
  StubRoutines::_method_entry_barrier = generate_method_entry_barrier();

  // arraycopy stubs used by compilers
  generate_arraycopy_stubs();

#ifdef COMPILER2
  if (UseSecondarySupersTable) {
    // The slow path stub is always generated when the table is in use.
    StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
    // The per-slot lookup stubs are only generated when the test is not inlined.
    if (!InlineSecondarySupersTest) {
      generate_lookup_secondary_supers_table_stub();
    }
  }
#endif // COMPILER2

  // Upcall support: uncaught exception handler and MethodHandle target loader.
  StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
  StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();
}
5039
// Generates the stubs of the compiler blob and publishes their entry points in
// StubRoutines. The whole body is compiled only for COMPILER2 builds; each
// stub is additionally guarded by its own flag or VM_Version capability check.
void generate_compiler_stubs() {
#ifdef COMPILER2

  // BigInteger intrinsics.
  if (UseMultiplyToLenIntrinsic) {
    StubRoutines::_multiplyToLen = generate_multiplyToLen();
  }
  if (UseSquareToLenIntrinsic) {
    StubRoutines::_squareToLen = generate_squareToLen();
  }
  if (UseMulAddIntrinsic) {
    StubRoutines::_mulAdd = generate_mulAdd();
  }
  // The Montgomery entries point directly at the SharedRuntime C++ functions
  // instead of at generated code.
  if (UseMontgomeryMultiplyIntrinsic) {
    StubRoutines::_montgomeryMultiply
      = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_multiply);
  }
  if (UseMontgomerySquareIntrinsic) {
    StubRoutines::_montgomerySquare
      = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square);
  }

  // data cache line writeback
  if (VM_Version::supports_data_cache_line_flush()) {
    StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
    StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
  }

  // GHASH intrinsic.
  if (UseGHASHIntrinsics) {
    StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
  }

  // AES single-block encrypt/decrypt intrinsics.
  if (UseAESIntrinsics) {
    StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
    StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
  }

  // SHA-2 intrinsics: the same generator emits the single-block and the
  // multi-block (MB) variant, selected by the stub id.
  if (UseSHA256Intrinsics) {
    StubRoutines::_sha256_implCompress = generate_sha256_implCompress(StubId::stubgen_sha256_implCompress_id);
    StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(StubId::stubgen_sha256_implCompressMB_id);
  }
  if (UseSHA512Intrinsics) {
    StubRoutines::_sha512_implCompress = generate_sha512_implCompress(StubId::stubgen_sha512_implCompress_id);
    StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(StubId::stubgen_sha512_implCompressMB_id);
  }

#ifdef VM_LITTLE_ENDIAN
  // Currently supported on PPC64LE only
  if (UseBASE64Intrinsics) {
    StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
    StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
  }
#endif
#endif // COMPILER2
}
5094
5095 public:
5096 StubGenerator(CodeBuffer* code, BlobId blob_id, AOTStubData *stub_data) : StubCodeGenerator(code, blob_id, stub_data) {
5097 switch(blob_id) {
5098 case BlobId::stubgen_preuniverse_id:
5099 generate_preuniverse_stubs();
5100 break;
5101 case BlobId::stubgen_initial_id:
5102 generate_initial_stubs();
5103 break;
5104 case BlobId::stubgen_continuation_id:
5105 generate_continuation_stubs();
5106 break;
5107 case BlobId::stubgen_compiler_id:
5108 generate_compiler_stubs();
5109 break;
5110 case BlobId::stubgen_final_id:
5111 generate_final_stubs();
5112 break;
5113 default:
5114 fatal("unexpected blob id: %s", StubInfo::name(blob_id));
5115 break;
5116 };
5117 }
5118 };
5119
5120 void StubGenerator_generate(CodeBuffer* code, BlobId blob_id, AOTStubData *stub_data) {
5121 StubGenerator g(code, blob_id, stub_data);
5122 }
5123