1 /*
2 * Copyright (c) 1997, 2026, Oracle and/or its affiliates. All rights reserved.
3 * Copyright (c) 2012, 2026 SAP SE. All rights reserved.
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This code is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 only, as
8 * published by the Free Software Foundation.
9 *
10 * This code is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13 * version 2 for more details (a copy is included in the LICENSE file that
14 * accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License version
17 * 2 along with this work; if not, write to the Free Software Foundation,
18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19 *
20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21 * or visit www.oracle.com if you need additional information or have any
22 * questions.
23 *
24 */
25
26 #include "asm/macroAssembler.inline.hpp"
27 #include "compiler/oopMap.hpp"
28 #include "gc/shared/barrierSet.hpp"
29 #include "gc/shared/barrierSetAssembler.hpp"
30 #include "gc/shared/barrierSetNMethod.hpp"
31 #include "interpreter/interpreter.hpp"
32 #include "nativeInst_ppc.hpp"
33 #include "oops/instanceOop.hpp"
34 #include "oops/method.hpp"
35 #include "oops/objArrayKlass.hpp"
36 #include "oops/oop.inline.hpp"
37 #include "prims/methodHandles.hpp"
38 #include "prims/upcallLinker.hpp"
39 #include "runtime/continuation.hpp"
40 #include "runtime/continuationEntry.inline.hpp"
41 #include "runtime/frame.inline.hpp"
42 #include "runtime/handles.inline.hpp"
43 #include "runtime/javaThread.hpp"
44 #include "runtime/sharedRuntime.hpp"
45 #include "runtime/stubCodeGenerator.hpp"
46 #include "runtime/stubRoutines.hpp"
47 #include "runtime/vm_version.hpp"
48 #include "utilities/align.hpp"
49 #include "utilities/powerOfTwo.hpp"
50 #if INCLUDE_ZGC
51 #include "gc/z/zBarrierSetAssembler.hpp"
52 #endif
53
54 // Declaration and definition of StubGenerator (no .hpp file).
55 // For a more detailed description of the stub routine structure
56 // see the comment in stubRoutines.hpp.
57
// Shorthand used by all emitter code below: "__ insn(...)" expands to
// "_masm->insn(...)", so a variable or member named _masm must be in scope.
#define __ _masm->

// BLOCK_COMMENT(str) expands to nothing in PRODUCT builds; in all other builds
// it forwards the string to the assembler's block_comment().
#ifdef PRODUCT
#define BLOCK_COMMENT(str) // nothing
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

// STUB_ENTRY(name) yields the entry of stub StubRoutines::name. With ABI_ELFv2
// the stored value is used directly; otherwise the stored value is treated as a
// FunctionDescriptor* and its entry() is used.
#if defined(ABI_ELFv2)
#define STUB_ENTRY(name) StubRoutines::name
#else
#define STUB_ENTRY(name) ((FunctionDescriptor*)StubRoutines::name)->entry()
#endif
71
72 class StubGenerator: public StubCodeGenerator {
73 private:
74
75 // Call stubs are used to call Java from C
76 //
77 // Arguments:
78 //
79 // R3 - call wrapper address : address
80 // R4 - result : intptr_t*
81 // R5 - result type : BasicType
82 // R6 - method : Method
83 // R7 - frame mgr entry point : address
84 // R8 - parameter block : intptr_t*
85 // R9 - parameter count in words : int
86 // R10 - thread : Thread*
87 //
address generate_call_stub(address& return_address) {
  // Emits the call stub: set up a new C frame (the ENTRY_FRAME), copy the Java
  // arguments into it, call the template interpreter or native entry passed in
  // R7, then store the result through the result pointer passed in R4.
  //
  // return_address (out): receives the value returned by call_stub(), which
  //                       this code treats as the runtime value of LR for the
  //                       emitted call.
  // Returns the stub entry obtained from function_entry().

  StubId stub_id = StubId::stubgen_call_stub_id;
  StubCodeMark mark(this, stub_id);

  address start = __ function_entry();

  // Size of the area used to spill the non-volatile registers; the same
  // arguments are passed to save/restore_nonvolatile_registers below.
  int save_nonvolatile_registers_size = __ save_nonvolatile_registers_size(true, SuperwordUseVSX);

  // Sanity checks: every component that contributes to the frame size must be
  // a multiple of the 16 byte stack alignment.
  STATIC_ASSERT(StackAlignmentInBytes == 16);
  assert((sizeof(frame::native_abi_minframe) % 16) == 0, "unaligned");
  assert((sizeof(frame::native_abi_reg_args) % 16) == 0, "unaligned");
  assert((save_nonvolatile_registers_size % 16) == 0, "unaligned");
  assert((sizeof(frame::parent_ijava_frame_abi) % 16) == 0, "unaligned");
  assert((sizeof(frame::entry_frame_locals) % 16) == 0, "unaligned");

  // Incoming C arguments (see the argument list above this function).
  Register r_arg_call_wrapper_addr = R3;
  Register r_arg_result_addr = R4;
  Register r_arg_result_type = R5;
  Register r_arg_method = R6;
  Register r_arg_entry = R7;
  Register r_arg_argument_addr = R8;
  Register r_arg_argument_count = R9;
  Register r_arg_thread = R10;

  Register r_entryframe_fp = R2; // volatile
  Register r_argument_size = R11_scratch1; // volatile
  Register r_top_of_arguments_addr = R21_tmp1;

  {
    // Stack on entry to call_stub:
    //
    // F1 [C_FRAME]
    // ...
    Register r_frame_size = R12_scratch2; // volatile
    Label arguments_copied;

    // Save LR/CR to caller's C_FRAME.
    __ save_LR_CR(R0);

    // Keep copy of our frame pointer (caller's SP).
    __ mr(r_entryframe_fp, R1_SP);

    // calculate frame size
    STATIC_ASSERT(Interpreter::logStackElementSize == 3);

    // space for arguments aligned up: ((arg_count + 1) * 8) &~ 15
    // (the rldicr shifts left by 3 and clears the low 4 bits in one step)
    __ addi(r_frame_size, r_arg_argument_count, 1);
    __ rldicr(r_frame_size, r_frame_size, 3, 63 - 4);

    // this is the pure space for arguments (excluding alignment padding)
    __ sldi(r_argument_size, r_arg_argument_count, 3);

    // Add the fixed parts: non-volatile save area, entry frame locals and the
    // top interpreter frame ABI area.
    __ addi(r_frame_size, r_frame_size,
            save_nonvolatile_registers_size + frame::entry_frame_locals_size + frame::top_ijava_frame_abi_size);

    // push ENTRY_FRAME
    __ push_frame(r_frame_size, R0);

    // Save non-volatile registers to ENTRY_FRAME, addressed relative to the
    // saved frame pointer (below the entry frame locals).
    __ save_nonvolatile_registers(r_entryframe_fp, -(frame::entry_frame_locals_size + save_nonvolatile_registers_size),
                                  true, SuperwordUseVSX);

    BLOCK_COMMENT("Push ENTRY_FRAME including arguments");
    // Push ENTRY_FRAME including arguments:
    //
    // F0 [TOP_IJAVA_FRAME_ABI]
    // alignment (optional)
    // [outgoing Java arguments]
    // [non-volatiles]
    // [ENTRY_FRAME_LOCALS]
    // F1 [C_FRAME]
    // ...

    // initialize call_stub locals (step 1)
    __ std(r_arg_call_wrapper_addr, _entry_frame_locals_neg(call_wrapper_address), r_entryframe_fp);
    __ std(r_arg_result_addr, _entry_frame_locals_neg(result_address), r_entryframe_fp);
    __ std(r_arg_result_type, _entry_frame_locals_neg(result_type), r_entryframe_fp);
    // we will save arguments_tos_address later

    BLOCK_COMMENT("Copy Java arguments");
    // copy Java arguments

    // Calculate top_of_arguments_addr which will be R17_tos (not prepushed) later:
    // frame pointer - (non-volatiles + entry frame locals) - argument bytes.
    __ addi(r_top_of_arguments_addr, r_entryframe_fp,
            -(save_nonvolatile_registers_size + frame::entry_frame_locals_size));
    __ sub(r_top_of_arguments_addr, r_top_of_arguments_addr, r_argument_size);

    // any arguments to copy?
    __ cmpdi(CR0, r_arg_argument_count, 0);
    __ beq(CR0, arguments_copied);

    // prepare loop and copy arguments in reverse order
    {
      Register r_argument_addr = R22_tmp2;
      Register r_argumentcopy_addr = R23_tmp3;
      // init CTR with arg_argument_count
      __ mtctr(r_arg_argument_count);

      // let r_argumentcopy_addr point to the lowest outgoing Java argument slot
      __ mr(r_argumentcopy_addr, r_top_of_arguments_addr);

      // let r_argument_addr point to last incoming java argument
      __ add(r_argument_addr, r_arg_argument_addr, r_argument_size);
      __ addi(r_argument_addr, r_argument_addr, -BytesPerWord);

      // now loop while CTR > 0 and copy arguments:
      // the source pointer walks down, the destination pointer walks up.
      {
        Label next_argument;
        __ bind(next_argument);

        __ ld(R0, 0, r_argument_addr);
        // argument_addr--;
        __ addi(r_argument_addr, r_argument_addr, -BytesPerWord);
        __ std(R0, 0, r_argumentcopy_addr);
        // argumentcopy_addr++;
        __ addi(r_argumentcopy_addr, r_argumentcopy_addr, BytesPerWord);

        __ bdnz(next_argument);
      }
    }

    // Arguments copied, continue.
    __ bind(arguments_copied);
  }

  {
    BLOCK_COMMENT("Call template interpreter or native entry.");
    assert_different_registers(r_arg_entry, r_top_of_arguments_addr, r_arg_method, r_arg_thread);

    // Register state on entry to template interpreter / native entry:
    //
    // tos - intptr_t* sender tos (prepushed) Lesp = (SP) + copied_arguments_offset - 8
    // R19_method - Method
    // R16_thread - JavaThread*

    // Tos must point to last argument - element_size.
    const Register tos = R15_esp;

    __ addi(tos, r_top_of_arguments_addr, -Interpreter::stackElementSize);

    // initialize call_stub locals (step 2)
    // now save tos as arguments_tos_address
    __ std(tos, _entry_frame_locals_neg(arguments_tos_address), r_entryframe_fp);

    // load argument registers for call
    __ mr(R19_method, r_arg_method);
    __ mr(R16_thread, r_arg_thread);
    // Generation-time checks that the moves above did not overwrite a source
    // register that is still needed.
    assert(tos != r_arg_method, "trashed r_arg_method");
    assert(tos != r_arg_thread && R19_method != r_arg_thread, "trashed r_arg_thread");

    // Load the dispatch table address for TosState 0 into R25_templateTableBase.
    __ load_const_optimized(R25_templateTableBase, (address)Interpreter::dispatch_table((TosState)0), R0);
    // Stack on entry to template interpreter / native entry:
    //
    // F0 [TOP_IJAVA_FRAME_ABI]
    // alignment (optional)
    // [outgoing Java arguments]
    // [non-volatiles]
    // [ENTRY_FRAME_LOCALS]
    // F1 [C_FRAME]
    // ...
    //

    // global toc register
    __ load_const_optimized(R29_TOC, MacroAssembler::global_toc(), R0);
    // Remember the sender SP so the interpreter can pop c2i arguments off of the
    // stack when called via a c2i.

    // Pass initial_caller_sp to framemanager.
    __ mr(R21_sender_SP, R1_SP);

    // Do a light-weight C-call here, r_arg_entry holds the address
    // of the interpreter entry point (template interpreter or native entry)
    // and save runtime-value of LR in return_address.
    assert(r_arg_entry != tos && r_arg_entry != R19_method && r_arg_entry != R16_thread,
           "trashed r_arg_entry");
    return_address = __ call_stub(r_arg_entry);
  }

  {
    BLOCK_COMMENT("Returned from template interpreter or native entry.");
    // Now pop frame, process result, and return to caller.

    // Stack on exit from template interpreter / native entry:
    //
    // F0 [ABI]
    // ...
    // [non-volatiles]
    // [ENTRY_FRAME_LOCALS]
    // F1 [C_FRAME]
    // ...
    //
    // Just pop the topmost frame ...
    //

    Label ret_is_object;
    Label ret_is_long;
    Label ret_is_float;
    Label ret_is_double;

    Register r_lr = R11_scratch1;
    Register r_cr = R12_scratch2;

    // Reload some volatile registers which we've spilled before the call
    // to template interpreter / native entry.
    // Access all locals via frame pointer, because we know nothing about
    // the topmost frame's size.
    __ ld(r_entryframe_fp, _abi0(callers_sp), R1_SP); // restore after call
    assert_different_registers(r_entryframe_fp, R3_RET, r_arg_result_addr, r_arg_result_type, r_cr, r_lr);
    __ ld(r_arg_result_addr, _entry_frame_locals_neg(result_address), r_entryframe_fp);
    __ ld(r_arg_result_type, _entry_frame_locals_neg(result_type), r_entryframe_fp);
    __ ld(r_cr, _abi0(cr), r_entryframe_fp);
    __ ld(r_lr, _abi0(lr), r_entryframe_fp);
    __ mtcr(r_cr); // restore CR
    __ mtlr(r_lr); // restore LR

    // Store result depending on type. Everything that is not
    // T_OBJECT, T_LONG, T_FLOAT, or T_DOUBLE is treated as T_INT.
    // Using volatile CRs. The compares are issued after CR has been restored
    // and use CR1/CR5/CR6/CR7, so the CR0 clobber by pop_cont_fastpath() below
    // does not affect the branches taken later.
    __ cmpwi(CR1, r_arg_result_type, T_OBJECT);
    __ cmpwi(CR5, r_arg_result_type, T_LONG);
    __ cmpwi(CR6, r_arg_result_type, T_FLOAT);
    __ cmpwi(CR7, r_arg_result_type, T_DOUBLE);

    __ pop_cont_fastpath(); // kills CR0, uses R16_thread

    // restore non-volatile registers (same area and flags as the save above)
    __ restore_nonvolatile_registers(r_entryframe_fp, -(frame::entry_frame_locals_size + save_nonvolatile_registers_size),
                                     true, SuperwordUseVSX);

    // pop frame
    __ mr(R1_SP, r_entryframe_fp);

    // Stack on exit from call_stub:
    //
    // 0 [C_FRAME]
    // ...
    //
    // no call_stub frames left.

    __ beq(CR1, ret_is_object);
    __ beq(CR5, ret_is_long);
    __ beq(CR6, ret_is_float);
    __ beq(CR7, ret_is_double);

    // default: store the low 32 bits of the result
    __ stw(R3_RET, 0, r_arg_result_addr);
    __ blr(); // return to caller

    // case T_OBJECT:
    __ bind(ret_is_object);
    if (InlineTypeReturnedAsFields) {
      // Check for scalarized return value
      // NOTE(review): the test below is "R3_RET == 0" while the code after it
      // masks off bit 0 of R3_RET; confirm whether a test of bit 0 was intended.
      __ cmpdi(CR0, R3_RET, 0);
      __ beq(CR0, ret_is_long);
      // Load pack handler address
      __ untested("call stub InlineTypeReturnedAsFields"); // TODO: check return registers usage
      __ andi(R12_scratch2, R3_RET, -2); // clear bit 0
      __ ld(R12_scratch2, InlineKlass::adr_members_offset(), R12_scratch2);
      __ ld(R12_scratch2, InlineKlass::pack_handler_jobject_offset(), R12_scratch2);
      __ mtctr(R12_scratch2);
      __ bctr(); // tail call
    } // else fall through

    // case T_LONG: store the full 64 bit result
    __ bind(ret_is_long);
    __ std(R3_RET, 0, r_arg_result_addr);
    __ blr(); // return to caller

    // case T_FLOAT:
    __ bind(ret_is_float);
    __ stfs(F1_RET, 0, r_arg_result_addr);
    __ blr(); // return to caller

    // case T_DOUBLE:
    __ bind(ret_is_double);
    __ stfd(F1_RET, 0, r_arg_result_addr);
    __ blr(); // return to caller
  }

  return start;
}
374
375 // Return point for a Java call if there's an exception thrown in
376 // Java code. The exception is caught and transformed into a
377 // pending exception stored in JavaThread that can be tested from
378 // within the VM.
379 //
380 address generate_catch_exception() {
381 StubId stub_id = StubId::stubgen_catch_exception_id;
382 StubCodeMark mark(this, stub_id);
383
384 address start = __ pc();
385
386 // Registers alive
387 //
388 // R16_thread
389 // R3_ARG1 - address of pending exception
390 // R4_ARG2 - return address in call stub
391
392 const Register exception_file = R21_tmp1;
393 const Register exception_line = R22_tmp2;
394
395 __ load_const(exception_file, (void*)__FILE__);
396 __ load_const(exception_line, (void*)__LINE__);
397
398 __ std(R3_ARG1, in_bytes(JavaThread::pending_exception_offset()), R16_thread);
399 // store into `char *'
400 __ std(exception_file, in_bytes(JavaThread::exception_file_offset()), R16_thread);
401 // store into `int'
402 __ stw(exception_line, in_bytes(JavaThread::exception_line_offset()), R16_thread);
403
404 // complete return to VM
405 assert(StubRoutines::_call_stub_return_address != nullptr, "must have been generated before");
406
407 __ mtlr(R4_ARG2);
408 // continue in call stub
409 __ blr();
410
411 return start;
412 }
413
414 // Continuation point for runtime calls returning with a pending
415 // exception. The pending exception check happened in the runtime
416 // or native call stub. The pending exception in Thread is
417 // converted into a Java-level exception.
418 //
419 // Read:
420 //
421 // LR: The pc the runtime library callee wants to return to.
422 // Since the exception occurred in the callee, the return pc
423 // from the point of view of Java is the exception pc.
424 // thread: Needed for method handles.
425 //
426 // Invalidate:
427 //
428 // volatile registers (except below).
429 //
430 // Update:
431 //
// R3_ARG1: exception oop
// R4_ARG2: exception pc
433 //
434 // (LR is unchanged and is live out).
435 //
436 address generate_forward_exception() {
437 StubId stub_id = StubId::stubgen_forward_exception_id;
438 StubCodeMark mark(this, stub_id);
439 address start = __ pc();
440
441 if (VerifyOops) {
442 // Get pending exception oop.
443 __ ld(R3_ARG1,
444 in_bytes(Thread::pending_exception_offset()),
445 R16_thread);
446 // Make sure that this code is only executed if there is a pending exception.
447 {
448 Label L;
449 __ cmpdi(CR0, R3_ARG1, 0);
450 __ bne(CR0, L);
451 __ stop("StubRoutines::forward exception: no pending exception (1)");
452 __ bind(L);
453 }
454 __ verify_oop(R3_ARG1, "StubRoutines::forward exception: not an oop");
455 }
456
457 // Save LR/CR and copy exception pc (LR) into R4_ARG2.
458 __ save_LR(R4_ARG2);
459 __ push_frame_reg_args(0, R0);
460 // Find exception handler.
461 __ call_VM_leaf(CAST_FROM_FN_PTR(address,
462 SharedRuntime::exception_handler_for_return_address),
463 R16_thread,
464 R4_ARG2);
465 // Copy handler's address.
466 __ mtctr(R3_RET);
467 __ pop_frame();
468 __ restore_LR(R0);
469
470 // Set up the arguments for the exception handler:
471 // - R3_ARG1: exception oop
472 // - R4_ARG2: exception pc.
473
474 // Load pending exception oop.
475 __ ld(R3_ARG1,
476 in_bytes(Thread::pending_exception_offset()),
477 R16_thread);
478
479 // The exception pc is the return address in the caller.
480 // Must load it into R4_ARG2.
481 __ mflr(R4_ARG2);
482
483 #ifdef ASSERT
484 // Make sure exception is set.
485 {
486 Label L;
487 __ cmpdi(CR0, R3_ARG1, 0);
488 __ bne(CR0, L);
489 __ stop("StubRoutines::forward exception: no pending exception (2)");
490 __ bind(L);
491 }
492 #endif
493
494 // Clear the pending exception.
495 __ li(R0, 0);
496 __ std(R0,
497 in_bytes(Thread::pending_exception_offset()),
498 R16_thread);
499 // Jump to exception handler.
500 __ bctr();
501
502 return start;
503 }
504
// Re-establish the emitter shorthand: "__ insn(...)" expands to "_masm->insn(...)".
#undef __
#define __ _masm->
507
508 #if !defined(PRODUCT)
509 // Wrapper which calls oopDesc::is_oop_or_null()
510 // Only called by MacroAssembler::verify_oop
511 static void verify_oop_helper(const char* message, oopDesc* o) {
512 if (!oopDesc::is_oop_or_null(o)) {
513 fatal("%s. oop: " PTR_FORMAT, message, p2i(o));
514 }
515 ++ StubRoutines::_verify_oop_count;
516 }
517 #endif
518
519 // Return address of code to be called from code generated by
520 // MacroAssembler::verify_oop.
521 //
522 // Don't generate, rather use C++ code.
523 address generate_verify_oop() {
524 // this is actually a `FunctionDescriptor*'.
525 address start = nullptr;
526
527 #if !defined(PRODUCT)
528 start = CAST_FROM_FN_PTR(address, verify_oop_helper);
529 #endif
530
531 return start;
532 }
533
534 // Computes the Galois/Counter Mode (GCM) product and reduction.
535 //
536 // This function performs polynomial multiplication of the subkey H with
537 // the current GHASH state using vectorized polynomial multiplication (`vpmsumd`).
538 // The subkey H is divided into lower, middle, and higher halves.
539 // The multiplication results are reduced using `vConstC2` to stay within GF(2^128).
540 // The final computed value is stored back into `vState`.
// Emits one GHASH multiply-and-reduce step.
//
// Register roles as used by the instruction sequence below:
//   read only:   vLowerH, vHigherH, vSwappedH, vConstC2, vZero
//   read/write:  vH     (input block; overwritten with vH ^ vState)
//                vState (running state; overwritten with the new state)
//   scratch:     vLowProduct, vMidProduct, vHighProduct, vReducedLow,
//                vTmp8, vTmp9, vCombinedResult
// The parameter must be named _masm because the `__` shorthand expands to
// `_masm->`.
static void computeGCMProduct(MacroAssembler* _masm,
                              VectorRegister vLowerH, VectorRegister vH, VectorRegister vHigherH,
                              VectorRegister vConstC2, VectorRegister vZero, VectorRegister vState,
                              VectorRegister vLowProduct, VectorRegister vMidProduct, VectorRegister vHighProduct,
                              VectorRegister vReducedLow, VectorRegister vTmp8, VectorRegister vTmp9,
                              VectorRegister vCombinedResult, VectorRegister vSwappedH) {
  __ vxor(vH, vH, vState);                                  // Fold the current state into the data block
  __ vpmsumd(vLowProduct, vLowerH, vH);                     // L : Lower Half of subkey H
  __ vpmsumd(vMidProduct, vSwappedH, vH);                   // M : Combined halves of subkey H
  __ vpmsumd(vHighProduct, vHigherH, vH);                   // H : Higher Half of subkey H
  __ vpmsumd(vReducedLow, vLowProduct, vConstC2);           // Reduction
  __ vsldoi(vTmp8, vMidProduct, vZero, 8);                  // mL : Extract the lower 64 bits of M
  __ vsldoi(vTmp9, vZero, vMidProduct, 8);                  // mH : Extract the higher 64 bits of M
  __ vxor(vLowProduct, vLowProduct, vTmp8);                 // LL + mL : Partial result for lower half
  __ vxor(vHighProduct, vHighProduct, vTmp9);               // HH + mH : Partial result for upper half
  __ vsldoi(vLowProduct, vLowProduct, vLowProduct, 8);      // Swap
  __ vxor(vLowProduct, vLowProduct, vReducedLow);           // Apply the first reduction term
  __ vsldoi(vCombinedResult, vLowProduct, vLowProduct, 8);  // Swap
  __ vpmsumd(vLowProduct, vLowProduct, vConstC2);           // Reduction using constant
  __ vxor(vCombinedResult, vCombinedResult, vHighProduct);  // Combine reduced Low & High products
  __ vxor(vState, vLowProduct, vCombinedResult);            // New state
}
563
564 // Generate stub for ghash process blocks.
565 //
566 // Arguments for generated stub:
567 // state: R3_ARG1 (long[] state)
568 // subkeyH: R4_ARG2 (long[] subH)
569 // data: R5_ARG3 (byte[] data)
570 // blocks: R6_ARG4 (number of 16-byte blocks to process)
571 //
572 // The polynomials are processed in bit-reflected order for efficiency reasons.
573 // This optimization leverages the structure of the Galois field arithmetic
574 // to minimize the number of bit manipulations required during multiplication.
575 // For an explanation of how this works, refer :
576 // Vinodh Gopal, Erdinc Ozturk, Wajdi Feghali, Jim Guilford, Gil Wolrich,
// Martin Dixon. "Optimized Galois-Counter-Mode Implementation on Intel(R)
578 // Architecture Processor"
579 // http://web.archive.org/web/20130609111954/http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/communications-ia-galois-counter-mode-paper.pdf
580 //
581 //
address generate_ghash_processBlocks() {
  // Emits the GHASH block-processing stub: loads the state and subkey H,
  // derives the H variants needed by computeGCMProduct(), folds `blocks`
  // 16-byte blocks of `data` into the state (separate loops for 16-byte
  // aligned and unaligned data) and stores the state back.
  // Returns the stub entry obtained from function_entry().
  StubCodeMark mark(this, "StubRoutines", "ghash");
  address start = __ function_entry();

  // Registers for parameters
  Register state = R3_ARG1; // long[] state
  Register subkeyH = R4_ARG2; // long[] subH
  Register data = R5_ARG3; // byte[] data
  Register blocks = R6_ARG4; // number of 16-byte blocks, must not be zero (asserted below)
  Register temp1 = R8;
  // Vector Registers
  VectorRegister vZero = VR0;
  VectorRegister vH = VR1;
  VectorRegister vLowerH = VR2;
  VectorRegister vHigherH = VR3;
  VectorRegister vLowProduct = VR4;
  VectorRegister vMidProduct = VR5;
  VectorRegister vHighProduct = VR6;
  VectorRegister vReducedLow = VR7;
  VectorRegister vTmp8 = VR8;
  VectorRegister vTmp9 = VR9;
  VectorRegister vTmp10 = VR10;
  VectorRegister vSwappedH = VR11;
  VectorRegister vTmp12 = VR12;
  VectorRegister loadOrder = VR13;
  VectorRegister vHigh = VR14;
  VectorRegister vLow = VR15;
  VectorRegister vState = VR16;
  VectorRegister vPerm = VR17;
  VectorRegister vCombinedResult = VR18;
  VectorRegister vConstC2 = VR19;

  // temp1 = 0xc2 << 56, the basis of the reduction constant.
  __ li(temp1, 0xc2);
  __ sldi(temp1, temp1, 56);
  __ vspltisb(vZero, 0);
  __ mtvrd(vConstC2, temp1);
  __ lxvd2x(vH->to_vsr(), subkeyH);
  __ lxvd2x(vState->to_vsr(), state);
  // Operations to obtain lower and higher bytes of subkey H.
  __ vspltisb(vReducedLow, 1);
  __ vspltisb(vTmp10, 7);
  __ vsldoi(vTmp8, vZero, vReducedLow, 1); // 0x1
  __ vor(vTmp8, vConstC2, vTmp8); // 0xC2...1
  __ vsplt(vTmp9, 0, vH); // MSB of H
  __ vsl(vH, vH, vReducedLow); // Carry = H<<7
  __ vsrab(vTmp9, vTmp9, vTmp10);
  __ vand(vTmp9, vTmp9, vTmp8); // Carry
  __ vxor(vTmp10, vH, vTmp9);
  __ vsldoi(vConstC2, vZero, vConstC2, 8);
  __ vsldoi(vSwappedH, vTmp10, vTmp10, 8); // swap Lower and Higher Halves of subkey H
  __ vsldoi(vLowerH, vZero, vSwappedH, 8); // H.L
  __ vsldoi(vHigherH, vSwappedH, vZero, 8); // H.H
#ifdef ASSERT
  __ cmpwi(CR0, blocks, 0); // Compare 'blocks' (R6_ARG4) with zero
  __ asm_assert_ne("blocks should NOT be zero");
#endif
  // Zero-extend the 32 bit block count and use it as the loop counter.
  __ clrldi(blocks, blocks, 32);
  __ mtctr(blocks);
  // temp1 still holds 0xc2 << 56 here, so its low four bits are zero.
  // NOTE(review): presumably this yields the identity permute control, which
  // the little-endian path below turns into a byte reversal - confirm.
  __ lvsl(loadOrder, temp1);
#ifdef VM_LITTLE_ENDIAN
  __ vspltisb(vTmp12, 0xf);
  __ vxor(loadOrder, loadOrder, vTmp12);
#define LE_swap_bytes(x) __ vec_perm(x, x, x, loadOrder)
#else
#define LE_swap_bytes(x)
#endif

  // This code performs Karatsuba multiplication in Galois fields to compute the GHASH operation.
  //
  // The Karatsuba method breaks the multiplication of two 128-bit numbers into smaller parts,
  // performing three 128-bit multiplications and combining the results efficiently.
  //
  // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
  // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
  //
  // Inputs:
  // - vH: The data vector (state), containing both B0 (lower half) and B1 (higher half).
  // - vLowerH: Lower half of the subkey H (A0).
  // - vHigherH: Higher half of the subkey H (A1).
  // - vConstC2: Constant used for reduction (for final processing).
  //
  // References:
  // Shay Gueron, Michael E. Kounavis.
  // "Intel(R) Carry-Less Multiplication Instruction and its Usage for Computing the GCM Mode"
  // https://web.archive.org/web/20110609115824/https://software.intel.com/file/24918
  //
  Label L_aligned_loop, L_store, L_unaligned_loop, L_initialize_unaligned_loop;
  // Pick the loop by the low four address bits of data.
  __ andi(temp1, data, 15);
  __ cmpwi(CR0, temp1, 0);
  __ bne(CR0, L_initialize_unaligned_loop);

  // Aligned data: temp1 is zero here and serves as the index register.
  __ bind(L_aligned_loop);
  __ lvx(vH, temp1, data);
  LE_swap_bytes(vH);
  computeGCMProduct(_masm, vLowerH, vH, vHigherH, vConstC2, vZero, vState,
                    vLowProduct, vMidProduct, vHighProduct, vReducedLow, vTmp8, vTmp9, vCombinedResult, vSwappedH);
  __ addi(data, data, 16);
  __ bdnz(L_aligned_loop);
  __ b(L_store);

  // Unaligned data: each block is assembled with vec_perm from two
  // consecutive loads (vHigh, vLow); vLow is carried over as the next vHigh.
  __ bind(L_initialize_unaligned_loop);
  __ li(temp1, 0);
  __ lvsl(vPerm, temp1, data);
  __ lvx(vHigh, temp1, data);
#ifdef VM_LITTLE_ENDIAN
  __ vspltisb(vTmp12, -1);
  __ vxor(vPerm, vPerm, vTmp12);
#endif
  __ bind(L_unaligned_loop);
  __ addi(data, data, 16);
  __ lvx(vLow, temp1, data);
  __ vec_perm(vH, vHigh, vLow, vPerm);
  computeGCMProduct(_masm, vLowerH, vH, vHigherH, vConstC2, vZero, vState,
                    vLowProduct, vMidProduct, vHighProduct, vReducedLow, vTmp8, vTmp9, vCombinedResult, vSwappedH);
  __ vmr(vHigh, vLow);
  __ bdnz(L_unaligned_loop);

  // Write the updated state back.
  __ bind(L_store);
  __ stxvd2x(vState->to_vsr(), state);
  __ blr();

  return start;
}
705 // -XX:+OptimizeFill : convert fill/copy loops into intrinsic
706 //
707 // The code is implemented(ported from sparc) as we believe it benefits JVM98, however
708 // tracing(-XX:+TraceOptimizeFill) shows the intrinsic replacement doesn't happen at all!
709 //
710 // Source code in function is_range_check_if() shows that OptimizeFill relaxed the condition
711 // for turning on loop predication optimization, and hence the behavior of "array range check"
712 // and "loop invariant check" could be influenced, which potentially boosted JVM98.
713 //
// Generate stub for byte, short or int fill; the element type and the
// "aligned" flag are selected by stub_id. If "aligned" is true, the
// "to" address is assumed to be heapword aligned.
716 //
717 // Arguments for generated stub:
718 // to: R3_ARG1
719 // value: R4_ARG2
720 // count: R5_ARG3 treated as signed
721 //
722 address generate_fill(StubId stub_id) {
723 BasicType t;
724 bool aligned;
725
726 switch (stub_id) {
727 case StubId::stubgen_jbyte_fill_id:
728 t = T_BYTE;
729 aligned = false;
730 break;
731 case StubId::stubgen_jshort_fill_id:
732 t = T_SHORT;
733 aligned = false;
734 break;
735 case StubId::stubgen_jint_fill_id:
736 t = T_INT;
737 aligned = false;
738 break;
739 case StubId::stubgen_arrayof_jbyte_fill_id:
740 t = T_BYTE;
741 aligned = true;
742 break;
743 case StubId::stubgen_arrayof_jshort_fill_id:
744 t = T_SHORT;
745 aligned = true;
746 break;
747 case StubId::stubgen_arrayof_jint_fill_id:
748 t = T_INT;
749 aligned = true;
750 break;
751 default:
752 ShouldNotReachHere();
753 }
754
755 StubCodeMark mark(this, stub_id);
756 address start = __ function_entry();
757
758 const Register to = R3_ARG1; // source array address
759 const Register value = R4_ARG2; // fill value
760 const Register count = R5_ARG3; // elements count
761 const Register temp = R6_ARG4; // temp register
762
763 //assert_clean_int(count, O3); // Make sure 'count' is clean int.
764
765 Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte;
766 Label L_fill_2_bytes, L_fill_4_bytes, L_fill_elements, L_fill_32_bytes;
767
768 int shift = -1;
769 switch (t) {
770 case T_BYTE:
771 shift = 2;
772 // Clone bytes (zero extend not needed because store instructions below ignore high order bytes).
773 __ rldimi(value, value, 8, 48); // 8 bit -> 16 bit
774 __ cmpdi(CR0, count, 2<<shift); // Short arrays (< 8 bytes) fill by element.
775 __ blt(CR0, L_fill_elements);
776 __ rldimi(value, value, 16, 32); // 16 bit -> 32 bit
777 break;
778 case T_SHORT:
779 shift = 1;
780 // Clone bytes (zero extend not needed because store instructions below ignore high order bytes).
781 __ rldimi(value, value, 16, 32); // 16 bit -> 32 bit
782 __ cmpdi(CR0, count, 2<<shift); // Short arrays (< 8 bytes) fill by element.
783 __ blt(CR0, L_fill_elements);
784 break;
785 case T_INT:
786 shift = 0;
787 __ cmpdi(CR0, count, 2<<shift); // Short arrays (< 8 bytes) fill by element.
788 __ blt(CR0, L_fill_4_bytes);
789 break;
790 default: ShouldNotReachHere();
791 }
792
793 if (!aligned && (t == T_BYTE || t == T_SHORT)) {
794 // Align source address at 4 bytes address boundary.
795 if (t == T_BYTE) {
796 // One byte misalignment happens only for byte arrays.
797 __ andi_(temp, to, 1);
798 __ beq(CR0, L_skip_align1);
799 __ stb(value, 0, to);
800 __ addi(to, to, 1);
801 __ addi(count, count, -1);
802 __ bind(L_skip_align1);
803 }
804 // Two bytes misalignment happens only for byte and short (char) arrays.
805 __ andi_(temp, to, 2);
806 __ beq(CR0, L_skip_align2);
807 __ sth(value, 0, to);
808 __ addi(to, to, 2);
809 __ addi(count, count, -(1 << (shift - 1)));
810 __ bind(L_skip_align2);
811 }
812
813 if (!aligned) {
814 // Align to 8 bytes, we know we are 4 byte aligned to start.
815 __ andi_(temp, to, 7);
816 __ beq(CR0, L_fill_32_bytes);
817 __ stw(value, 0, to);
818 __ addi(to, to, 4);
819 __ addi(count, count, -(1 << shift));
820 __ bind(L_fill_32_bytes);
821 }
822
823 __ li(temp, 8<<shift); // Prepare for 32 byte loop.
824 // Clone bytes int->long as above.
825 __ rldimi(value, value, 32, 0); // 32 bit -> 64 bit
826
827 Label L_check_fill_8_bytes;
828 // Fill 32-byte chunks.
829 __ subf_(count, temp, count);
830 __ blt(CR0, L_check_fill_8_bytes);
831
832 Label L_fill_32_bytes_loop;
833 __ align(32);
834 __ bind(L_fill_32_bytes_loop);
835
836 __ std(value, 0, to);
837 __ std(value, 8, to);
838 __ subf_(count, temp, count); // Update count.
839 __ std(value, 16, to);
840 __ std(value, 24, to);
841
842 __ addi(to, to, 32);
843 __ bge(CR0, L_fill_32_bytes_loop);
844
845 __ bind(L_check_fill_8_bytes);
846 __ add_(count, temp, count);
847 __ beq(CR0, L_exit);
848 __ addic_(count, count, -(2 << shift));
849 __ blt(CR0, L_fill_4_bytes);
850
851 //
852 // Length is too short, just fill 8 bytes at a time.
853 //
854 Label L_fill_8_bytes_loop;
855 __ bind(L_fill_8_bytes_loop);
856 __ std(value, 0, to);
857 __ addic_(count, count, -(2 << shift));
858 __ addi(to, to, 8);
859 __ bge(CR0, L_fill_8_bytes_loop);
860
861 // Fill trailing 4 bytes.
862 __ bind(L_fill_4_bytes);
863 __ andi_(temp, count, 1<<shift);
864 __ beq(CR0, L_fill_2_bytes);
865
866 __ stw(value, 0, to);
867 if (t == T_BYTE || t == T_SHORT) {
868 __ addi(to, to, 4);
869 // Fill trailing 2 bytes.
870 __ bind(L_fill_2_bytes);
871 __ andi_(temp, count, 1<<(shift-1));
872 __ beq(CR0, L_fill_byte);
873 __ sth(value, 0, to);
874 if (t == T_BYTE) {
875 __ addi(to, to, 2);
876 // Fill trailing byte.
877 __ bind(L_fill_byte);
878 __ andi_(count, count, 1);
879 __ beq(CR0, L_exit);
880 __ stb(value, 0, to);
881 } else {
882 __ bind(L_fill_byte);
883 }
884 } else {
885 __ bind(L_fill_2_bytes);
886 }
887 __ bind(L_exit);
888 __ blr();
889
890 // Handle copies less than 8 bytes. Int is handled elsewhere.
891 if (t == T_BYTE) {
892 __ bind(L_fill_elements);
893 Label L_fill_2, L_fill_4;
894 __ andi_(temp, count, 1);
895 __ beq(CR0, L_fill_2);
896 __ stb(value, 0, to);
897 __ addi(to, to, 1);
898 __ bind(L_fill_2);
899 __ andi_(temp, count, 2);
900 __ beq(CR0, L_fill_4);
901 __ stb(value, 0, to);
902 __ stb(value, 0, to);
903 __ addi(to, to, 2);
904 __ bind(L_fill_4);
905 __ andi_(temp, count, 4);
906 __ beq(CR0, L_exit);
907 __ stb(value, 0, to);
908 __ stb(value, 1, to);
909 __ stb(value, 2, to);
910 __ stb(value, 3, to);
911 __ blr();
912 }
913
914 if (t == T_SHORT) {
915 Label L_fill_2;
916 __ bind(L_fill_elements);
917 __ andi_(temp, count, 1);
918 __ beq(CR0, L_fill_2);
919 __ sth(value, 0, to);
920 __ addi(to, to, 2);
921 __ bind(L_fill_2);
922 __ andi_(temp, count, 2);
923 __ beq(CR0, L_exit);
924 __ sth(value, 0, to);
925 __ sth(value, 2, to);
926 __ blr();
927 }
928 return start;
929 }
930
931 inline void assert_positive_int(Register count) {
932 #ifdef ASSERT
933 __ srdi_(R0, count, 31);
934 __ asm_assert_eq("missing zero extend");
935 #endif
936 }
937
938 // Generate overlap test for array copy stubs.
939 //
940 // Input:
941 // R3_ARG1 - from
942 // R4_ARG2 - to
943 // R5_ARG3 - element count
944 //
945 void array_overlap_test(address no_overlap_target, int log2_elem_size) {
946 Register tmp1 = R6_ARG4;
947 Register tmp2 = R7_ARG5;
948
949 assert_positive_int(R5_ARG3);
950
951 __ subf(tmp1, R3_ARG1, R4_ARG2); // distance in bytes
952 __ sldi(tmp2, R5_ARG3, log2_elem_size); // size in bytes
953 __ cmpld(CR0, R3_ARG1, R4_ARG2); // Use unsigned comparison!
954 __ cmpld(CR1, tmp1, tmp2);
955 __ crnand(CR0, Assembler::less, CR1, Assembler::less);
956 // Overlaps if Src before dst and distance smaller than size.
957 // Branch to forward copy routine otherwise (within range of 32kB).
958 __ bc(Assembler::bcondCRbiIs1, Assembler::bi0(CR0, Assembler::less), no_overlap_target);
959
960 // need to copy backwards
961 }
962
963 // This is common errorexit stub for UnsafeMemoryAccess.
964 address generate_unsafecopy_common_error_exit() {
965 address start_pc = __ pc();
966 Register tmp1 = R6_ARG4;
967 // probably copy stub would have changed value reset it.
968 if (VM_Version::has_mfdscr()) {
969 __ load_const_optimized(tmp1, VM_Version::_dscr_val);
970 __ mtdscr(tmp1);
971 }
972 __ li(R3_RET, 0); // return 0
973 __ blr();
974 return start_pc;
975 }
976
977 // The guideline in the implementations of generate_disjoint_xxx_copy
978 // (xxx=byte,short,int,long,oop) is to copy as many elements as possible with
979 // single instructions, but to avoid alignment interrupts (see subsequent
980 // comment). Furthermore, we try to minimize misaligned access, even
981 // though they cause no alignment interrupt.
982 //
983 // In Big-Endian mode, the PowerPC architecture requires implementations to
984 // handle automatically misaligned integer halfword and word accesses,
985 // word-aligned integer doubleword accesses, and word-aligned floating-point
986 // accesses. Other accesses may or may not generate an Alignment interrupt
987 // depending on the implementation.
988 // Alignment interrupt handling may require on the order of hundreds of cycles,
989 // so every effort should be made to avoid misaligned memory values.
990 //
991 //
992 // Generate stub for disjoint byte copy. If "aligned" is true, the
993 // "from" and "to" addresses are assumed to be heapword aligned.
994 //
995 // Arguments for generated stub:
996 // from: R3_ARG1
997 // to: R4_ARG2
998 // count: R5_ARG3 treated as signed
999 //
1000 address generate_disjoint_byte_copy(StubId stub_id) {
1001 bool aligned;
1002 switch (stub_id) {
1003 case StubId::stubgen_jbyte_disjoint_arraycopy_id:
1004 aligned = false;
1005 break;
1006 case StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id:
1007 aligned = true;
1008 break;
1009 default:
1010 ShouldNotReachHere();
1011 }
1012
1013 StubCodeMark mark(this, stub_id);
1014 address start = __ function_entry();
1015 assert_positive_int(R5_ARG3);
1016
1017 Register tmp1 = R6_ARG4;
1018 Register tmp2 = R7_ARG5;
1019 Register tmp3 = R8_ARG6;
1020 Register tmp4 = R9_ARG7;
1021
1022 VectorSRegister tmp_vsr1 = VSR1;
1023 VectorSRegister tmp_vsr2 = VSR2;
1024
1025 Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9, l_10;
1026 {
1027 // UnsafeMemoryAccess page error: continue at UnsafeMemoryAccess common_error_exit
1028 UnsafeMemoryAccessMark umam(this, !aligned, false);
1029
1030 // Don't try anything fancy if arrays don't have many elements.
1031 __ li(tmp3, 0);
1032 __ cmpwi(CR0, R5_ARG3, 17);
1033 __ ble(CR0, l_6); // copy 4 at a time
1034
1035 if (!aligned) {
1036 __ xorr(tmp1, R3_ARG1, R4_ARG2);
1037 __ andi_(tmp1, tmp1, 3);
1038 __ bne(CR0, l_6); // If arrays don't have the same alignment mod 4, do 4 element copy.
1039
1040 // Copy elements if necessary to align to 4 bytes.
1041 __ neg(tmp1, R3_ARG1); // Compute distance to alignment boundary.
1042 __ andi_(tmp1, tmp1, 3);
1043 __ beq(CR0, l_2);
1044
1045 __ subf(R5_ARG3, tmp1, R5_ARG3);
1046 __ bind(l_9);
1047 __ lbz(tmp2, 0, R3_ARG1);
1048 __ addic_(tmp1, tmp1, -1);
1049 __ stb(tmp2, 0, R4_ARG2);
1050 __ addi(R3_ARG1, R3_ARG1, 1);
1051 __ addi(R4_ARG2, R4_ARG2, 1);
1052 __ bne(CR0, l_9);
1053
1054 __ bind(l_2);
1055 }
1056
1057 // copy 8 elements at a time
1058 __ xorr(tmp2, R3_ARG1, R4_ARG2); // skip if src & dest have differing alignment mod 8
1059 __ andi_(tmp1, tmp2, 7);
1060 __ bne(CR0, l_7); // not same alignment -> to or from is aligned -> copy 8
1061
1062 // copy a 2-element word if necessary to align to 8 bytes
1063 __ andi_(R0, R3_ARG1, 7);
1064 __ beq(CR0, l_7);
1065
1066 __ lwzx(tmp2, R3_ARG1, tmp3);
1067 __ addi(R5_ARG3, R5_ARG3, -4);
1068 __ stwx(tmp2, R4_ARG2, tmp3);
1069 { // FasterArrayCopy
1070 __ addi(R3_ARG1, R3_ARG1, 4);
1071 __ addi(R4_ARG2, R4_ARG2, 4);
1072 }
1073 __ bind(l_7);
1074
1075 { // FasterArrayCopy
1076 __ cmpwi(CR0, R5_ARG3, 31);
1077 __ ble(CR0, l_6); // copy 2 at a time if less than 32 elements remain
1078
1079 __ srdi(tmp1, R5_ARG3, 5);
1080 __ andi_(R5_ARG3, R5_ARG3, 31);
1081 __ mtctr(tmp1);
1082
1083
1084 // Prefetch the data into the L2 cache.
1085 __ dcbt(R3_ARG1, 0);
1086
1087 // If supported set DSCR pre-fetch to deepest.
1088 if (VM_Version::has_mfdscr()) {
1089 __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
1090 __ mtdscr(tmp2);
1091 }
1092 __ li(tmp1, 16);
1093
1094 // Backbranch target aligned to 32-byte. Not 16-byte align as
1095 // loop contains < 8 instructions that fit inside a single
1096 // i-cache sector.
1097 __ align(32);
1098
1099 __ bind(l_10);
1100 // Use loop with VSX load/store instructions to
1101 // copy 32 elements a time.
1102 __ lxvd2x(tmp_vsr1, R3_ARG1); // Load src
1103 __ stxvd2x(tmp_vsr1, R4_ARG2); // Store to dst
1104 __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1); // Load src + 16
1105 __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
1106 __ addi(R3_ARG1, R3_ARG1, 32); // Update src+=32
1107 __ addi(R4_ARG2, R4_ARG2, 32); // Update dsc+=32
1108 __ bdnz(l_10); // Dec CTR and loop if not zero.
1109
1110 // Restore DSCR pre-fetch value.
1111 if (VM_Version::has_mfdscr()) {
1112 __ load_const_optimized(tmp2, VM_Version::_dscr_val);
1113 __ mtdscr(tmp2);
1114 }
1115
1116 } // FasterArrayCopy
1117
1118 __ bind(l_6);
1119
1120 // copy 4 elements at a time
1121 __ cmpwi(CR0, R5_ARG3, 4);
1122 __ blt(CR0, l_1);
1123 __ srdi(tmp1, R5_ARG3, 2);
1124 __ mtctr(tmp1); // is > 0
1125 __ andi_(R5_ARG3, R5_ARG3, 3);
1126
1127 { // FasterArrayCopy
1128 __ addi(R3_ARG1, R3_ARG1, -4);
1129 __ addi(R4_ARG2, R4_ARG2, -4);
1130 __ bind(l_3);
1131 __ lwzu(tmp2, 4, R3_ARG1);
1132 __ stwu(tmp2, 4, R4_ARG2);
1133 __ bdnz(l_3);
1134 __ addi(R3_ARG1, R3_ARG1, 4);
1135 __ addi(R4_ARG2, R4_ARG2, 4);
1136 }
1137
1138 // do single element copy
1139 __ bind(l_1);
1140 __ cmpwi(CR0, R5_ARG3, 0);
1141 __ beq(CR0, l_4);
1142
1143 { // FasterArrayCopy
1144 __ mtctr(R5_ARG3);
1145 __ addi(R3_ARG1, R3_ARG1, -1);
1146 __ addi(R4_ARG2, R4_ARG2, -1);
1147
1148 __ bind(l_5);
1149 __ lbzu(tmp2, 1, R3_ARG1);
1150 __ stbu(tmp2, 1, R4_ARG2);
1151 __ bdnz(l_5);
1152 }
1153 }
1154
1155 __ bind(l_4);
1156 __ li(R3_RET, 0); // return 0
1157 __ blr();
1158
1159 return start;
1160 }
1161
1162 // Generate stub for conjoint byte copy. If "aligned" is true, the
1163 // "from" and "to" addresses are assumed to be heapword aligned.
1164 //
1165 // Arguments for generated stub:
1166 // from: R3_ARG1
1167 // to: R4_ARG2
1168 // count: R5_ARG3 treated as signed
1169 //
1170 address generate_conjoint_byte_copy(StubId stub_id) {
1171 bool aligned;
1172 switch (stub_id) {
1173 case StubId::stubgen_jbyte_arraycopy_id:
1174 aligned = false;
1175 break;
1176 case StubId::stubgen_arrayof_jbyte_arraycopy_id:
1177 aligned = true;
1178 break;
1179 default:
1180 ShouldNotReachHere();
1181 }
1182
1183 StubCodeMark mark(this, stub_id);
1184 address start = __ function_entry();
1185 assert_positive_int(R5_ARG3);
1186
1187 Register tmp1 = R6_ARG4;
1188 Register tmp2 = R7_ARG5;
1189 Register tmp3 = R8_ARG6;
1190
1191 address nooverlap_target = aligned ?
1192 STUB_ENTRY(arrayof_jbyte_disjoint_arraycopy()) :
1193 STUB_ENTRY(jbyte_disjoint_arraycopy());
1194
1195 array_overlap_test(nooverlap_target, 0);
1196 // Do reverse copy. We assume the case of actual overlap is rare enough
1197 // that we don't have to optimize it.
1198 Label l_1, l_2;
1199 {
1200 // UnsafeMemoryAccess page error: continue at UnsafeMemoryAccess common_error_exit
1201 UnsafeMemoryAccessMark umam(this, !aligned, false);
1202 __ b(l_2);
1203 __ bind(l_1);
1204 __ stbx(tmp1, R4_ARG2, R5_ARG3);
1205 __ bind(l_2);
1206 __ addic_(R5_ARG3, R5_ARG3, -1);
1207 __ lbzx(tmp1, R3_ARG1, R5_ARG3);
1208 __ bge(CR0, l_1);
1209 }
1210 __ li(R3_RET, 0); // return 0
1211 __ blr();
1212
1213 return start;
1214 }
1215
1216 // Generate stub for disjoint short copy. If "aligned" is true, the
1217 // "from" and "to" addresses are assumed to be heapword aligned.
1218 //
1219 // Arguments for generated stub:
1220 // from: R3_ARG1
1221 // to: R4_ARG2
1222 // elm.count: R5_ARG3 treated as signed
1223 //
1224 // Strategy for aligned==true:
1225 //
1226 // If length <= 9:
1227 // 1. copy 2 elements at a time (l_6)
1228 // 2. copy last element if original element count was odd (l_1)
1229 //
1230 // If length > 9:
1231 // 1. copy 4 elements at a time until less than 4 elements are left (l_7)
1232 // 2. copy 2 elements at a time until less than 2 elements are left (l_6)
1233 // 3. copy last element if one was left in step 2. (l_1)
1234 //
1235 //
1236 // Strategy for aligned==false:
1237 //
1238 // If length <= 9: same as aligned==true case, but NOTE: load/stores
1239 // can be unaligned (see comment below)
1240 //
1241 // If length > 9:
1242 // 1. continue with step 6. if the alignment of from and to mod 4
1243 // is different.
1244 // 2. align from and to to 4 bytes by copying 1 element if necessary
1245 // 3. at l_2 from and to are 4 byte aligned; continue with
1246 // 5. if they cannot be aligned to 8 bytes because they have
1247 // got different alignment mod 8.
1248 // 4. at this point we know that both, from and to, have the same
1249 // alignment mod 8, now copy one element if necessary to get
1250 // 8 byte alignment of from and to.
1251 // 5. copy 4 elements at a time until less than 4 elements are
1252 // left; depending on step 3. all load/stores are aligned or
1253 // either all loads or all stores are unaligned.
1254 // 6. copy 2 elements at a time until less than 2 elements are
1255 // left (l_6); arriving here from step 1., there is a chance
1256 // that all accesses are unaligned.
1257 // 7. copy last element if one was left in step 6. (l_1)
1258 //
1259 // There are unaligned data accesses using integer load/store
1260 // instructions in this stub. POWER allows such accesses.
1261 //
1262 // According to the manuals (PowerISA_V2.06_PUBLIC, Book II,
1263 // Chapter 2: Effect of Operand Placement on Performance) unaligned
1264 // integer load/stores have good performance. Only unaligned
1265 // floating point load/stores can have poor performance.
1266 //
1267 // TODO:
1268 //
1269 // 1. check if aligning the backbranch target of loops is beneficial
1270 //
1271 address generate_disjoint_short_copy(StubId stub_id) {
1272 bool aligned;
1273 switch (stub_id) {
1274 case StubId::stubgen_jshort_disjoint_arraycopy_id:
1275 aligned = false;
1276 break;
1277 case StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id:
1278 aligned = true;
1279 break;
1280 default:
1281 ShouldNotReachHere();
1282 }
1283
1284 StubCodeMark mark(this, stub_id);
1285
1286 Register tmp1 = R6_ARG4;
1287 Register tmp2 = R7_ARG5;
1288 Register tmp3 = R8_ARG6;
1289 Register tmp4 = R9_ARG7;
1290
1291 VectorSRegister tmp_vsr1 = VSR1;
1292 VectorSRegister tmp_vsr2 = VSR2;
1293
1294 address start = __ function_entry();
1295 assert_positive_int(R5_ARG3);
1296
1297 Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9;
1298 {
1299 // UnsafeMemoryAccess page error: continue at UnsafeMemoryAccess common_error_exit
1300 UnsafeMemoryAccessMark umam(this, !aligned, false);
1301 // don't try anything fancy if arrays don't have many elements
1302 __ li(tmp3, 0);
1303 __ cmpwi(CR0, R5_ARG3, 9);
1304 __ ble(CR0, l_6); // copy 2 at a time
1305
1306 if (!aligned) {
1307 __ xorr(tmp1, R3_ARG1, R4_ARG2);
1308 __ andi_(tmp1, tmp1, 3);
1309 __ bne(CR0, l_6); // if arrays don't have the same alignment mod 4, do 2 element copy
1310
1311 // At this point it is guaranteed that both, from and to have the same alignment mod 4.
1312
1313 // Copy 1 element if necessary to align to 4 bytes.
1314 __ andi_(tmp1, R3_ARG1, 3);
1315 __ beq(CR0, l_2);
1316
1317 __ lhz(tmp2, 0, R3_ARG1);
1318 __ addi(R3_ARG1, R3_ARG1, 2);
1319 __ sth(tmp2, 0, R4_ARG2);
1320 __ addi(R4_ARG2, R4_ARG2, 2);
1321 __ addi(R5_ARG3, R5_ARG3, -1);
1322 __ bind(l_2);
1323
1324 // At this point the positions of both, from and to, are at least 4 byte aligned.
1325
1326 // Copy 4 elements at a time.
1327 // Align to 8 bytes, but only if both, from and to, have same alignment mod 8.
1328 __ xorr(tmp2, R3_ARG1, R4_ARG2);
1329 __ andi_(tmp1, tmp2, 7);
1330 __ bne(CR0, l_7); // not same alignment mod 8 -> copy 4, either from or to will be unaligned
1331
1332 // Copy a 2-element word if necessary to align to 8 bytes.
1333 __ andi_(R0, R3_ARG1, 7);
1334 __ beq(CR0, l_7);
1335
1336 __ lwzx(tmp2, R3_ARG1, tmp3);
1337 __ addi(R5_ARG3, R5_ARG3, -2);
1338 __ stwx(tmp2, R4_ARG2, tmp3);
1339 { // FasterArrayCopy
1340 __ addi(R3_ARG1, R3_ARG1, 4);
1341 __ addi(R4_ARG2, R4_ARG2, 4);
1342 }
1343 }
1344
1345 __ bind(l_7);
1346
1347 // Copy 4 elements at a time; either the loads or the stores can
1348 // be unaligned if aligned == false.
1349
1350 { // FasterArrayCopy
1351 __ cmpwi(CR0, R5_ARG3, 15);
1352 __ ble(CR0, l_6); // copy 2 at a time if less than 16 elements remain
1353
1354 __ srdi(tmp1, R5_ARG3, 4);
1355 __ andi_(R5_ARG3, R5_ARG3, 15);
1356 __ mtctr(tmp1);
1357
1358
1359 // Processor supports VSX, so use it to mass copy.
1360
1361 // Prefetch src data into L2 cache.
1362 __ dcbt(R3_ARG1, 0);
1363
1364 // If supported set DSCR pre-fetch to deepest.
1365 if (VM_Version::has_mfdscr()) {
1366 __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
1367 __ mtdscr(tmp2);
1368 }
1369 __ li(tmp1, 16);
1370
1371 // Backbranch target aligned to 32-byte. It's not aligned 16-byte
1372 // as loop contains < 8 instructions that fit inside a single
1373 // i-cache sector.
1374 __ align(32);
1375
1376 __ bind(l_9);
1377 // Use loop with VSX load/store instructions to
1378 // copy 16 elements a time.
1379 __ lxvd2x(tmp_vsr1, R3_ARG1); // Load from src.
1380 __ stxvd2x(tmp_vsr1, R4_ARG2); // Store to dst.
1381 __ lxvd2x(tmp_vsr2, R3_ARG1, tmp1); // Load from src + 16.
1382 __ stxvd2x(tmp_vsr2, R4_ARG2, tmp1); // Store to dst + 16.
1383 __ addi(R3_ARG1, R3_ARG1, 32); // Update src+=32.
1384 __ addi(R4_ARG2, R4_ARG2, 32); // Update dsc+=32.
1385 __ bdnz(l_9); // Dec CTR and loop if not zero.
1386
1387 // Restore DSCR pre-fetch value.
1388 if (VM_Version::has_mfdscr()) {
1389 __ load_const_optimized(tmp2, VM_Version::_dscr_val);
1390 __ mtdscr(tmp2);
1391 }
1392
1393 } // FasterArrayCopy
1394 __ bind(l_6);
1395
1396 // copy 2 elements at a time
1397 { // FasterArrayCopy
1398 __ cmpwi(CR0, R5_ARG3, 2);
1399 __ blt(CR0, l_1);
1400 __ srdi(tmp1, R5_ARG3, 1);
1401 __ andi_(R5_ARG3, R5_ARG3, 1);
1402
1403 __ addi(R3_ARG1, R3_ARG1, -4);
1404 __ addi(R4_ARG2, R4_ARG2, -4);
1405 __ mtctr(tmp1);
1406
1407 __ bind(l_3);
1408 __ lwzu(tmp2, 4, R3_ARG1);
1409 __ stwu(tmp2, 4, R4_ARG2);
1410 __ bdnz(l_3);
1411
1412 __ addi(R3_ARG1, R3_ARG1, 4);
1413 __ addi(R4_ARG2, R4_ARG2, 4);
1414 }
1415
1416 // do single element copy
1417 __ bind(l_1);
1418 __ cmpwi(CR0, R5_ARG3, 0);
1419 __ beq(CR0, l_4);
1420
1421 { // FasterArrayCopy
1422 __ mtctr(R5_ARG3);
1423 __ addi(R3_ARG1, R3_ARG1, -2);
1424 __ addi(R4_ARG2, R4_ARG2, -2);
1425
1426 __ bind(l_5);
1427 __ lhzu(tmp2, 2, R3_ARG1);
1428 __ sthu(tmp2, 2, R4_ARG2);
1429 __ bdnz(l_5);
1430 }
1431 }
1432
1433 __ bind(l_4);
1434 __ li(R3_RET, 0); // return 0
1435 __ blr();
1436
1437 return start;
1438 }
1439
1440 // Generate stub for conjoint short copy. If "aligned" is true, the
1441 // "from" and "to" addresses are assumed to be heapword aligned.
1442 //
1443 // Arguments for generated stub:
1444 // from: R3_ARG1
1445 // to: R4_ARG2
1446 // count: R5_ARG3 treated as signed
1447 //
1448 address generate_conjoint_short_copy(StubId stub_id) {
1449 bool aligned;
1450 switch (stub_id) {
1451 case StubId::stubgen_jshort_arraycopy_id:
1452 aligned = false;
1453 break;
1454 case StubId::stubgen_arrayof_jshort_arraycopy_id:
1455 aligned = true;
1456 break;
1457 default:
1458 ShouldNotReachHere();
1459 }
1460
1461 StubCodeMark mark(this, stub_id);
1462 address start = __ function_entry();
1463 assert_positive_int(R5_ARG3);
1464
1465 Register tmp1 = R6_ARG4;
1466 Register tmp2 = R7_ARG5;
1467 Register tmp3 = R8_ARG6;
1468
1469 address nooverlap_target = aligned ?
1470 STUB_ENTRY(arrayof_jshort_disjoint_arraycopy()) :
1471 STUB_ENTRY(jshort_disjoint_arraycopy());
1472
1473 array_overlap_test(nooverlap_target, 1);
1474
1475 Label l_1, l_2;
1476 {
1477 // UnsafeMemoryAccess page error: continue at UnsafeMemoryAccess common_error_exit
1478 UnsafeMemoryAccessMark umam(this, !aligned, false);
1479 __ sldi(tmp1, R5_ARG3, 1);
1480 __ b(l_2);
1481 __ bind(l_1);
1482 __ sthx(tmp2, R4_ARG2, tmp1);
1483 __ bind(l_2);
1484 __ addic_(tmp1, tmp1, -2);
1485 __ lhzx(tmp2, R3_ARG1, tmp1);
1486 __ bge(CR0, l_1);
1487 }
1488 __ li(R3_RET, 0); // return 0
1489 __ blr();
1490
1491 return start;
1492 }
1493
1494 // Generate core code for disjoint int copy (and oop copy on 32-bit). If "aligned"
1495 // is true, the "from" and "to" addresses are assumed to be heapword aligned.
1496 //
1497 // Arguments:
1498 // from: R3_ARG1
1499 // to: R4_ARG2
1500 // count: R5_ARG3 treated as signed
1501 //
1502 void generate_disjoint_int_copy_core(bool aligned) {
1503 Register tmp1 = R6_ARG4;
1504 Register tmp2 = R7_ARG5;
1505 Register tmp3 = R8_ARG6;
1506 Register tmp4 = R0;
1507
1508 VectorSRegister tmp_vsr1 = VSR1;
1509 VectorSRegister tmp_vsr2 = VSR2;
1510
1511 Label l_1, l_2, l_3, l_4, l_5, l_6, l_7;
1512
1513 // for short arrays, just do single element copy
1514 __ li(tmp3, 0);
1515 __ cmpwi(CR0, R5_ARG3, 5);
1516 __ ble(CR0, l_2);
1517
1518 if (!aligned) {
1519 // check if arrays have same alignment mod 8.
1520 __ xorr(tmp1, R3_ARG1, R4_ARG2);
1521 __ andi_(R0, tmp1, 7);
1522 // Not the same alignment, but ld and std just need to be 4 byte aligned.
1523 __ bne(CR0, l_4); // to OR from is 8 byte aligned -> copy 2 at a time
1524
1525 // copy 1 element to align to and from on an 8 byte boundary
1526 __ andi_(R0, R3_ARG1, 7);
1527 __ beq(CR0, l_4);
1528
1529 __ lwzx(tmp2, R3_ARG1, tmp3);
1530 __ addi(R5_ARG3, R5_ARG3, -1);
1531 __ stwx(tmp2, R4_ARG2, tmp3);
1532 { // FasterArrayCopy
1533 __ addi(R3_ARG1, R3_ARG1, 4);
1534 __ addi(R4_ARG2, R4_ARG2, 4);
1535 }
1536 __ bind(l_4);
1537 }
1538
1539 { // FasterArrayCopy
1540 __ cmpwi(CR0, R5_ARG3, 7);
1541 __ ble(CR0, l_2); // copy 1 at a time if less than 8 elements remain
1542
1543 __ srdi(tmp1, R5_ARG3, 3);
1544 __ andi_(R5_ARG3, R5_ARG3, 7);
1545 __ mtctr(tmp1);
1546
1547 // Processor supports VSX, so use it to mass copy.
1548
1549 // Prefetch the data into the L2 cache.
1550 __ dcbt(R3_ARG1, 0);
1551
1552 // Set DSCR pre-fetch to deepest.
1553 if (VM_Version::has_mfdscr()) {
1554 __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
1555 __ mtdscr(tmp2);
1556 }
1557 __ li(tmp1, 16);
1558
1559 // Backbranch target aligned to 32-byte. Not 16-byte align as
1560 // loop contains < 8 instructions that fit inside a single
1561 // i-cache sector.
1562 __ align(32);
1563
1564 __ bind(l_7);
1565 // Use loop with VSX load/store instructions to
1566 // copy 8 elements a time.
1567 __ lxvd2x(tmp_vsr1, R3_ARG1); // Load src
1568 __ stxvd2x(tmp_vsr1, R4_ARG2); // Store to dst
1569 __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1); // Load src + 16
1570 __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
1571 __ addi(R3_ARG1, R3_ARG1, 32); // Update src+=32
1572 __ addi(R4_ARG2, R4_ARG2, 32); // Update dsc+=32
1573 __ bdnz(l_7); // Dec CTR and loop if not zero.
1574
1575 // Restore DSCR pre-fetch value.
1576 if (VM_Version::has_mfdscr()) {
1577 __ load_const_optimized(tmp2, VM_Version::_dscr_val);
1578 __ mtdscr(tmp2);
1579 }
1580
1581 } // FasterArrayCopy
1582
1583 // copy 1 element at a time
1584 __ bind(l_2);
1585 __ cmpwi(CR0, R5_ARG3, 0);
1586 __ beq(CR0, l_1);
1587
1588 { // FasterArrayCopy
1589 __ mtctr(R5_ARG3);
1590 __ addi(R3_ARG1, R3_ARG1, -4);
1591 __ addi(R4_ARG2, R4_ARG2, -4);
1592
1593 __ bind(l_3);
1594 __ lwzu(tmp2, 4, R3_ARG1);
1595 __ stwu(tmp2, 4, R4_ARG2);
1596 __ bdnz(l_3);
1597 }
1598
1599 __ bind(l_1);
1600 return;
1601 }
1602
1603 // Generate stub for disjoint int copy. If "aligned" is true, the
1604 // "from" and "to" addresses are assumed to be heapword aligned.
1605 //
1606 // Arguments for generated stub:
1607 // from: R3_ARG1
1608 // to: R4_ARG2
1609 // count: R5_ARG3 treated as signed
1610 //
1611 address generate_disjoint_int_copy(StubId stub_id) {
1612 bool aligned;
1613 switch (stub_id) {
1614 case StubId::stubgen_jint_disjoint_arraycopy_id:
1615 aligned = false;
1616 break;
1617 case StubId::stubgen_arrayof_jint_disjoint_arraycopy_id:
1618 aligned = true;
1619 break;
1620 default:
1621 ShouldNotReachHere();
1622 }
1623
1624 StubCodeMark mark(this, stub_id);
1625 address start = __ function_entry();
1626 assert_positive_int(R5_ARG3);
1627 {
1628 // UnsafeMemoryAccess page error: continue at UnsafeMemoryAccess common_error_exit
1629 UnsafeMemoryAccessMark umam(this, !aligned, false);
1630 generate_disjoint_int_copy_core(aligned);
1631 }
1632 __ li(R3_RET, 0); // return 0
1633 __ blr();
1634 return start;
1635 }
1636
1637 // Generate core code for conjoint int copy (and oop copy on
1638 // 32-bit). If "aligned" is true, the "from" and "to" addresses
1639 // are assumed to be heapword aligned.
1640 //
1641 // Arguments:
1642 // from: R3_ARG1
1643 // to: R4_ARG2
1644 // count: R5_ARG3 treated as signed
1645 //
1646 void generate_conjoint_int_copy_core(bool aligned) {
1647 // Do reverse copy. We assume the case of actual overlap is rare enough
1648 // that we don't have to optimize it.
1649
1650 Label l_1, l_2, l_3, l_4, l_5, l_6, l_7;
1651
1652 Register tmp1 = R6_ARG4;
1653 Register tmp2 = R7_ARG5;
1654 Register tmp3 = R8_ARG6;
1655 Register tmp4 = R0;
1656
1657 VectorSRegister tmp_vsr1 = VSR1;
1658 VectorSRegister tmp_vsr2 = VSR2;
1659
1660 { // FasterArrayCopy
1661 __ cmpwi(CR0, R5_ARG3, 0);
1662 __ beq(CR0, l_6);
1663
1664 __ sldi(R5_ARG3, R5_ARG3, 2);
1665 __ add(R3_ARG1, R3_ARG1, R5_ARG3);
1666 __ add(R4_ARG2, R4_ARG2, R5_ARG3);
1667 __ srdi(R5_ARG3, R5_ARG3, 2);
1668
1669 if (!aligned) {
1670 // check if arrays have same alignment mod 8.
1671 __ xorr(tmp1, R3_ARG1, R4_ARG2);
1672 __ andi_(R0, tmp1, 7);
1673 // Not the same alignment, but ld and std just need to be 4 byte aligned.
1674 __ bne(CR0, l_7); // to OR from is 8 byte aligned -> copy 2 at a time
1675
1676 // copy 1 element to align to and from on an 8 byte boundary
1677 __ andi_(R0, R3_ARG1, 7);
1678 __ beq(CR0, l_7);
1679
1680 __ addi(R3_ARG1, R3_ARG1, -4);
1681 __ addi(R4_ARG2, R4_ARG2, -4);
1682 __ addi(R5_ARG3, R5_ARG3, -1);
1683 __ lwzx(tmp2, R3_ARG1);
1684 __ stwx(tmp2, R4_ARG2);
1685 __ bind(l_7);
1686 }
1687
1688 __ cmpwi(CR0, R5_ARG3, 7);
1689 __ ble(CR0, l_5); // copy 1 at a time if less than 8 elements remain
1690
1691 __ srdi(tmp1, R5_ARG3, 3);
1692 __ andi(R5_ARG3, R5_ARG3, 7);
1693 __ mtctr(tmp1);
1694
1695 // Processor supports VSX, so use it to mass copy.
1696 // Prefetch the data into the L2 cache.
1697 __ dcbt(R3_ARG1, 0);
1698
1699 // Set DSCR pre-fetch to deepest.
1700 if (VM_Version::has_mfdscr()) {
1701 __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
1702 __ mtdscr(tmp2);
1703 }
1704 __ li(tmp1, 16);
1705
1706 // Backbranch target aligned to 32-byte. Not 16-byte align as
1707 // loop contains < 8 instructions that fit inside a single
1708 // i-cache sector.
1709 __ align(32);
1710
1711 __ bind(l_4);
1712 // Use loop with VSX load/store instructions to
1713 // copy 8 elements a time.
1714 __ addi(R3_ARG1, R3_ARG1, -32); // Update src-=32
1715 __ addi(R4_ARG2, R4_ARG2, -32); // Update dsc-=32
1716 __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1); // Load src+16
1717 __ lxvd2x(tmp_vsr1, R3_ARG1); // Load src
1718 __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst+16
1719 __ stxvd2x(tmp_vsr1, R4_ARG2); // Store to dst
1720 __ bdnz(l_4);
1721
1722 // Restore DSCR pre-fetch value.
1723 if (VM_Version::has_mfdscr()) {
1724 __ load_const_optimized(tmp2, VM_Version::_dscr_val);
1725 __ mtdscr(tmp2);
1726 }
1727
1728 __ cmpwi(CR0, R5_ARG3, 0);
1729 __ beq(CR0, l_6);
1730
1731 __ bind(l_5);
1732 __ mtctr(R5_ARG3);
1733 __ bind(l_3);
1734 __ lwz(R0, -4, R3_ARG1);
1735 __ stw(R0, -4, R4_ARG2);
1736 __ addi(R3_ARG1, R3_ARG1, -4);
1737 __ addi(R4_ARG2, R4_ARG2, -4);
1738 __ bdnz(l_3);
1739
1740 __ bind(l_6);
1741 }
1742 }
1743
1744 // Generate stub for conjoint int copy. If "aligned" is true, the
1745 // "from" and "to" addresses are assumed to be heapword aligned.
1746 //
1747 // Arguments for generated stub:
1748 // from: R3_ARG1
1749 // to: R4_ARG2
1750 // count: R5_ARG3 treated as signed
1751 //
1752 address generate_conjoint_int_copy(StubId stub_id) {
1753 bool aligned;
1754 switch (stub_id) {
1755 case StubId::stubgen_jint_arraycopy_id:
1756 aligned = false;
1757 break;
1758 case StubId::stubgen_arrayof_jint_arraycopy_id:
1759 aligned = true;
1760 break;
1761 default:
1762 ShouldNotReachHere();
1763 }
1764
1765 StubCodeMark mark(this, stub_id);
1766 address start = __ function_entry();
1767 assert_positive_int(R5_ARG3);
1768 address nooverlap_target = aligned ?
1769 STUB_ENTRY(arrayof_jint_disjoint_arraycopy()) :
1770 STUB_ENTRY(jint_disjoint_arraycopy());
1771
1772 array_overlap_test(nooverlap_target, 2);
1773 {
1774 // UnsafeMemoryAccess page error: continue at UnsafeMemoryAccess common_error_exit
1775 UnsafeMemoryAccessMark umam(this, !aligned, false);
1776 generate_conjoint_int_copy_core(aligned);
1777 }
1778
1779 __ li(R3_RET, 0); // return 0
1780 __ blr();
1781
1782 return start;
1783 }
1784
1785 // Generate core code for disjoint long copy (and oop copy on
1786 // 64-bit). If "aligned" is true, the "from" and "to" addresses
1787 // are assumed to be heapword aligned.
1788 //
1789 // Arguments:
1790 // from: R3_ARG1
1791 // to: R4_ARG2
1792 // count: R5_ARG3 treated as signed
1793 //
1794 void generate_disjoint_long_copy_core(bool aligned) {
1795 Register tmp1 = R6_ARG4;
1796 Register tmp2 = R7_ARG5;
1797 Register tmp3 = R8_ARG6;
1798 Register tmp4 = R0;
1799
1800 Label l_1, l_2, l_3, l_4, l_5;
1801
1802 VectorSRegister tmp_vsr1 = VSR1;
1803 VectorSRegister tmp_vsr2 = VSR2;
1804
1805 { // FasterArrayCopy
1806 __ cmpwi(CR0, R5_ARG3, 3);
1807 __ ble(CR0, l_3); // copy 1 at a time if less than 4 elements remain
1808
1809 __ srdi(tmp1, R5_ARG3, 2);
1810 __ andi_(R5_ARG3, R5_ARG3, 3);
1811 __ mtctr(tmp1);
1812
1813 // Processor supports VSX, so use it to mass copy.
1814
1815 // Prefetch the data into the L2 cache.
1816 __ dcbt(R3_ARG1, 0);
1817
1818 // Set DSCR pre-fetch to deepest.
1819 if (VM_Version::has_mfdscr()) {
1820 __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
1821 __ mtdscr(tmp2);
1822 }
1823 __ li(tmp1, 16);
1824
1825 // Backbranch target aligned to 32-byte. Not 16-byte align as
1826 // loop contains < 8 instructions that fit inside a single
1827 // i-cache sector.
1828 __ align(32);
1829
1830 __ bind(l_5);
1831 // Use loop with VSX load/store instructions to
1832 // copy 4 elements a time.
1833 __ lxvd2x(tmp_vsr1, R3_ARG1); // Load src
1834 __ stxvd2x(tmp_vsr1, R4_ARG2); // Store to dst
1835 __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1); // Load src + 16
1836 __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
1837 __ addi(R3_ARG1, R3_ARG1, 32); // Update src+=32
1838 __ addi(R4_ARG2, R4_ARG2, 32); // Update dsc+=32
1839 __ bdnz(l_5); // Dec CTR and loop if not zero.
1840
1841 // Restore DSCR pre-fetch value.
1842 if (VM_Version::has_mfdscr()) {
1843 __ load_const_optimized(tmp2, VM_Version::_dscr_val);
1844 __ mtdscr(tmp2);
1845 }
1846
1847 } // FasterArrayCopy
1848
1849 // copy 1 element at a time
1850 __ bind(l_3);
1851 __ cmpwi(CR0, R5_ARG3, 0);
1852 __ beq(CR0, l_1);
1853
1854 { // FasterArrayCopy
1855 __ mtctr(R5_ARG3);
1856 __ addi(R3_ARG1, R3_ARG1, -8);
1857 __ addi(R4_ARG2, R4_ARG2, -8);
1858
1859 __ bind(l_2);
1860 __ ldu(R0, 8, R3_ARG1);
1861 __ stdu(R0, 8, R4_ARG2);
1862 __ bdnz(l_2);
1863
1864 }
1865 __ bind(l_1);
1866 }
1867
1868 // Generate stub for disjoint long copy. If "aligned" is true, the
1869 // "from" and "to" addresses are assumed to be heapword aligned.
1870 //
1871 // Arguments for generated stub:
1872 // from: R3_ARG1
1873 // to: R4_ARG2
1874 // count: R5_ARG3 treated as signed
1875 //
1876 address generate_disjoint_long_copy(StubId stub_id) {
1877 bool aligned;
1878 switch (stub_id) {
1879 case StubId::stubgen_jlong_disjoint_arraycopy_id:
1880 aligned = false;
1881 break;
1882 case StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id:
1883 aligned = true;
1884 break;
1885 default:
1886 ShouldNotReachHere();
1887 }
1888
1889 StubCodeMark mark(this, stub_id);
1890 address start = __ function_entry();
1891 assert_positive_int(R5_ARG3);
1892 {
1893 // UnsafeMemoryAccess page error: continue at UnsafeMemoryAccess common_error_exit
1894 UnsafeMemoryAccessMark umam(this, !aligned, false);
1895 generate_disjoint_long_copy_core(aligned);
1896 }
1897 __ li(R3_RET, 0); // return 0
1898 __ blr();
1899
1900 return start;
1901 }
1902
1903 // Generate core code for conjoint long copy (and oop copy on
1904 // 64-bit). If "aligned" is true, the "from" and "to" addresses
1905 // are assumed to be heapword aligned.
1906 //
1907 // Arguments:
1908 // from: R3_ARG1
1909 // to: R4_ARG2
1910 // count: R5_ARG3 treated as signed
1911 //
1912 void generate_conjoint_long_copy_core(bool aligned) {
1913 Register tmp1 = R6_ARG4;
1914 Register tmp2 = R7_ARG5;
1915 Register tmp3 = R8_ARG6;
1916 Register tmp4 = R0;
1917
1918 VectorSRegister tmp_vsr1 = VSR1;
1919 VectorSRegister tmp_vsr2 = VSR2;
1920
1921 Label l_1, l_2, l_3, l_4, l_5;
1922
1923 __ cmpwi(CR0, R5_ARG3, 0);
1924 __ beq(CR0, l_1);
1925
1926 { // FasterArrayCopy
1927 __ sldi(R5_ARG3, R5_ARG3, 3);
1928 __ add(R3_ARG1, R3_ARG1, R5_ARG3);
1929 __ add(R4_ARG2, R4_ARG2, R5_ARG3);
1930 __ srdi(R5_ARG3, R5_ARG3, 3);
1931
1932 __ cmpwi(CR0, R5_ARG3, 3);
1933 __ ble(CR0, l_5); // copy 1 at a time if less than 4 elements remain
1934
1935 __ srdi(tmp1, R5_ARG3, 2);
1936 __ andi(R5_ARG3, R5_ARG3, 3);
1937 __ mtctr(tmp1);
1938
1939 // Processor supports VSX, so use it to mass copy.
1940 // Prefetch the data into the L2 cache.
1941 __ dcbt(R3_ARG1, 0);
1942
1943 // Set DSCR pre-fetch to deepest.
1944 if (VM_Version::has_mfdscr()) {
1945 __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
1946 __ mtdscr(tmp2);
1947 }
1948 __ li(tmp1, 16);
1949
1950 // Backbranch target aligned to 32-byte. Not 16-byte align as
1951 // loop contains < 8 instructions that fit inside a single
1952 // i-cache sector.
1953 __ align(32);
1954
1955 __ bind(l_4);
1956 // Use loop with VSX load/store instructions to
1957 // copy 4 elements a time.
1958 __ addi(R3_ARG1, R3_ARG1, -32); // Update src-=32
1959 __ addi(R4_ARG2, R4_ARG2, -32); // Update dsc-=32
1960 __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1); // Load src+16
1961 __ lxvd2x(tmp_vsr1, R3_ARG1); // Load src
1962 __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst+16
1963 __ stxvd2x(tmp_vsr1, R4_ARG2); // Store to dst
1964 __ bdnz(l_4);
1965
1966 // Restore DSCR pre-fetch value.
1967 if (VM_Version::has_mfdscr()) {
1968 __ load_const_optimized(tmp2, VM_Version::_dscr_val);
1969 __ mtdscr(tmp2);
1970 }
1971
1972 __ cmpwi(CR0, R5_ARG3, 0);
1973 __ beq(CR0, l_1);
1974
1975 __ bind(l_5);
1976 __ mtctr(R5_ARG3);
1977 __ bind(l_3);
1978 __ ld(R0, -8, R3_ARG1);
1979 __ std(R0, -8, R4_ARG2);
1980 __ addi(R3_ARG1, R3_ARG1, -8);
1981 __ addi(R4_ARG2, R4_ARG2, -8);
1982 __ bdnz(l_3);
1983
1984 }
1985 __ bind(l_1);
1986 }
1987
1988 // Generate stub for conjoint long copy. If "aligned" is true, the
1989 // "from" and "to" addresses are assumed to be heapword aligned.
1990 //
1991 // Arguments for generated stub:
1992 // from: R3_ARG1
1993 // to: R4_ARG2
1994 // count: R5_ARG3 treated as signed
1995 //
1996 address generate_conjoint_long_copy(StubId stub_id) {
1997 bool aligned;
1998 switch (stub_id) {
1999 case StubId::stubgen_jlong_arraycopy_id:
2000 aligned = false;
2001 break;
2002 case StubId::stubgen_arrayof_jlong_arraycopy_id:
2003 aligned = true;
2004 break;
2005 default:
2006 ShouldNotReachHere();
2007 }
2008
2009 StubCodeMark mark(this, stub_id);
2010 address start = __ function_entry();
2011 assert_positive_int(R5_ARG3);
2012 address nooverlap_target = aligned ?
2013 STUB_ENTRY(arrayof_jlong_disjoint_arraycopy()) :
2014 STUB_ENTRY(jlong_disjoint_arraycopy());
2015
2016 array_overlap_test(nooverlap_target, 3);
2017 {
2018 // UnsafeMemoryAccess page error: continue at UnsafeMemoryAccess common_error_exit
2019 UnsafeMemoryAccessMark umam(this, !aligned, false);
2020 generate_conjoint_long_copy_core(aligned);
2021 }
2022 __ li(R3_RET, 0); // return 0
2023 __ blr();
2024
2025 return start;
2026 }
2027
2028 // Generate stub for conjoint oop copy. If "aligned" is true, the
2029 // "from" and "to" addresses are assumed to be heapword aligned.
2030 //
2031 // Arguments for generated stub:
2032 // from: R3_ARG1
2033 // to: R4_ARG2
2034 // count: R5_ARG3 treated as signed
2035 // dest_uninitialized: G1 support
2036 //
2037 address generate_conjoint_oop_copy(StubId stub_id) {
2038 bool aligned;
2039 bool dest_uninitialized;
2040 switch (stub_id) {
2041 case StubId::stubgen_oop_arraycopy_id:
2042 aligned = false;
2043 dest_uninitialized = false;
2044 break;
2045 case StubId::stubgen_arrayof_oop_arraycopy_id:
2046 aligned = true;
2047 dest_uninitialized = false;
2048 break;
2049 case StubId::stubgen_oop_arraycopy_uninit_id:
2050 aligned = false;
2051 dest_uninitialized = true;
2052 break;
2053 case StubId::stubgen_arrayof_oop_arraycopy_uninit_id:
2054 aligned = true;
2055 dest_uninitialized = true;
2056 break;
2057 default:
2058 ShouldNotReachHere();
2059 }
2060
2061 StubCodeMark mark(this, stub_id);
2062 address start = __ function_entry();
2063 assert_positive_int(R5_ARG3);
2064 address nooverlap_target = aligned ?
2065 STUB_ENTRY(arrayof_oop_disjoint_arraycopy(dest_uninitialized)) :
2066 STUB_ENTRY(oop_disjoint_arraycopy(dest_uninitialized));
2067
2068 array_overlap_test(nooverlap_target, UseCompressedOops ? 2 : 3);
2069
2070 DecoratorSet decorators = IN_HEAP | IS_ARRAY;
2071 if (dest_uninitialized) {
2072 decorators |= IS_DEST_UNINITIALIZED;
2073 }
2074 if (aligned) {
2075 decorators |= ARRAYCOPY_ALIGNED;
2076 }
2077
2078 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2079 bs->arraycopy_prologue(_masm, decorators, T_OBJECT, R3_ARG1, R4_ARG2, R5_ARG3, noreg, noreg);
2080
2081 if (UseCompressedOops) {
2082 generate_conjoint_int_copy_core(aligned);
2083 } else {
2084 #if INCLUDE_ZGC
2085 if (UseZGC) {
2086 ZBarrierSetAssembler *zbs = (ZBarrierSetAssembler*)bs;
2087 zbs->generate_conjoint_oop_copy(_masm, dest_uninitialized);
2088 } else
2089 #endif
2090 generate_conjoint_long_copy_core(aligned);
2091 }
2092
2093 bs->arraycopy_epilogue(_masm, decorators, T_OBJECT, R4_ARG2, R5_ARG3, noreg);
2094 __ li(R3_RET, 0); // return 0
2095 __ blr();
2096 return start;
2097 }
2098
2099 // Generate stub for disjoint oop copy. If "aligned" is true, the
2100 // "from" and "to" addresses are assumed to be heapword aligned.
2101 //
2102 // Arguments for generated stub:
2103 // from: R3_ARG1
2104 // to: R4_ARG2
2105 // count: R5_ARG3 treated as signed
2106 // dest_uninitialized: G1 support
2107 //
2108 address generate_disjoint_oop_copy(StubId stub_id) {
2109 bool aligned;
2110 bool dest_uninitialized;
2111 switch (stub_id) {
2112 case StubId::stubgen_oop_disjoint_arraycopy_id:
2113 aligned = false;
2114 dest_uninitialized = false;
2115 break;
2116 case StubId::stubgen_arrayof_oop_disjoint_arraycopy_id:
2117 aligned = true;
2118 dest_uninitialized = false;
2119 break;
2120 case StubId::stubgen_oop_disjoint_arraycopy_uninit_id:
2121 aligned = false;
2122 dest_uninitialized = true;
2123 break;
2124 case StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id:
2125 aligned = true;
2126 dest_uninitialized = true;
2127 break;
2128 default:
2129 ShouldNotReachHere();
2130 }
2131
2132 StubCodeMark mark(this, stub_id);
2133 address start = __ function_entry();
2134 assert_positive_int(R5_ARG3);
2135
2136 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
2137 if (dest_uninitialized) {
2138 decorators |= IS_DEST_UNINITIALIZED;
2139 }
2140 if (aligned) {
2141 decorators |= ARRAYCOPY_ALIGNED;
2142 }
2143
2144 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2145 bs->arraycopy_prologue(_masm, decorators, T_OBJECT, R3_ARG1, R4_ARG2, R5_ARG3, noreg, noreg);
2146
2147 if (UseCompressedOops) {
2148 generate_disjoint_int_copy_core(aligned);
2149 } else {
2150 #if INCLUDE_ZGC
2151 if (UseZGC) {
2152 ZBarrierSetAssembler *zbs = (ZBarrierSetAssembler*)bs;
2153 zbs->generate_disjoint_oop_copy(_masm, dest_uninitialized);
2154 } else
2155 #endif
2156 generate_disjoint_long_copy_core(aligned);
2157 }
2158
2159 bs->arraycopy_epilogue(_masm, decorators, T_OBJECT, R4_ARG2, R5_ARG3, noreg);
2160 __ li(R3_RET, 0); // return 0
2161 __ blr();
2162
2163 return start;
2164 }
2165
2166
2167 // Helper for generating a dynamic type check.
2168 // Smashes only the given temp registers.
2169 void generate_type_check(Register sub_klass,
2170 Register super_check_offset,
2171 Register super_klass,
2172 Register temp1,
2173 Register temp2,
2174 Label& L_success) {
2175 assert_different_registers(sub_klass, super_check_offset, super_klass);
2176
2177 BLOCK_COMMENT("type_check:");
2178
2179 Label L_miss;
2180
2181 __ check_klass_subtype_fast_path(sub_klass, super_klass, temp1, temp2, &L_success, &L_miss, nullptr,
2182 super_check_offset);
2183 __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success);
2184
2185 // Fall through on failure!
2186 __ bind(L_miss);
2187 }
2188
2189
2190 // Generate stub for checked oop copy.
2191 //
2192 // Arguments for generated stub:
2193 // from: R3
2194 // to: R4
2195 // count: R5 treated as signed
2196 // ckoff: R6 (super_check_offset)
2197 // ckval: R7 (super_klass)
2198 // ret: R3 zero for success; (-1^K) where K is partial transfer count
2199 //
2200 address generate_checkcast_copy(StubId stub_id) {
2201 const Register R3_from = R3_ARG1; // source array address
2202 const Register R4_to = R4_ARG2; // destination array address
2203 const Register R5_count = R5_ARG3; // elements count
2204 const Register R6_ckoff = R6_ARG4; // super_check_offset
2205 const Register R7_ckval = R7_ARG5; // super_klass
2206
2207 const Register R8_offset = R8_ARG6; // loop var, with stride wordSize
2208 const Register R9_remain = R9_ARG7; // loop var, with stride -1
2209 const Register R10_oop = R10_ARG8; // actual oop copied
2210 const Register R11_klass = R11_scratch1; // oop._klass
2211 const Register R12_tmp = R12_scratch2;
2212 const Register R2_tmp = R2;
2213
2214 bool dest_uninitialized;
2215 switch (stub_id) {
2216 case StubId::stubgen_checkcast_arraycopy_id:
2217 dest_uninitialized = false;
2218 break;
2219 case StubId::stubgen_checkcast_arraycopy_uninit_id:
2220 dest_uninitialized = true;
2221 break;
2222 default:
2223 ShouldNotReachHere();
2224 }
2225 //__ align(CodeEntryAlignment);
2226 StubCodeMark mark(this, stub_id);
2227 address start = __ function_entry();
2228
2229 // Assert that int is 64 bit sign extended and arrays are not conjoint.
2230 #ifdef ASSERT
2231 {
2232 assert_positive_int(R5_ARG3);
2233 const Register tmp1 = R11_scratch1, tmp2 = R12_scratch2;
2234 Label no_overlap;
2235 __ subf(tmp1, R3_ARG1, R4_ARG2); // distance in bytes
2236 __ sldi(tmp2, R5_ARG3, LogBytesPerHeapOop); // size in bytes
2237 __ cmpld(CR0, R3_ARG1, R4_ARG2); // Use unsigned comparison!
2238 __ cmpld(CR1, tmp1, tmp2);
2239 __ crnand(CR0, Assembler::less, CR1, Assembler::less);
2240 // Overlaps if Src before dst and distance smaller than size.
2241 // Branch to forward copy routine otherwise.
2242 __ blt(CR0, no_overlap);
2243 __ stop("overlap in checkcast_copy");
2244 __ bind(no_overlap);
2245 }
2246 #endif
2247
2248 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST;
2249 if (dest_uninitialized) {
2250 decorators |= IS_DEST_UNINITIALIZED;
2251 }
2252
2253 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2254 bs->arraycopy_prologue(_masm, decorators, T_OBJECT, R3_from, R4_to, R5_count, /* preserve: */ R6_ckoff, R7_ckval);
2255
2256 //inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr, R12_tmp, R3_RET);
2257
2258 Label load_element, store_element, store_null, success, do_epilogue;
2259 __ or_(R9_remain, R5_count, R5_count); // Initialize loop index, and test it.
2260 __ li(R8_offset, 0); // Offset from start of arrays.
2261 __ bne(CR0, load_element);
2262
2263 // Empty array: Nothing to do.
2264 __ li(R3_RET, 0); // Return 0 on (trivial) success.
2265 __ blr();
2266
2267 // ======== begin loop ========
2268 // (Entry is load_element.)
2269 __ align(OptoLoopAlignment);
2270 __ bind(store_element);
2271 if (UseCompressedOops) {
2272 __ encode_heap_oop_not_null(R10_oop);
2273 __ bind(store_null);
2274 __ stw(R10_oop, R8_offset, R4_to);
2275 } else {
2276 __ bind(store_null);
2277 #if INCLUDE_ZGC
2278 if (UseZGC) {
2279 __ store_heap_oop(R10_oop, R8_offset, R4_to, R11_scratch1, R12_tmp, noreg,
2280 MacroAssembler::PRESERVATION_FRAME_LR_GP_REGS,
2281 dest_uninitialized ? IS_DEST_UNINITIALIZED : 0);
2282 } else
2283 #endif
2284 __ std(R10_oop, R8_offset, R4_to);
2285 }
2286
2287 __ addi(R8_offset, R8_offset, heapOopSize); // Step to next offset.
2288 __ addic_(R9_remain, R9_remain, -1); // Decrement the count.
2289 __ beq(CR0, success);
2290
2291 // ======== loop entry is here ========
2292 __ bind(load_element);
2293 #if INCLUDE_ZGC
2294 if (UseZGC) {
2295 __ load_heap_oop(R10_oop, R8_offset, R3_from,
2296 R11_scratch1, R12_tmp,
2297 MacroAssembler::PRESERVATION_FRAME_LR_GP_REGS,
2298 0, &store_null);
2299 } else
2300 #endif
2301 __ load_heap_oop(R10_oop, R8_offset, R3_from,
2302 R11_scratch1, R12_tmp,
2303 MacroAssembler::PRESERVATION_FRAME_LR_GP_REGS,
2304 AS_RAW, &store_null);
2305
2306 __ load_klass(R11_klass, R10_oop); // Query the object klass.
2307
2308 generate_type_check(R11_klass, R6_ckoff, R7_ckval, R12_tmp, R2_tmp,
2309 // Branch to this on success:
2310 store_element);
2311 // ======== end loop ========
2312
2313 // It was a real error; we must depend on the caller to finish the job.
2314 // Register R9_remain has number of *remaining* oops, R5_count number of *total* oops.
2315 // Emit GC store barriers for the oops we have copied (R5_count minus R9_remain),
2316 // and report their number to the caller.
2317 __ subf_(R5_count, R9_remain, R5_count);
2318 __ nand(R3_RET, R5_count, R5_count); // report (-1^K) to caller
2319 __ bne(CR0, do_epilogue);
2320 __ blr();
2321
2322 __ bind(success);
2323 __ li(R3_RET, 0);
2324
2325 __ bind(do_epilogue);
2326 bs->arraycopy_epilogue(_masm, decorators, T_OBJECT, R4_to, R5_count, /* preserve */ R3_RET);
2327
2328 __ blr();
2329 return start;
2330 }
2331
2332
2333 // Generate 'unsafe' array copy stub.
2334 // Though just as safe as the other stubs, it takes an unscaled
2335 // size_t argument instead of an element count.
2336 //
2337 // Arguments for generated stub:
2338 // from: R3
2339 // to: R4
2340 // count: R5 byte count, treated as ssize_t, can be zero
2341 //
2342 // Examines the alignment of the operands and dispatches
2343 // to a long, int, short, or byte copy loop.
2344 //
2345 address generate_unsafe_copy(address byte_copy_entry,
2346 address short_copy_entry,
2347 address int_copy_entry,
2348 address long_copy_entry) {
2349
2350 const Register R3_from = R3_ARG1; // source array address
2351 const Register R4_to = R4_ARG2; // destination array address
2352 const Register R5_count = R5_ARG3; // elements count (as long on PPC64)
2353
2354 const Register R6_bits = R6_ARG4; // test copy of low bits
2355 const Register R7_tmp = R7_ARG5;
2356
2357 //__ align(CodeEntryAlignment);
2358 StubId stub_id = StubId::stubgen_unsafe_arraycopy_id;
2359 StubCodeMark mark(this, stub_id);
2360 address start = __ function_entry();
2361
2362 // Bump this on entry, not on exit:
2363 //inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr, R6_bits, R7_tmp);
2364
2365 Label short_copy, int_copy, long_copy;
2366
2367 __ orr(R6_bits, R3_from, R4_to);
2368 __ orr(R6_bits, R6_bits, R5_count);
2369 __ andi_(R0, R6_bits, (BytesPerLong-1));
2370 __ beq(CR0, long_copy);
2371
2372 __ andi_(R0, R6_bits, (BytesPerInt-1));
2373 __ beq(CR0, int_copy);
2374
2375 __ andi_(R0, R6_bits, (BytesPerShort-1));
2376 __ beq(CR0, short_copy);
2377
2378 // byte_copy:
2379 __ b(byte_copy_entry);
2380
2381 __ bind(short_copy);
2382 __ srwi(R5_count, R5_count, LogBytesPerShort);
2383 __ b(short_copy_entry);
2384
2385 __ bind(int_copy);
2386 __ srwi(R5_count, R5_count, LogBytesPerInt);
2387 __ b(int_copy_entry);
2388
2389 __ bind(long_copy);
2390 __ srwi(R5_count, R5_count, LogBytesPerLong);
2391 __ b(long_copy_entry);
2392
2393 return start;
2394 }
2395
2396
2397 // Perform range checks on the proposed arraycopy.
2398 // Kills the two temps, but nothing else.
2399 // Also, clean the sign bits of src_pos and dst_pos.
2400 void arraycopy_range_checks(Register src, // source array oop
2401 Register src_pos, // source position
2402 Register dst, // destination array oop
2403 Register dst_pos, // destination position
2404 Register length, // length of copy
2405 Register temp1, Register temp2,
2406 Label& L_failed) {
2407 BLOCK_COMMENT("arraycopy_range_checks:");
2408
2409 const Register array_length = temp1; // scratch
2410 const Register end_pos = temp2; // scratch
2411
2412 // if (src_pos + length > arrayOop(src)->length() ) FAIL;
2413 __ lwa(array_length, arrayOopDesc::length_offset_in_bytes(), src);
2414 __ add(end_pos, src_pos, length); // src_pos + length
2415 __ cmpd(CR0, end_pos, array_length);
2416 __ bgt(CR0, L_failed);
2417
2418 // if (dst_pos + length > arrayOop(dst)->length() ) FAIL;
2419 __ lwa(array_length, arrayOopDesc::length_offset_in_bytes(), dst);
2420 __ add(end_pos, dst_pos, length); // src_pos + length
2421 __ cmpd(CR0, end_pos, array_length);
2422 __ bgt(CR0, L_failed);
2423
2424 BLOCK_COMMENT("arraycopy_range_checks done");
2425 }
2426
2427
2428 // Helper for generate_unsafe_setmemory
2429 //
2430 // Atomically fill an array of memory using 1-, 2-, 4-, or 8-byte chunks and return.
2431 static void do_setmemory_atomic_loop(int elem_size, Register dest, Register size, Register byteVal,
2432 MacroAssembler *_masm) {
2433
2434 Label L_Loop, L_Tail; // 2x unrolled loop
2435
2436 // Propagate byte to required width
2437 if (elem_size > 1) __ rldimi(byteVal, byteVal, 8, 64 - 2 * 8);
2438 if (elem_size > 2) __ rldimi(byteVal, byteVal, 16, 64 - 2 * 16);
2439 if (elem_size > 4) __ rldimi(byteVal, byteVal, 32, 64 - 2 * 32);
2440
2441 __ srwi_(R0, size, exact_log2(2 * elem_size)); // size is a 32 bit value
2442 __ beq(CR0, L_Tail);
2443 __ mtctr(R0);
2444
2445 __ align(32); // loop alignment
2446 __ bind(L_Loop);
2447 __ store_sized_value(byteVal, 0, dest, elem_size);
2448 __ store_sized_value(byteVal, elem_size, dest, elem_size);
2449 __ addi(dest, dest, 2 * elem_size);
2450 __ bdnz(L_Loop);
2451
2452 __ bind(L_Tail);
2453 __ andi_(R0, size, elem_size);
2454 __ bclr(Assembler::bcondCRbiIs1, Assembler::bi0(CR0, Assembler::equal), Assembler::bhintbhBCLRisReturn);
2455 __ store_sized_value(byteVal, 0, dest, elem_size);
2456 __ blr();
2457 }
2458
2459 //
2460 // Generate 'unsafe' set memory stub
2461 // Though just as safe as the other stubs, it takes an unscaled
2462 // size_t (# bytes) argument instead of an element count.
2463 //
2464 // Input:
2465 // R3_ARG1 - destination array address
2466 // R4_ARG2 - byte count (size_t)
2467 // R5_ARG3 - byte value
2468 //
2469 address generate_unsafe_setmemory(address unsafe_byte_fill) {
2470 __ align(CodeEntryAlignment);
2471 StubCodeMark mark(this, StubId::stubgen_unsafe_setmemory_id);
2472 address start = __ function_entry();
2473
2474 // bump this on entry, not on exit:
2475 // inc_counter_np(SharedRuntime::_unsafe_set_memory_ctr);
2476
2477 {
2478 Label L_fill8Bytes, L_fill4Bytes, L_fillBytes;
2479
2480 const Register dest = R3_ARG1;
2481 const Register size = R4_ARG2;
2482 const Register byteVal = R5_ARG3;
2483 const Register rScratch1 = R6;
2484
2485 // fill_to_memory_atomic(unsigned char*, unsigned long, unsigned char)
2486
2487 // Check for pointer & size alignment
2488 __ orr(rScratch1, dest, size);
2489
2490 __ andi_(R0, rScratch1, 7);
2491 __ beq(CR0, L_fill8Bytes);
2492
2493 __ andi_(R0, rScratch1, 3);
2494 __ beq(CR0, L_fill4Bytes);
2495
2496 __ andi_(R0, rScratch1, 1);
2497 __ bne(CR0, L_fillBytes);
2498
2499 // Mark remaining code as such which performs Unsafe accesses.
2500 UnsafeMemoryAccessMark umam(this, true, false);
2501
2502 // At this point, we know the lower bit of size is zero and a
2503 // multiple of 2
2504 do_setmemory_atomic_loop(2, dest, size, byteVal, _masm);
2505
2506 __ align(32);
2507 __ bind(L_fill8Bytes);
2508 // At this point, we know the lower 3 bits of size are zero and a
2509 // multiple of 8
2510 do_setmemory_atomic_loop(8, dest, size, byteVal, _masm);
2511
2512 __ align(32);
2513 __ bind(L_fill4Bytes);
2514 // At this point, we know the lower 2 bits of size are zero and a
2515 // multiple of 4
2516 do_setmemory_atomic_loop(4, dest, size, byteVal, _masm);
2517
2518 __ align(32);
2519 __ bind(L_fillBytes);
2520 do_setmemory_atomic_loop(1, dest, size, byteVal, _masm);
2521 }
2522
2523 return start;
2524 }
2525
2526
2527 //
2528 // Generate generic array copy stubs
2529 //
2530 // Input:
2531 // R3 - src oop
2532 // R4 - src_pos
2533 // R5 - dst oop
2534 // R6 - dst_pos
2535 // R7 - element count
2536 //
2537 // Output:
2538 // R3 == 0 - success
2539 // R3 == -1 - need to call System.arraycopy
2540 //
2541 address generate_generic_copy(address entry_jbyte_arraycopy,
2542 address entry_jshort_arraycopy,
2543 address entry_jint_arraycopy,
2544 address entry_oop_arraycopy,
2545 address entry_disjoint_oop_arraycopy,
2546 address entry_jlong_arraycopy,
2547 address entry_checkcast_arraycopy) {
2548 Label L_failed, L_objArray;
2549
2550 // Input registers
2551 const Register src = R3_ARG1; // source array oop
2552 const Register src_pos = R4_ARG2; // source position
2553 const Register dst = R5_ARG3; // destination array oop
2554 const Register dst_pos = R6_ARG4; // destination position
2555 const Register length = R7_ARG5; // elements count
2556
2557 // registers used as temp
2558 const Register src_klass = R8_ARG6; // source array klass
2559 const Register dst_klass = R9_ARG7; // destination array klass
2560 const Register lh = R10_ARG8; // layout handler
2561 const Register temp = R2;
2562
2563 //__ align(CodeEntryAlignment);
2564 StubId stub_id = StubId::stubgen_generic_arraycopy_id;
2565 StubCodeMark mark(this, stub_id);
2566 address start = __ function_entry();
2567
2568 // Bump this on entry, not on exit:
2569 //inc_counter_np(SharedRuntime::_generic_array_copy_ctr, lh, temp);
2570
2571 // In principle, the int arguments could be dirty.
2572
2573 //-----------------------------------------------------------------------
2574 // Assembler stubs will be used for this call to arraycopy
2575 // if the following conditions are met:
2576 //
2577 // (1) src and dst must not be null.
2578 // (2) src_pos must not be negative.
2579 // (3) dst_pos must not be negative.
2580 // (4) length must not be negative.
2581 // (5) src klass and dst klass should be the same and not null.
2582 // (6) src and dst should be arrays.
2583 // (7) src_pos + length must not exceed length of src.
2584 // (8) dst_pos + length must not exceed length of dst.
2585 BLOCK_COMMENT("arraycopy initial argument checks");
2586
2587 __ cmpdi(CR1, src, 0); // if (src == nullptr) return -1;
2588 __ extsw_(src_pos, src_pos); // if (src_pos < 0) return -1;
2589 __ cmpdi(CR5, dst, 0); // if (dst == nullptr) return -1;
2590 __ cror(CR1, Assembler::equal, CR0, Assembler::less);
2591 __ extsw_(dst_pos, dst_pos); // if (src_pos < 0) return -1;
2592 __ cror(CR5, Assembler::equal, CR0, Assembler::less);
2593 __ extsw_(length, length); // if (length < 0) return -1;
2594 __ cror(CR1, Assembler::equal, CR5, Assembler::equal);
2595 __ cror(CR1, Assembler::equal, CR0, Assembler::less);
2596 __ beq(CR1, L_failed);
2597
2598 BLOCK_COMMENT("arraycopy argument klass checks");
2599 __ load_klass(src_klass, src);
2600 __ load_klass(dst_klass, dst);
2601
2602 // Load layout helper
2603 //
2604 // |array_tag| | header_size | element_type | |log2_element_size|
2605 // 32 30 24 16 8 2 0
2606 //
2607 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2608 //
2609
2610 int lh_offset = in_bytes(Klass::layout_helper_offset());
2611
2612 // Load 32-bits signed value. Use br() instruction with it to check icc.
2613 __ lwz(lh, lh_offset, src_klass);
2614
2615 // Handle objArrays completely differently...
2616 jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2617 __ load_const_optimized(temp, objArray_lh, R0);
2618 __ cmpw(CR0, lh, temp);
2619 __ beq(CR0, L_objArray);
2620
2621 __ cmpd(CR5, src_klass, dst_klass); // if (src->klass() != dst->klass()) return -1;
2622 __ bne(CR5, L_failed);
2623
2624 // Check for flat inline type array -> return -1
2625 __ test_flat_array_oop(src, temp, L_failed);
2626
2627 // Check for null-free (non-flat) inline type array -> handle as object array
2628 __ test_null_free_array_oop(src, temp, L_objArray);
2629
2630 __ cmpwi(CR6, lh, Klass::_lh_neutral_value); // if (!src->is_Array()) return -1;
2631 __ bge(CR6, L_failed);
2632
2633 // At this point, it is known to be a typeArray (array_tag 0x3).
2634 #ifdef ASSERT
2635 { Label L;
2636 jint lh_prim_tag_in_place = (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2637 __ load_const_optimized(temp, lh_prim_tag_in_place, R0);
2638 __ cmpw(CR0, lh, temp);
2639 __ bge(CR0, L);
2640 __ stop("must be a primitive array");
2641 __ bind(L);
2642 }
2643 #endif
2644
2645 arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
2646 temp, dst_klass, L_failed);
2647
2648 // TypeArrayKlass
2649 //
2650 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2651 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2652 //
2653
2654 const Register offset = dst_klass; // array offset
2655 const Register elsize = src_klass; // log2 element size
2656
2657 __ rldicl(offset, lh, 64 - Klass::_lh_header_size_shift, 64 - exact_log2(Klass::_lh_header_size_mask + 1));
2658 __ andi(elsize, lh, Klass::_lh_log2_element_size_mask);
2659 __ add(src, offset, src); // src array offset
2660 __ add(dst, offset, dst); // dst array offset
2661
2662 // Next registers should be set before the jump to corresponding stub.
2663 const Register from = R3_ARG1; // source array address
2664 const Register to = R4_ARG2; // destination array address
2665 const Register count = R5_ARG3; // elements count
2666
2667 // 'from', 'to', 'count' registers should be set in this order
2668 // since they are the same as 'src', 'src_pos', 'dst'.
2669
2670 BLOCK_COMMENT("scale indexes to element size");
2671 __ sld(src_pos, src_pos, elsize);
2672 __ sld(dst_pos, dst_pos, elsize);
2673 __ add(from, src_pos, src); // src_addr
2674 __ add(to, dst_pos, dst); // dst_addr
2675 __ mr(count, length); // length
2676
2677 BLOCK_COMMENT("choose copy loop based on element size");
2678 // Using conditional branches with range 32kB.
2679 const int bo = Assembler::bcondCRbiIs1, bi = Assembler::bi0(CR0, Assembler::equal);
2680 __ cmpwi(CR0, elsize, 0);
2681 __ bc(bo, bi, entry_jbyte_arraycopy);
2682 __ cmpwi(CR0, elsize, LogBytesPerShort);
2683 __ bc(bo, bi, entry_jshort_arraycopy);
2684 __ cmpwi(CR0, elsize, LogBytesPerInt);
2685 __ bc(bo, bi, entry_jint_arraycopy);
2686 #ifdef ASSERT
2687 { Label L;
2688 __ cmpwi(CR0, elsize, LogBytesPerLong);
2689 __ beq(CR0, L);
2690 __ stop("must be long copy, but elsize is wrong");
2691 __ bind(L);
2692 }
2693 #endif
2694 __ b(entry_jlong_arraycopy);
2695
2696 // ObjArrayKlass
2697 __ bind(L_objArray);
2698 // live at this point: src_klass, dst_klass, src[_pos], dst[_pos], length
2699
2700 Label L_disjoint_plain_copy, L_checkcast_copy;
2701 // test array classes for subtyping
2702 __ cmpd(CR0, src_klass, dst_klass); // usual case is exact equality
2703 __ bne(CR0, L_checkcast_copy);
2704
2705 // Identically typed arrays can be copied without element-wise checks.
2706 arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
2707 temp, lh, L_failed);
2708
2709 __ addi(src, src, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //src offset
2710 __ addi(dst, dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //dst offset
2711 __ sldi(src_pos, src_pos, LogBytesPerHeapOop);
2712 __ sldi(dst_pos, dst_pos, LogBytesPerHeapOop);
2713 __ add(from, src_pos, src); // src_addr
2714 __ add(to, dst_pos, dst); // dst_addr
2715 __ mr(count, length); // length
2716 __ b(entry_oop_arraycopy);
2717
2718 __ bind(L_checkcast_copy);
2719 // live at this point: src_klass, dst_klass
2720 {
2721 // Before looking at dst.length, make sure dst is also an objArray.
2722 __ lwz(temp, lh_offset, dst_klass);
2723 __ cmpw(CR0, lh, temp);
2724 __ bne(CR0, L_failed);
2725
2726 // It is safe to examine both src.length and dst.length.
2727 arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
2728 temp, lh, L_failed);
2729
2730 // Marshal the base address arguments now, freeing registers.
2731 __ addi(src, src, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //src offset
2732 __ addi(dst, dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //dst offset
2733 __ sldi(src_pos, src_pos, LogBytesPerHeapOop);
2734 __ sldi(dst_pos, dst_pos, LogBytesPerHeapOop);
2735 __ add(from, src_pos, src); // src_addr
2736 __ add(to, dst_pos, dst); // dst_addr
2737 __ mr(count, length); // length
2738
2739 Register sco_temp = R6_ARG4; // This register is free now.
2740 assert_different_registers(from, to, count, sco_temp,
2741 dst_klass, src_klass);
2742
2743 // Generate the type check.
2744 int sco_offset = in_bytes(Klass::super_check_offset_offset());
2745 __ lwz(sco_temp, sco_offset, dst_klass);
2746 generate_type_check(src_klass, sco_temp, dst_klass,
2747 temp, /* temp */ R10_ARG8, L_disjoint_plain_copy);
2748
2749 // Fetch destination element klass from the ObjArrayKlass header.
2750 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2751
2752 // The checkcast_copy loop needs two extra arguments:
2753 __ ld(R7_ARG5, ek_offset, dst_klass); // dest elem klass
2754 __ lwz(R6_ARG4, sco_offset, R7_ARG5); // sco of elem klass
2755 __ b(entry_checkcast_arraycopy);
2756 }
2757
2758 __ bind(L_disjoint_plain_copy);
2759 __ b(entry_disjoint_oop_arraycopy);
2760
2761 __ bind(L_failed);
2762 __ li(R3_RET, -1); // return -1
2763 __ blr();
2764 return start;
2765 }
2766
2767 // Arguments for generated stub:
2768 // R3_ARG1 - source byte array address
2769 // R4_ARG2 - destination byte array address
2770 // R5_ARG3 - round key array
2771 address generate_aescrypt_encryptBlock() {
2772 assert(UseAES, "need AES instructions and misaligned SSE support");
2773 StubId stub_id = StubId::stubgen_aescrypt_encryptBlock_id;
2774 StubCodeMark mark(this, stub_id);
2775
2776 address start = __ function_entry();
2777
2778 Label L_doLast, L_error;
2779
2780 Register from = R3_ARG1; // source array address
2781 Register to = R4_ARG2; // destination array address
2782 Register key = R5_ARG3; // round key array
2783
2784 Register keylen = R8;
2785 Register temp = R9;
2786 Register keypos = R10;
2787 Register fifteen = R12;
2788
2789 VectorRegister vRet = VR0;
2790
2791 VectorRegister vKey1 = VR1;
2792 VectorRegister vKey2 = VR2;
2793 VectorRegister vKey3 = VR3;
2794 VectorRegister vKey4 = VR4;
2795
2796 VectorRegister fromPerm = VR5;
2797 VectorRegister keyPerm = VR6;
2798 VectorRegister toPerm = VR7;
2799 VectorRegister fSplt = VR8;
2800
2801 VectorRegister vTmp1 = VR9;
2802 VectorRegister vTmp2 = VR10;
2803 VectorRegister vTmp3 = VR11;
2804 VectorRegister vTmp4 = VR12;
2805
2806 __ li (fifteen, 15);
2807
2808 // load unaligned from[0-15] to vRet
2809 __ lvx (vRet, from);
2810 __ lvx (vTmp1, fifteen, from);
2811 __ lvsl (fromPerm, from);
2812 #ifdef VM_LITTLE_ENDIAN
2813 __ vspltisb (fSplt, 0x0f);
2814 __ vxor (fromPerm, fromPerm, fSplt);
2815 #endif
2816 __ vperm (vRet, vRet, vTmp1, fromPerm);
2817
2818 // load keylen (44 or 52 or 60)
2819 __ lwz (keylen, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT), key);
2820
2821 // to load keys
2822 __ load_perm (keyPerm, key);
2823 #ifdef VM_LITTLE_ENDIAN
2824 __ vspltisb (vTmp2, -16);
2825 __ vrld (keyPerm, keyPerm, vTmp2);
2826 __ vrld (keyPerm, keyPerm, vTmp2);
2827 __ vsldoi (keyPerm, keyPerm, keyPerm, 8);
2828 #endif
2829
2830 // load the 1st round key to vTmp1
2831 __ lvx (vTmp1, key);
2832 __ li (keypos, 16);
2833 __ lvx (vKey1, keypos, key);
2834 __ vec_perm (vTmp1, vKey1, keyPerm);
2835
2836 // 1st round
2837 __ vxor (vRet, vRet, vTmp1);
2838
2839 // load the 2nd round key to vKey1
2840 __ li (keypos, 32);
2841 __ lvx (vKey2, keypos, key);
2842 __ vec_perm (vKey1, vKey2, keyPerm);
2843
2844 // load the 3rd round key to vKey2
2845 __ li (keypos, 48);
2846 __ lvx (vKey3, keypos, key);
2847 __ vec_perm (vKey2, vKey3, keyPerm);
2848
2849 // load the 4th round key to vKey3
2850 __ li (keypos, 64);
2851 __ lvx (vKey4, keypos, key);
2852 __ vec_perm (vKey3, vKey4, keyPerm);
2853
2854 // load the 5th round key to vKey4
2855 __ li (keypos, 80);
2856 __ lvx (vTmp1, keypos, key);
2857 __ vec_perm (vKey4, vTmp1, keyPerm);
2858
2859 // 2nd - 5th rounds
2860 __ vcipher (vRet, vRet, vKey1);
2861 __ vcipher (vRet, vRet, vKey2);
2862 __ vcipher (vRet, vRet, vKey3);
2863 __ vcipher (vRet, vRet, vKey4);
2864
2865 // load the 6th round key to vKey1
2866 __ li (keypos, 96);
2867 __ lvx (vKey2, keypos, key);
2868 __ vec_perm (vKey1, vTmp1, vKey2, keyPerm);
2869
2870 // load the 7th round key to vKey2
2871 __ li (keypos, 112);
2872 __ lvx (vKey3, keypos, key);
2873 __ vec_perm (vKey2, vKey3, keyPerm);
2874
2875 // load the 8th round key to vKey3
2876 __ li (keypos, 128);
2877 __ lvx (vKey4, keypos, key);
2878 __ vec_perm (vKey3, vKey4, keyPerm);
2879
2880 // load the 9th round key to vKey4
2881 __ li (keypos, 144);
2882 __ lvx (vTmp1, keypos, key);
2883 __ vec_perm (vKey4, vTmp1, keyPerm);
2884
2885 // 6th - 9th rounds
2886 __ vcipher (vRet, vRet, vKey1);
2887 __ vcipher (vRet, vRet, vKey2);
2888 __ vcipher (vRet, vRet, vKey3);
2889 __ vcipher (vRet, vRet, vKey4);
2890
2891 // load the 10th round key to vKey1
2892 __ li (keypos, 160);
2893 __ lvx (vKey2, keypos, key);
2894 __ vec_perm (vKey1, vTmp1, vKey2, keyPerm);
2895
2896 // load the 11th round key to vKey2
2897 __ li (keypos, 176);
2898 __ lvx (vTmp1, keypos, key);
2899 __ vec_perm (vKey2, vTmp1, keyPerm);
2900
2901 // if all round keys are loaded, skip next 4 rounds
2902 __ cmpwi (CR0, keylen, 44);
2903 __ beq (CR0, L_doLast);
2904
2905 // 10th - 11th rounds
2906 __ vcipher (vRet, vRet, vKey1);
2907 __ vcipher (vRet, vRet, vKey2);
2908
2909 // load the 12th round key to vKey1
2910 __ li (keypos, 192);
2911 __ lvx (vKey2, keypos, key);
2912 __ vec_perm (vKey1, vTmp1, vKey2, keyPerm);
2913
2914 // load the 13th round key to vKey2
2915 __ li (keypos, 208);
2916 __ lvx (vTmp1, keypos, key);
2917 __ vec_perm (vKey2, vTmp1, keyPerm);
2918
2919 // if all round keys are loaded, skip next 2 rounds
2920 __ cmpwi (CR0, keylen, 52);
2921 __ beq (CR0, L_doLast);
2922
2923 #ifdef ASSERT
2924 __ cmpwi (CR0, keylen, 60);
2925 __ bne (CR0, L_error);
2926 #endif
2927
2928 // 12th - 13th rounds
2929 __ vcipher (vRet, vRet, vKey1);
2930 __ vcipher (vRet, vRet, vKey2);
2931
2932 // load the 14th round key to vKey1
2933 __ li (keypos, 224);
2934 __ lvx (vKey2, keypos, key);
2935 __ vec_perm (vKey1, vTmp1, vKey2, keyPerm);
2936
2937 // load the 15th round key to vKey2
2938 __ li (keypos, 240);
2939 __ lvx (vTmp1, keypos, key);
2940 __ vec_perm (vKey2, vTmp1, keyPerm);
2941
2942 __ bind(L_doLast);
2943
2944 // last two rounds
2945 __ vcipher (vRet, vRet, vKey1);
2946 __ vcipherlast (vRet, vRet, vKey2);
2947
2948 #ifdef VM_LITTLE_ENDIAN
2949 // toPerm = 0x0F0E0D0C0B0A09080706050403020100
2950 __ lvsl (toPerm, keypos); // keypos is a multiple of 16
2951 __ vxor (toPerm, toPerm, fSplt);
2952
2953 // Swap Bytes
2954 __ vperm (vRet, vRet, vRet, toPerm);
2955 #endif
2956
2957 // store result (unaligned)
2958 // Note: We can't use a read-modify-write sequence which touches additional Bytes.
2959 Register lo = temp, hi = fifteen; // Reuse
2960 __ vsldoi (vTmp1, vRet, vRet, 8);
2961 __ mfvrd (hi, vRet);
2962 __ mfvrd (lo, vTmp1);
2963 __ std (hi, 0 LITTLE_ENDIAN_ONLY(+ 8), to);
2964 __ std (lo, 0 BIG_ENDIAN_ONLY(+ 8), to);
2965
2966 __ blr();
2967
2968 #ifdef ASSERT
2969 __ bind(L_error);
2970 __ stop("aescrypt_encryptBlock: invalid key length");
2971 #endif
2972 return start;
2973 }
2974
2975 // Arguments for generated stub:
2976 // R3_ARG1 - source byte array address
2977 // R4_ARG2 - destination byte array address
2978 // R5_ARG3 - sessionKe (key) in little endian int array
2979 address generate_aescrypt_decryptBlock() {
2980 assert(UseAES, "need AES instructions and misaligned SSE support");
2981 StubId stub_id = StubId::stubgen_aescrypt_decryptBlock_id;
2982 StubCodeMark mark(this, stub_id);
2983
2984 address start = __ function_entry();
2985
2986 Label L_doLast, L_do44, L_do52, L_error;
2987
2988 Register from = R3_ARG1; // source array address
2989 Register to = R4_ARG2; // destination array address
2990 Register key = R5_ARG3; // round key array
2991
2992 Register keylen = R8;
2993 Register temp = R9;
2994 Register keypos = R10;
2995 Register fifteen = R12;
2996
2997 VectorRegister vRet = VR0;
2998
2999 VectorRegister vKey1 = VR1;
3000 VectorRegister vKey2 = VR2;
3001 VectorRegister vKey3 = VR3;
3002 VectorRegister vKey4 = VR4;
3003 VectorRegister vKey5 = VR5;
3004
3005 VectorRegister fromPerm = VR6;
3006 VectorRegister keyPerm = VR7;
3007 VectorRegister toPerm = VR8;
3008 VectorRegister fSplt = VR9;
3009
3010 VectorRegister vTmp1 = VR10;
3011 VectorRegister vTmp2 = VR11;
3012 VectorRegister vTmp3 = VR12;
3013 VectorRegister vTmp4 = VR13;
3014
3015 __ li (fifteen, 15);
3016
3017 // load unaligned from[0-15] to vRet
3018 __ lvx (vRet, from);
3019 __ lvx (vTmp1, fifteen, from);
3020 __ lvsl (fromPerm, from);
3021 #ifdef VM_LITTLE_ENDIAN
3022 __ vspltisb (fSplt, 0x0f);
3023 __ vxor (fromPerm, fromPerm, fSplt);
3024 #endif
3025 __ vperm (vRet, vRet, vTmp1, fromPerm); // align [and byte swap in LE]
3026
3027 // load keylen (44 or 52 or 60)
3028 __ lwz (keylen, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT), key);
3029
3030 // to load keys
3031 __ load_perm (keyPerm, key);
3032 #ifdef VM_LITTLE_ENDIAN
3033 __ vxor (vTmp2, vTmp2, vTmp2);
3034 __ vspltisb (vTmp2, -16);
3035 __ vrld (keyPerm, keyPerm, vTmp2);
3036 __ vrld (keyPerm, keyPerm, vTmp2);
3037 __ vsldoi (keyPerm, keyPerm, keyPerm, 8);
3038 #endif
3039
3040 __ cmpwi (CR0, keylen, 44);
3041 __ beq (CR0, L_do44);
3042
3043 __ cmpwi (CR0, keylen, 52);
3044 __ beq (CR0, L_do52);
3045
3046 #ifdef ASSERT
3047 __ cmpwi (CR0, keylen, 60);
3048 __ bne (CR0, L_error);
3049 #endif
3050
3051 // load the 15th round key to vKey1
3052 __ li (keypos, 240);
3053 __ lvx (vKey1, keypos, key);
3054 __ li (keypos, 224);
3055 __ lvx (vKey2, keypos, key);
3056 __ vec_perm (vKey1, vKey2, vKey1, keyPerm);
3057
3058 // load the 14th round key to vKey2
3059 __ li (keypos, 208);
3060 __ lvx (vKey3, keypos, key);
3061 __ vec_perm (vKey2, vKey3, vKey2, keyPerm);
3062
3063 // load the 13th round key to vKey3
3064 __ li (keypos, 192);
3065 __ lvx (vKey4, keypos, key);
3066 __ vec_perm (vKey3, vKey4, vKey3, keyPerm);
3067
3068 // load the 12th round key to vKey4
3069 __ li (keypos, 176);
3070 __ lvx (vKey5, keypos, key);
3071 __ vec_perm (vKey4, vKey5, vKey4, keyPerm);
3072
3073 // load the 11th round key to vKey5
3074 __ li (keypos, 160);
3075 __ lvx (vTmp1, keypos, key);
3076 __ vec_perm (vKey5, vTmp1, vKey5, keyPerm);
3077
3078 // 1st - 5th rounds
3079 __ vxor (vRet, vRet, vKey1);
3080 __ vncipher (vRet, vRet, vKey2);
3081 __ vncipher (vRet, vRet, vKey3);
3082 __ vncipher (vRet, vRet, vKey4);
3083 __ vncipher (vRet, vRet, vKey5);
3084
3085 __ b (L_doLast);
3086
3087 __ align(32);
3088 __ bind (L_do52);
3089
3090 // load the 13th round key to vKey1
3091 __ li (keypos, 208);
3092 __ lvx (vKey1, keypos, key);
3093 __ li (keypos, 192);
3094 __ lvx (vKey2, keypos, key);
3095 __ vec_perm (vKey1, vKey2, vKey1, keyPerm);
3096
3097 // load the 12th round key to vKey2
3098 __ li (keypos, 176);
3099 __ lvx (vKey3, keypos, key);
3100 __ vec_perm (vKey2, vKey3, vKey2, keyPerm);
3101
3102 // load the 11th round key to vKey3
3103 __ li (keypos, 160);
3104 __ lvx (vTmp1, keypos, key);
3105 __ vec_perm (vKey3, vTmp1, vKey3, keyPerm);
3106
3107 // 1st - 3rd rounds
3108 __ vxor (vRet, vRet, vKey1);
3109 __ vncipher (vRet, vRet, vKey2);
3110 __ vncipher (vRet, vRet, vKey3);
3111
3112 __ b (L_doLast);
3113
3114 __ align(32);
3115 __ bind (L_do44);
3116
3117 // load the 11th round key to vKey1
3118 __ li (keypos, 176);
3119 __ lvx (vKey1, keypos, key);
3120 __ li (keypos, 160);
3121 __ lvx (vTmp1, keypos, key);
3122 __ vec_perm (vKey1, vTmp1, vKey1, keyPerm);
3123
3124 // 1st round
3125 __ vxor (vRet, vRet, vKey1);
3126
3127 __ bind (L_doLast);
3128
3129 // load the 10th round key to vKey1
3130 __ li (keypos, 144);
3131 __ lvx (vKey2, keypos, key);
3132 __ vec_perm (vKey1, vKey2, vTmp1, keyPerm);
3133
3134 // load the 9th round key to vKey2
3135 __ li (keypos, 128);
3136 __ lvx (vKey3, keypos, key);
3137 __ vec_perm (vKey2, vKey3, vKey2, keyPerm);
3138
3139 // load the 8th round key to vKey3
3140 __ li (keypos, 112);
3141 __ lvx (vKey4, keypos, key);
3142 __ vec_perm (vKey3, vKey4, vKey3, keyPerm);
3143
3144 // load the 7th round key to vKey4
3145 __ li (keypos, 96);
3146 __ lvx (vKey5, keypos, key);
3147 __ vec_perm (vKey4, vKey5, vKey4, keyPerm);
3148
3149 // load the 6th round key to vKey5
3150 __ li (keypos, 80);
3151 __ lvx (vTmp1, keypos, key);
3152 __ vec_perm (vKey5, vTmp1, vKey5, keyPerm);
3153
3154 // last 10th - 6th rounds
3155 __ vncipher (vRet, vRet, vKey1);
3156 __ vncipher (vRet, vRet, vKey2);
3157 __ vncipher (vRet, vRet, vKey3);
3158 __ vncipher (vRet, vRet, vKey4);
3159 __ vncipher (vRet, vRet, vKey5);
3160
3161 // load the 5th round key to vKey1
3162 __ li (keypos, 64);
3163 __ lvx (vKey2, keypos, key);
3164 __ vec_perm (vKey1, vKey2, vTmp1, keyPerm);
3165
3166 // load the 4th round key to vKey2
3167 __ li (keypos, 48);
3168 __ lvx (vKey3, keypos, key);
3169 __ vec_perm (vKey2, vKey3, vKey2, keyPerm);
3170
3171 // load the 3rd round key to vKey3
3172 __ li (keypos, 32);
3173 __ lvx (vKey4, keypos, key);
3174 __ vec_perm (vKey3, vKey4, vKey3, keyPerm);
3175
3176 // load the 2nd round key to vKey4
3177 __ li (keypos, 16);
3178 __ lvx (vKey5, keypos, key);
3179 __ vec_perm (vKey4, vKey5, vKey4, keyPerm);
3180
3181 // load the 1st round key to vKey5
3182 __ lvx (vTmp1, key);
3183 __ vec_perm (vKey5, vTmp1, vKey5, keyPerm);
3184
3185 // last 5th - 1th rounds
3186 __ vncipher (vRet, vRet, vKey1);
3187 __ vncipher (vRet, vRet, vKey2);
3188 __ vncipher (vRet, vRet, vKey3);
3189 __ vncipher (vRet, vRet, vKey4);
3190 __ vncipherlast (vRet, vRet, vKey5);
3191
3192 #ifdef VM_LITTLE_ENDIAN
3193 // toPerm = 0x0F0E0D0C0B0A09080706050403020100
3194 __ lvsl (toPerm, keypos); // keypos is a multiple of 16
3195 __ vxor (toPerm, toPerm, fSplt);
3196
3197 // Swap Bytes
3198 __ vperm (vRet, vRet, vRet, toPerm);
3199 #endif
3200
3201 // store result (unaligned)
3202 // Note: We can't use a read-modify-write sequence which touches additional Bytes.
3203 Register lo = temp, hi = fifteen; // Reuse
3204 __ vsldoi (vTmp1, vRet, vRet, 8);
3205 __ mfvrd (hi, vRet);
3206 __ mfvrd (lo, vTmp1);
3207 __ std (hi, 0 LITTLE_ENDIAN_ONLY(+ 8), to);
3208 __ std (lo, 0 BIG_ENDIAN_ONLY(+ 8), to);
3209
3210 __ blr();
3211
3212 #ifdef ASSERT
3213 __ bind(L_error);
3214 __ stop("aescrypt_decryptBlock: invalid key length");
3215 #endif
3216 return start;
3217 }
3218
3219 address generate_sha256_implCompress(StubId stub_id) {
3220 assert(UseSHA, "need SHA instructions");
3221 bool multi_block;
3222 switch (stub_id) {
3223 case StubId::stubgen_sha256_implCompress_id:
3224 multi_block = false;
3225 break;
3226 case StubId::stubgen_sha256_implCompressMB_id:
3227 multi_block = true;
3228 break;
3229 default:
3230 ShouldNotReachHere();
3231 }
3232 StubCodeMark mark(this, stub_id);
3233 address start = __ function_entry();
3234
3235 __ sha256 (multi_block);
3236 __ blr();
3237
3238 return start;
3239 }
3240
3241 address generate_sha512_implCompress(StubId stub_id) {
3242 assert(UseSHA, "need SHA instructions");
3243 bool multi_block;
3244 switch (stub_id) {
3245 case StubId::stubgen_sha512_implCompress_id:
3246 multi_block = false;
3247 break;
3248 case StubId::stubgen_sha512_implCompressMB_id:
3249 multi_block = true;
3250 break;
3251 default:
3252 ShouldNotReachHere();
3253 }
3254 StubCodeMark mark(this, stub_id);
3255 address start = __ function_entry();
3256
3257 __ sha512 (multi_block);
3258 __ blr();
3259
3260 return start;
3261 }
3262
3263 address generate_data_cache_writeback() {
3264 const Register cacheline = R3_ARG1;
3265 StubId stub_id = StubId::stubgen_data_cache_writeback_id;
3266 StubCodeMark mark(this, stub_id);
3267 address start = __ pc();
3268
3269 __ cache_wb(Address(cacheline));
3270 __ blr();
3271
3272 return start;
3273 }
3274
3275 address generate_data_cache_writeback_sync() {
3276 const Register is_presync = R3_ARG1;
3277 Register temp = R4;
3278 Label SKIP;
3279 StubId stub_id = StubId::stubgen_data_cache_writeback_sync_id;
3280 StubCodeMark mark(this, stub_id);
3281 address start = __ pc();
3282
3283 __ andi_(temp, is_presync, 1);
3284 __ bne(CR0, SKIP);
3285 __ cache_wbsync(false); // post sync => emit 'sync'
3286 __ bind(SKIP); // pre sync => emit nothing
3287 __ blr();
3288
3289 return start;
3290 }
3291
3292 void generate_arraycopy_stubs() {
3293 // generate the common exit first so later stubs can rely on it if
3294 // they want an UnsafeMemoryAccess exit non-local to the stub
3295 StubRoutines::_unsafecopy_common_exit = generate_unsafecopy_common_error_exit();
3296 // register the stub as the default exit with class UnsafeMemoryAccess
3297 UnsafeMemoryAccess::set_common_exit_stub_pc(StubRoutines::_unsafecopy_common_exit);
3298
3299 // Note: the disjoint stubs must be generated first, some of the
3300 // conjoint stubs use them.
3301
3302 // Note: chaining of stubs does not rely on branching to an
3303 // auxiliary post-push entry because none of the stubs
3304 // push/pop a frame.
3305
3306 // non-aligned disjoint versions
3307 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(StubId::stubgen_jbyte_disjoint_arraycopy_id);
3308 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(StubId::stubgen_jshort_disjoint_arraycopy_id);
3309 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(StubId::stubgen_jint_disjoint_arraycopy_id);
3310 StubRoutines::_jlong_disjoint_arraycopy = generate_disjoint_long_copy(StubId::stubgen_jlong_disjoint_arraycopy_id);
3311 StubRoutines::_oop_disjoint_arraycopy = generate_disjoint_oop_copy(StubId::stubgen_oop_disjoint_arraycopy_id);
3312 StubRoutines::_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy(StubId::stubgen_oop_disjoint_arraycopy_uninit_id);
3313
3314 // aligned disjoint versions
3315 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id);
3316 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id);
3317 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(StubId::stubgen_arrayof_jint_disjoint_arraycopy_id);
3318 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id);
3319 StubRoutines::_arrayof_oop_disjoint_arraycopy = generate_disjoint_oop_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_id);
3320 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy(StubId::stubgen_oop_disjoint_arraycopy_uninit_id);
3321
3322 // non-aligned conjoint versions
3323 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(StubId::stubgen_jbyte_arraycopy_id);
3324 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(StubId::stubgen_jshort_arraycopy_id);
3325 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(StubId::stubgen_jint_arraycopy_id);
3326 StubRoutines::_jlong_arraycopy = generate_conjoint_long_copy(StubId::stubgen_jlong_arraycopy_id);
3327 StubRoutines::_oop_arraycopy = generate_conjoint_oop_copy(StubId::stubgen_oop_arraycopy_id);
3328 StubRoutines::_oop_arraycopy_uninit = generate_conjoint_oop_copy(StubId::stubgen_oop_arraycopy_uninit_id);
3329
3330 // aligned conjoint versions
3331 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(StubId::stubgen_arrayof_jbyte_arraycopy_id);
3332 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(StubId::stubgen_arrayof_jshort_arraycopy_id);
3333 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(StubId::stubgen_arrayof_jint_arraycopy_id);
3334 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(StubId::stubgen_arrayof_jlong_arraycopy_id);
3335 StubRoutines::_arrayof_oop_arraycopy = generate_conjoint_oop_copy(StubId::stubgen_arrayof_oop_arraycopy_id);
3336 StubRoutines::_arrayof_oop_arraycopy_uninit = generate_conjoint_oop_copy(StubId::stubgen_arrayof_oop_arraycopy_id);
3337
3338 // special/generic versions
3339 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_id);
3340 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_uninit_id);
3341
3342 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy(STUB_ENTRY(jbyte_arraycopy()),
3343 STUB_ENTRY(jshort_arraycopy()),
3344 STUB_ENTRY(jint_arraycopy()),
3345 STUB_ENTRY(jlong_arraycopy()));
3346 StubRoutines::_generic_arraycopy = generate_generic_copy(STUB_ENTRY(jbyte_arraycopy()),
3347 STUB_ENTRY(jshort_arraycopy()),
3348 STUB_ENTRY(jint_arraycopy()),
3349 STUB_ENTRY(oop_arraycopy()),
3350 STUB_ENTRY(oop_disjoint_arraycopy()),
3351 STUB_ENTRY(jlong_arraycopy()),
3352 STUB_ENTRY(checkcast_arraycopy()));
3353
3354 // fill routines
3355 #ifdef COMPILER2
3356 if (OptimizeFill) {
3357 StubRoutines::_jbyte_fill = generate_fill(StubId::stubgen_jbyte_fill_id);
3358 StubRoutines::_jshort_fill = generate_fill(StubId::stubgen_jshort_fill_id);
3359 StubRoutines::_jint_fill = generate_fill(StubId::stubgen_jint_fill_id);
3360 StubRoutines::_arrayof_jbyte_fill = generate_fill(StubId::stubgen_arrayof_jbyte_fill_id);
3361 StubRoutines::_arrayof_jshort_fill = generate_fill(StubId::stubgen_arrayof_jshort_fill_id);
3362 StubRoutines::_arrayof_jint_fill = generate_fill(StubId::stubgen_arrayof_jint_fill_id);
3363 }
3364 StubRoutines::_unsafe_setmemory = generate_unsafe_setmemory(StubRoutines::_jbyte_fill);
3365 #endif
3366 }
3367
3368 // Stub for BigInteger::multiplyToLen()
3369 //
3370 // Arguments:
3371 //
3372 // Input:
3373 // R3 - x address
3374 // R4 - x length
3375 // R5 - y address
3376 // R6 - y length
3377 // R7 - z address
3378 //
3379 address generate_multiplyToLen() {
3380
3381 StubId stub_id = StubId::stubgen_multiplyToLen_id;
3382 StubCodeMark mark(this, stub_id);
3383
3384 address start = __ function_entry();
3385
3386 const Register x = R3;
3387 const Register xlen = R4;
3388 const Register y = R5;
3389 const Register ylen = R6;
3390 const Register z = R7;
3391
3392 const Register tmp1 = R2; // TOC not used.
3393 const Register tmp2 = R9;
3394 const Register tmp3 = R10;
3395 const Register tmp4 = R11;
3396 const Register tmp5 = R12;
3397
3398 // non-volatile regs
3399 const Register tmp6 = R31;
3400 const Register tmp7 = R30;
3401 const Register tmp8 = R29;
3402 const Register tmp9 = R28;
3403 const Register tmp10 = R27;
3404 const Register tmp11 = R26;
3405 const Register tmp12 = R25;
3406 const Register tmp13 = R24;
3407
3408 BLOCK_COMMENT("Entry:");
3409
3410 // C2 does not respect int to long conversion for stub calls.
3411 __ clrldi(xlen, xlen, 32);
3412 __ clrldi(ylen, ylen, 32);
3413
3414 // Save non-volatile regs (frameless).
3415 int current_offs = 8;
3416 __ std(R24, -current_offs, R1_SP); current_offs += 8;
3417 __ std(R25, -current_offs, R1_SP); current_offs += 8;
3418 __ std(R26, -current_offs, R1_SP); current_offs += 8;
3419 __ std(R27, -current_offs, R1_SP); current_offs += 8;
3420 __ std(R28, -current_offs, R1_SP); current_offs += 8;
3421 __ std(R29, -current_offs, R1_SP); current_offs += 8;
3422 __ std(R30, -current_offs, R1_SP); current_offs += 8;
3423 __ std(R31, -current_offs, R1_SP);
3424
3425 __ multiply_to_len(x, xlen, y, ylen, z, tmp1, tmp2, tmp3, tmp4, tmp5,
3426 tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13);
3427
3428 // Restore non-volatile regs.
3429 current_offs = 8;
3430 __ ld(R24, -current_offs, R1_SP); current_offs += 8;
3431 __ ld(R25, -current_offs, R1_SP); current_offs += 8;
3432 __ ld(R26, -current_offs, R1_SP); current_offs += 8;
3433 __ ld(R27, -current_offs, R1_SP); current_offs += 8;
3434 __ ld(R28, -current_offs, R1_SP); current_offs += 8;
3435 __ ld(R29, -current_offs, R1_SP); current_offs += 8;
3436 __ ld(R30, -current_offs, R1_SP); current_offs += 8;
3437 __ ld(R31, -current_offs, R1_SP);
3438
3439 __ blr(); // Return to caller.
3440
3441 return start;
3442 }
3443
3444 /**
3445 * Arguments:
3446 *
3447 * Input:
3448 * R3_ARG1 - out address
3449 * R4_ARG2 - in address
3450 * R5_ARG3 - offset
3451 * R6_ARG4 - len
3452 * R7_ARG5 - k
3453 * Output:
3454 * R3_RET - carry
3455 */
3456 address generate_mulAdd() {
3457 __ align(CodeEntryAlignment);
3458 StubId stub_id = StubId::stubgen_mulAdd_id;
3459 StubCodeMark mark(this, stub_id);
3460
3461 address start = __ function_entry();
3462
3463 // C2 does not sign extend signed parameters to full 64 bits registers:
3464 __ rldic (R5_ARG3, R5_ARG3, 2, 32); // always positive
3465 __ clrldi(R6_ARG4, R6_ARG4, 32); // force zero bits on higher word
3466 __ clrldi(R7_ARG5, R7_ARG5, 32); // force zero bits on higher word
3467
3468 __ muladd(R3_ARG1, R4_ARG2, R5_ARG3, R6_ARG4, R7_ARG5, R8, R9, R10);
3469
3470 // Moves output carry to return register
3471 __ mr (R3_RET, R10);
3472
3473 __ blr();
3474
3475 return start;
3476 }
3477
3478 /**
3479 * Arguments:
3480 *
3481 * Input:
3482 * R3_ARG1 - in address
3483 * R4_ARG2 - in length
3484 * R5_ARG3 - out address
3485 * R6_ARG4 - out length
3486 */
3487 address generate_squareToLen() {
3488 __ align(CodeEntryAlignment);
3489 StubId stub_id = StubId::stubgen_squareToLen_id;
3490 StubCodeMark mark(this, stub_id);
3491
3492 address start = __ function_entry();
3493
3494 // args - higher word is cleaned (unsignedly) due to int to long casting
3495 const Register in = R3_ARG1;
3496 const Register in_len = R4_ARG2;
3497 __ clrldi(in_len, in_len, 32);
3498 const Register out = R5_ARG3;
3499 const Register out_len = R6_ARG4;
3500 __ clrldi(out_len, out_len, 32);
3501
3502 // output
3503 const Register ret = R3_RET;
3504
3505 // temporaries
3506 const Register lplw_s = R7;
3507 const Register in_aux = R8;
3508 const Register out_aux = R9;
3509 const Register piece = R10;
3510 const Register product = R14;
3511 const Register lplw = R15;
3512 const Register i_minus1 = R16;
3513 const Register carry = R17;
3514 const Register offset = R18;
3515 const Register off_aux = R19;
3516 const Register t = R20;
3517 const Register mlen = R21;
3518 const Register len = R22;
3519 const Register a = R23;
3520 const Register b = R24;
3521 const Register i = R25;
3522 const Register c = R26;
3523 const Register cs = R27;
3524
3525 // Labels
3526 Label SKIP_LSHIFT, SKIP_DIAGONAL_SUM, SKIP_ADDONE, SKIP_LOOP_SQUARE;
3527 Label LOOP_LSHIFT, LOOP_DIAGONAL_SUM, LOOP_ADDONE, LOOP_SQUARE;
3528
3529 // Save non-volatile regs (frameless).
3530 int current_offs = -8;
3531 __ std(R28, current_offs, R1_SP); current_offs -= 8;
3532 __ std(R27, current_offs, R1_SP); current_offs -= 8;
3533 __ std(R26, current_offs, R1_SP); current_offs -= 8;
3534 __ std(R25, current_offs, R1_SP); current_offs -= 8;
3535 __ std(R24, current_offs, R1_SP); current_offs -= 8;
3536 __ std(R23, current_offs, R1_SP); current_offs -= 8;
3537 __ std(R22, current_offs, R1_SP); current_offs -= 8;
3538 __ std(R21, current_offs, R1_SP); current_offs -= 8;
3539 __ std(R20, current_offs, R1_SP); current_offs -= 8;
3540 __ std(R19, current_offs, R1_SP); current_offs -= 8;
3541 __ std(R18, current_offs, R1_SP); current_offs -= 8;
3542 __ std(R17, current_offs, R1_SP); current_offs -= 8;
3543 __ std(R16, current_offs, R1_SP); current_offs -= 8;
3544 __ std(R15, current_offs, R1_SP); current_offs -= 8;
3545 __ std(R14, current_offs, R1_SP);
3546
3547 // Store the squares, right shifted one bit (i.e., divided by 2)
3548 __ subi (out_aux, out, 8);
3549 __ subi (in_aux, in, 4);
3550 __ cmpwi (CR0, in_len, 0);
3551 // Initialize lplw outside of the loop
3552 __ xorr (lplw, lplw, lplw);
3553 __ ble (CR0, SKIP_LOOP_SQUARE); // in_len <= 0
3554 __ mtctr (in_len);
3555
3556 __ bind(LOOP_SQUARE);
3557 __ lwzu (piece, 4, in_aux);
3558 __ mulld (product, piece, piece);
3559 // shift left 63 bits and only keep the MSB
3560 __ rldic (lplw_s, lplw, 63, 0);
3561 __ mr (lplw, product);
3562 // shift right 1 bit without sign extension
3563 __ srdi (product, product, 1);
3564 // join them to the same register and store it
3565 __ orr (product, lplw_s, product);
3566 #ifdef VM_LITTLE_ENDIAN
3567 // Swap low and high words for little endian
3568 __ rldicl (product, product, 32, 0);
3569 #endif
3570 __ stdu (product, 8, out_aux);
3571 __ bdnz (LOOP_SQUARE);
3572
3573 __ bind(SKIP_LOOP_SQUARE);
3574
3575 // Add in off-diagonal sums
3576 __ cmpwi (CR0, in_len, 0);
3577 __ ble (CR0, SKIP_DIAGONAL_SUM);
3578 // Avoid CTR usage here in order to use it at mulAdd
3579 __ subi (i_minus1, in_len, 1);
3580 __ li (offset, 4);
3581
3582 __ bind(LOOP_DIAGONAL_SUM);
3583
3584 __ sldi (off_aux, out_len, 2);
3585 __ sub (off_aux, off_aux, offset);
3586
3587 __ mr (len, i_minus1);
3588 __ sldi (mlen, i_minus1, 2);
3589 __ lwzx (t, in, mlen);
3590
3591 __ muladd (out, in, off_aux, len, t, a, b, carry);
3592
3593 // begin<addOne>
3594 // off_aux = out_len*4 - 4 - mlen - offset*4 - 4;
3595 __ addi (mlen, mlen, 4);
3596 __ sldi (a, out_len, 2);
3597 __ subi (a, a, 4);
3598 __ sub (a, a, mlen);
3599 __ subi (off_aux, offset, 4);
3600 __ sub (off_aux, a, off_aux);
3601
3602 __ lwzx (b, off_aux, out);
3603 __ add (b, b, carry);
3604 __ stwx (b, off_aux, out);
3605
3606 // if (((uint64_t)s >> 32) != 0) {
3607 __ srdi_ (a, b, 32);
3608 __ beq (CR0, SKIP_ADDONE);
3609
3610 // while (--mlen >= 0) {
3611 __ bind(LOOP_ADDONE);
3612 __ subi (mlen, mlen, 4);
3613 __ cmpwi (CR0, mlen, 0);
3614 __ beq (CR0, SKIP_ADDONE);
3615
3616 // if (--offset_aux < 0) { // Carry out of number
3617 __ subi (off_aux, off_aux, 4);
3618 __ cmpwi (CR0, off_aux, 0);
3619 __ blt (CR0, SKIP_ADDONE);
3620
3621 // } else {
3622 __ lwzx (b, off_aux, out);
3623 __ addi (b, b, 1);
3624 __ stwx (b, off_aux, out);
3625 __ cmpwi (CR0, b, 0);
3626 __ bne (CR0, SKIP_ADDONE);
3627 __ b (LOOP_ADDONE);
3628
3629 __ bind(SKIP_ADDONE);
3630 // } } } end<addOne>
3631
3632 __ addi (offset, offset, 8);
3633 __ subi (i_minus1, i_minus1, 1);
3634 __ cmpwi (CR0, i_minus1, 0);
3635 __ bge (CR0, LOOP_DIAGONAL_SUM);
3636
3637 __ bind(SKIP_DIAGONAL_SUM);
3638
3639 // Shift back up and set low bit
3640 // Shifts 1 bit left up to len positions. Assumes no leading zeros
3641 // begin<primitiveLeftShift>
3642 __ cmpwi (CR0, out_len, 0);
3643 __ ble (CR0, SKIP_LSHIFT);
3644 __ li (i, 0);
3645 __ lwz (c, 0, out);
3646 __ subi (b, out_len, 1);
3647 __ mtctr (b);
3648
3649 __ bind(LOOP_LSHIFT);
3650 __ mr (b, c);
3651 __ addi (cs, i, 4);
3652 __ lwzx (c, out, cs);
3653
3654 __ sldi (b, b, 1);
3655 __ srwi (cs, c, 31);
3656 __ orr (b, b, cs);
3657 __ stwx (b, i, out);
3658
3659 __ addi (i, i, 4);
3660 __ bdnz (LOOP_LSHIFT);
3661
3662 __ sldi (c, out_len, 2);
3663 __ subi (c, c, 4);
3664 __ lwzx (b, out, c);
3665 __ sldi (b, b, 1);
3666 __ stwx (b, out, c);
3667
3668 __ bind(SKIP_LSHIFT);
3669 // end<primitiveLeftShift>
3670
3671 // Set low bit
3672 __ sldi (i, in_len, 2);
3673 __ subi (i, i, 4);
3674 __ lwzx (i, in, i);
3675 __ sldi (c, out_len, 2);
3676 __ subi (c, c, 4);
3677 __ lwzx (b, out, c);
3678
3679 __ andi (i, i, 1);
3680 __ orr (i, b, i);
3681
3682 __ stwx (i, out, c);
3683
3684 // Restore non-volatile regs.
3685 current_offs = -8;
3686 __ ld(R28, current_offs, R1_SP); current_offs -= 8;
3687 __ ld(R27, current_offs, R1_SP); current_offs -= 8;
3688 __ ld(R26, current_offs, R1_SP); current_offs -= 8;
3689 __ ld(R25, current_offs, R1_SP); current_offs -= 8;
3690 __ ld(R24, current_offs, R1_SP); current_offs -= 8;
3691 __ ld(R23, current_offs, R1_SP); current_offs -= 8;
3692 __ ld(R22, current_offs, R1_SP); current_offs -= 8;
3693 __ ld(R21, current_offs, R1_SP); current_offs -= 8;
3694 __ ld(R20, current_offs, R1_SP); current_offs -= 8;
3695 __ ld(R19, current_offs, R1_SP); current_offs -= 8;
3696 __ ld(R18, current_offs, R1_SP); current_offs -= 8;
3697 __ ld(R17, current_offs, R1_SP); current_offs -= 8;
3698 __ ld(R16, current_offs, R1_SP); current_offs -= 8;
3699 __ ld(R15, current_offs, R1_SP); current_offs -= 8;
3700 __ ld(R14, current_offs, R1_SP);
3701
3702 __ mr(ret, out);
3703 __ blr();
3704
3705 return start;
3706 }
3707
3708 /**
3709 * Arguments:
3710 *
3711 * Inputs:
3712 * R3_ARG1 - int crc
3713 * R4_ARG2 - byte* buf
3714 * R5_ARG3 - int length (of buffer)
3715 *
3716 * scratch:
3717 * R2, R6-R12
3718 *
3719 * Output:
3720 * R3_RET - int crc result
3721 */
3722 // Compute CRC32 function.
3723 address generate_CRC32_updateBytes(StubId stub_id) {
3724 bool is_crc32c;
3725 switch (stub_id) {
3726 case StubId::stubgen_updateBytesCRC32_id:
3727 is_crc32c = false;
3728 break;
3729 case StubId::stubgen_updateBytesCRC32C_id:
3730 is_crc32c = true;
3731 break;
3732 default:
3733 ShouldNotReachHere();
3734 }
3735 __ align(CodeEntryAlignment);
3736 StubCodeMark mark(this, stub_id);
3737 address start = __ function_entry(); // Remember stub start address (is rtn value).
3738 __ crc32(R3_ARG1, R4_ARG2, R5_ARG3, R2, R6, R7, R8, R9, R10, R11, R12, is_crc32c);
3739 __ blr();
3740 return start;
3741 }
3742
3743 address generate_floatToFloat16() {
3744 __ align(CodeEntryAlignment);
3745 StubCodeMark mark(this, "StubRoutines", "floatToFloat16");
3746 address start = __ function_entry();
3747 __ f2hf(R3_RET, F1_ARG1, F0);
3748 __ blr();
3749 return start;
3750 }
3751
3752 address generate_float16ToFloat() {
3753 __ align(CodeEntryAlignment);
3754 StubCodeMark mark(this, "StubRoutines", "float16ToFloat");
3755 address start = __ function_entry();
3756 __ hf2f(F1_RET, R3_ARG1);
3757 __ blr();
3758 return start;
3759 }
3760
3761 address generate_method_entry_barrier() {
3762 __ align(CodeEntryAlignment);
3763 StubId stub_id = StubId::stubgen_method_entry_barrier_id;
3764 StubCodeMark mark(this, stub_id);
3765
3766 address stub_address = __ pc();
3767
3768 int nbytes_save = MacroAssembler::num_volatile_regs * BytesPerWord;
3769 __ save_volatile_gprs(R1_SP, -nbytes_save, true);
3770
3771 // Link register points to instruction in prologue of the guarded nmethod.
3772 // As the stub requires one layer of indirection (argument is of type address* and not address),
3773 // passing the link register's value directly doesn't work.
3774 // Since we have to save the link register on the stack anyway, we calculate the corresponding stack address
3775 // and pass that one instead.
3776 __ addi(R3_ARG1, R1_SP, _abi0(lr));
3777
3778 __ save_LR(R0);
3779 __ push_frame_reg_args(nbytes_save, R0);
3780
3781 __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSetNMethod::nmethod_stub_entry_barrier));
3782 __ mr(R0, R3_RET);
3783
3784 __ pop_frame();
3785 __ restore_LR(R3_RET /* used as tmp register */);
3786 __ restore_volatile_gprs(R1_SP, -nbytes_save, true);
3787
3788 __ cmpdi(CR0, R0, 0);
3789
3790 // Return to prologue if no deoptimization is required (bnelr)
3791 __ bclr(Assembler::bcondCRbiIs1, Assembler::bi0(CR0, Assembler::equal), Assembler::bhintIsTaken);
3792
3793 // Deoptimization required.
3794 // For actually handling the deoptimization, the 'wrong method stub' is invoked.
3795 __ load_const_optimized(R0, SharedRuntime::get_handle_wrong_method_stub());
3796 __ mtctr(R0);
3797
3798 // Pop the frame built in the prologue.
3799 __ pop_frame();
3800
3801 // Restore link register. Required as the 'wrong method stub' needs the caller's frame
3802 // to properly deoptimize this method (e.g. by re-resolving the call site for compiled methods).
3803 // This method's prologue is aborted.
3804 __ restore_LR(R0);
3805
3806 __ bctr();
3807 return stub_address;
3808 }
3809
3810 #ifdef VM_LITTLE_ENDIAN
3811 // The following Base64 decode intrinsic is based on an algorithm outlined
3812 // in here:
3813 // http://0x80.pl/notesen/2016-01-17-sse-base64-decoding.html
3814 // in the section titled "Vector lookup (pshufb with bitmask)"
3815 //
3816 // This implementation differs in the following ways:
// * Power AltiVec VMX and VSX instructions are used instead of Intel SSE
//   instructions. It turns out that some of the vector operations
//   needed in the algorithm require fewer AltiVec instructions.
3820 // * The algorithm in the above mentioned paper doesn't handle the
3821 // Base64-URL variant in RFC 4648. Adjustments to both the code and to two
3822 // lookup tables are needed for this.
3823 // * The "Pack" section of the code is a complete rewrite for Power because we
3824 // can utilize better instructions for this step.
3825 //
3826
// Offsets per group of Base64 characters. Adding a group's offset to one of its
// characters (modulo 256) yields that character's 6-bit Base64 value,
// e.g. 'A' + UC == 0 and 'a' + LC == 26.
// Uppercase
#define UC  (signed char)((-'A' + 0) & 0xff)
// Lowercase
#define LC  (signed char)((-'a' + 26) & 0xff)
// Digits
#define DIG (signed char)((-'0' + 52) & 0xff)
// Plus sign (URL = 0)
#define PLS (signed char)((-'+' + 62) & 0xff)
// Hyphen (URL = 1)
#define HYP (signed char)((-'-' + 62) & 0xff)
// Slash (URL = 0)
#define SLS (signed char)((-'/' + 63) & 0xff)
// Underscore (URL = 1)
#define US  (signed char)((-'_' + 63) & 0xff)

// For P10 (or later) only
// VB64(x) sets the VALID_B64 bit on x. The argument is parenthesized so that an
// expression containing an operator of lower precedence than '|' expands correctly.
#define VALID_B64 0x80
#define VB64(x) (VALID_B64 | (x))

// Byte offset of member x inside the constant_block structure.
#define BLK_OFFSETOF(x) (offsetof(constant_block, x))

// In little-endian mode, the lxv instruction loads the element at EA into
// element 15 of the vector register, EA+1 goes into element 14, and so
// on.
//
// To make a look-up table easier to read, ARRAY_TO_LXV_ORDER reverses the
// order of the elements in a vector initialization.
#define ARRAY_TO_LXV_ORDER(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15) e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0
3856
3857 //
3858 // Base64 decodeBlock intrinsic
//
// Generates a stub that decodes Base64 characters in blocks of 16,
// producing 12 bytes of binary data per block, and returns the stub's
// entry address.
//
// Stub arguments (see the register assignments in the body):
//   R3_ARG1 s      - source starting address
//   R4_ARG2 sp     - source offset
//   R5_ARG3 sl     - source limit; sl - sp characters are available
//   R6_ARG4 d      - destination address
//   R7_ARG5 dp     - destination offset
//   R8_ARG6 isURL  - non-zero selects the RFC 4648 base64url alphabet
//   R9_ARG7 isMIME - not used by this stub
//
// Stub result (R3_RET): the number of bytes written to the destination
// by completed passes. This is zero if there is too little input for one
// pass. Decoding stops early, without storing the current block, as soon
// as a block contains a character that is not in the selected alphabet.
// The final 12 or more characters of the source are never processed here.
3859 address generate_base64_decodeBlock() {
3860 __ align(CodeEntryAlignment);
3861 StubId stub_id = StubId::stubgen_base64_decodeBlock_id;
3862 StubCodeMark mark(this, stub_id);
3863 address start = __ function_entry();
3864
// All lookup tables and shift/permute constants used by the stub, laid
// out as 16-byte rows so each one can be fetched with a single lxv at
// BLK_OFFSETOF(member).
3865 typedef struct {
3866 signed char offsetLUT_val[16];
3867 signed char offsetLUT_URL_val[16];
3868 unsigned char maskLUT_val[16];
3869 unsigned char maskLUT_URL_val[16];
3870 unsigned char bitposLUT_val[16];
3871 unsigned char table_32_47_val[16];
3872 unsigned char table_32_47_URL_val[16];
3873 unsigned char table_48_63_val[16];
3874 unsigned char table_64_79_val[16];
3875 unsigned char table_80_95_val[16];
3876 unsigned char table_80_95_URL_val[16];
3877 unsigned char table_96_111_val[16];
3878 unsigned char table_112_127_val[16];
3879 unsigned char pack_lshift_val[16];
3880 unsigned char pack_rshift_val[16];
3881 unsigned char pack_permute_val[16];
3882 } constant_block;
3883
3884 alignas(16) static const constant_block const_block = {
3885
// Offset to add to a character, indexed by the character's upper nibble
// (pre-P10 path).
3886 .offsetLUT_val = {
3887 ARRAY_TO_LXV_ORDER(
3888 0, 0, PLS, DIG, UC, UC, LC, LC,
3889 0, 0, 0, 0, 0, 0, 0, 0 ) },
3890
3891 .offsetLUT_URL_val = {
3892 ARRAY_TO_LXV_ORDER(
3893 0, 0, HYP, DIG, UC, UC, LC, LC,
3894 0, 0, 0, 0, 0, 0, 0, 0 ) },
3895
// Validity bitmask indexed by the character's lower nibble; it is AND'ed
// with the bitposLUT entry selected by the upper nibble (pre-P10 path).
3896 .maskLUT_val = {
3897 ARRAY_TO_LXV_ORDER(
3898 /* 0 */ (unsigned char)0b10101000,
3899 /* 1 .. 9 */ (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000,
3900 (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000,
3901 (unsigned char)0b11111000,
3902 /* 10 */ (unsigned char)0b11110000,
3903 /* 11 */ (unsigned char)0b01010100,
3904 /* 12 .. 14 */ (unsigned char)0b01010000, (unsigned char)0b01010000, (unsigned char)0b01010000,
3905 /* 15 */ (unsigned char)0b01010100 ) },
3906
3907 .maskLUT_URL_val = {
3908 ARRAY_TO_LXV_ORDER(
3909 /* 0 */ (unsigned char)0b10101000,
3910 /* 1 .. 9 */ (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000,
3911 (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000,
3912 (unsigned char)0b11111000,
3913 /* 10 */ (unsigned char)0b11110000,
3914 /* 11 .. 12 */ (unsigned char)0b01010000, (unsigned char)0b01010000,
3915 /* 13 */ (unsigned char)0b01010100,
3916 /* 14 */ (unsigned char)0b01010000,
3917 /* 15 */ (unsigned char)0b01110000 ) },
3918
// Single-bit selector indexed by the character's upper nibble
// (pre-P10 path).
3919 .bitposLUT_val = {
3920 ARRAY_TO_LXV_ORDER(
3921 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, (unsigned char)0x80,
3922 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 ) },
3923
3924 // In the following table_*_val constants, a 0 value means the
3925 // character is not in the Base64 character set
3926 .table_32_47_val = {
3927 ARRAY_TO_LXV_ORDER (
3928 /* space .. '*' = 0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* '+' = 62 */ VB64(62), /* ',' .. '.' = 0 */ 0, 0, 0, /* '/' = 63 */ VB64(63) ) },
3929
3930 .table_32_47_URL_val = {
3931 ARRAY_TO_LXV_ORDER(
3932 /* space .. ',' = 0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* '-' = 62 */ VB64(62), /* '.' .. '/' */ 0, 0 ) },
3933
3934 .table_48_63_val = {
3935 ARRAY_TO_LXV_ORDER(
3936 /* '0' .. '9' = 52 .. 61 */ VB64(52), VB64(53), VB64(54), VB64(55), VB64(56), VB64(57), VB64(58), VB64(59), VB64(60), VB64(61),
3937 /* ':' .. '?' = 0 */ 0, 0, 0, 0, 0, 0 ) },
3938
3939 .table_64_79_val = {
3940 ARRAY_TO_LXV_ORDER(
3941 /* '@' = 0 */ 0, /* 'A' .. 'O' = 0 .. 14 */ VB64(0), VB64(1), VB64(2), VB64(3), VB64(4), VB64(5), VB64(6), VB64(7), VB64(8),
3942 VB64(9), VB64(10), VB64(11), VB64(12), VB64(13), VB64(14) ) },
3943
3944 .table_80_95_val = {
3945 ARRAY_TO_LXV_ORDER(/* 'P' .. 'Z' = 15 .. 25 */ VB64(15), VB64(16), VB64(17), VB64(18), VB64(19), VB64(20), VB64(21), VB64(22),
3946 VB64(23), VB64(24), VB64(25), /* '[' .. '_' = 0 */ 0, 0, 0, 0, 0 ) },
3947
3948 .table_80_95_URL_val = {
3949 ARRAY_TO_LXV_ORDER(/* 'P' .. 'Z' = 15 .. 25 */ VB64(15), VB64(16), VB64(17), VB64(18), VB64(19), VB64(20), VB64(21), VB64(22),
3950 VB64(23), VB64(24), VB64(25), /* '[' .. '^' = 0 */ 0, 0, 0, 0, /* '_' = 63 */ VB64(63) ) },
3951
3952 .table_96_111_val = {
3953 ARRAY_TO_LXV_ORDER(/* '`' = 0 */ 0, /* 'a' .. 'o' = 26 .. 40 */ VB64(26), VB64(27), VB64(28), VB64(29), VB64(30), VB64(31),
3954 VB64(32), VB64(33), VB64(34), VB64(35), VB64(36), VB64(37), VB64(38), VB64(39), VB64(40) ) },
3955
3956 .table_112_127_val = {
3957 ARRAY_TO_LXV_ORDER(/* 'p' .. 'z' = 41 .. 51 */ VB64(41), VB64(42), VB64(43), VB64(44), VB64(45), VB64(46), VB64(47), VB64(48),
3958 VB64(49), VB64(50), VB64(51), /* '{' .. DEL = 0 */ 0, 0, 0, 0, 0 ) },
3959
// Per-byte shift counts for the "Pack" step (see the tables in the loop
// body below).
3960 .pack_lshift_val = {
3961 ARRAY_TO_LXV_ORDER(
3962 0, 6, 4, 2, 0, 6, 4, 2, 0, 6, 4, 2, 0, 6, 4, 2 ) },
3963
3964 .pack_rshift_val = {
3965 ARRAY_TO_LXV_ORDER(
3966 0, 2, 4, 0, 0, 2, 4, 0, 0, 2, 4, 0, 0, 2, 4, 0 ) },
3967
3968 // The first 4 index values are "don't care" because
3969 // we only use the first 12 bytes of the vector,
3970 // which are decoded from 16 bytes of Base64 characters.
3971 .pack_permute_val = {
3972 ARRAY_TO_LXV_ORDER(
3973 0, 0, 0, 0,
3974 0, 1, 2,
3975 4, 5, 6,
3976 8, 9, 10,
3977 12, 13, 14 ) }
3978 };
3979
3980 const unsigned block_size = 16; // number of bytes to process in each pass through the loop (documentation only: not referenced below, the loop advances 'in' by a literal 16)
3981 const unsigned block_size_shift = 4;
3982
3983 // According to the ELF V2 ABI, registers r3-r12 are volatile and available for use without save/restore
3984 Register s = R3_ARG1; // source starting address of Base64 characters
3985 Register sp = R4_ARG2; // source offset
3986 Register sl = R5_ARG3; // source limit; the code below uses sl - sp as the number of Base64 characters available
3987 Register d = R6_ARG4; // destination address
3988 Register dp = R7_ARG5; // destination offset
3989 Register isURL = R8_ARG6; // boolean, if non-zero indicates use of RFC 4648 base64url encoding
3990 Register isMIME = R9_ARG7; // boolean, if non-zero indicates use of RFC 2045 MIME encoding - not used
3991
3992 // Local variables
3993 Register const_ptr = R9; // used for loading constants
3994 Register tmp_reg = R10; // used for speeding up load_constant_optimized()
3995
3996 // Re-use R9 and R10 to avoid using non-volatile registers (requires save/restore)
3997 Register out = R9; // moving out (destination) pointer
3998 Register in = R10; // moving in (source) pointer
3999
4000 // Volatile VSRS are 0..13, 32..51 (VR0..VR13)
// NOTE(review): VSR32..51 presumably alias VR0..VR19 rather than
// VR0..VR13 - confirm against the ELF V2 ABI. Only VR0..VR10 are used here.
4001 // VR Constants
4002 VectorRegister vec_0s = VR0;
4003 VectorRegister vec_4s = VR1;
4004 VectorRegister vec_8s = VR2;
4005 VectorRegister vec_special_case_char = VR3;
4006 VectorRegister pack_rshift = VR4;
4007 VectorRegister pack_lshift = VR5;
4008
4009 // VSR Constants
4010 VectorSRegister offsetLUT = VSR0;
4011 VectorSRegister maskLUT = VSR1;
4012 VectorSRegister bitposLUT = VSR2;
4013 VectorSRegister vec_0xfs = VSR3;
4014 VectorSRegister vec_special_case_offset = VSR4;
4015 VectorSRegister pack_permute = VSR5;
4016
4017 // P10 (or later) VSR lookup constants
// These deliberately share VSR0..VSR4 with the pre-P10 constants above;
// only one of the two sets is loaded, depending on PowerArchitecturePPC64.
4018 VectorSRegister table_32_47 = VSR0;
4019 VectorSRegister table_48_63 = VSR1;
4020 VectorSRegister table_64_79 = VSR2;
4021 VectorSRegister table_80_95 = VSR3;
4022 VectorSRegister table_96_111 = VSR4;
4023 VectorSRegister table_112_127 = VSR6;
4024
4025 // Data read in and later converted
4026 VectorRegister input = VR6;
4027 // Variable for testing Base64 validity
4028 VectorRegister non_match = VR10;
4029
4030 // P9 VR Variables for lookup
4031 VectorRegister higher_nibble = VR7;
4032 VectorRegister eq_special_case_char = VR8;
4033 VectorRegister offsets = VR9;
4034
4035 // P9 VSR lookup variables
4036 VectorSRegister bit = VSR6;
4037 VectorSRegister lower_nibble = VSR7;
4038 VectorSRegister M = VSR8;
4039
4040 // P10 (or later) VSR lookup variables
4041 VectorSRegister xlate_a = VSR7;
4042 VectorSRegister xlate_b = VSR8;
4043
4044 // Variables for pack
4045 // VR
4046 VectorRegister l = VR7; // reuse higher_nibble's register
4047 VectorRegister r = VR8; // reuse eq_special_case_char's register
4048 VectorRegister gathered = VR10; // reuse non_match's register
4049
4050 Label not_URL, calculate_size, loop_start, loop_exit, return_zero;
4051
4052 // The upper 32 bits of the non-pointer parameter registers are not
4053 // guaranteed to be zero, so mask off those upper bits.
4054 __ clrldi(sp, sp, 32);
4055 __ clrldi(sl, sl, 32);
4056
4057 // Don't handle the last 4 characters of the source, because this
4058 // VSX-based algorithm doesn't handle padding characters. Also the
4059 // vector code will always write 16 bytes of decoded data on each pass,
4060 // but only the first 12 of those 16 bytes are valid data (16 base64
4061 // characters become 12 bytes of binary data), so for this reason we
4062 // need to subtract an additional 8 bytes from the source length, in
4063 // order not to write past the end of the destination buffer. The
4064 // result of this subtraction implies that a Java function in the
4065 // Base64 class will be used to process the last 12 characters.
4066 __ sub(sl, sl, sp);
4067 __ subi(sl, sl, 12);
4068
4069 // Load CTR with the number of passes through the loop
4070 // = sl >> block_size_shift. After the shift, if sl <= 0, there's too
4071 // little data to be processed by this intrinsic.
4072 __ srawi_(sl, sl, block_size_shift);
4073 __ ble(CR0, return_zero);
4074 __ mtctr(sl);
4075
4076 // Clear the other two parameter registers upper 32 bits.
4077 __ clrldi(isURL, isURL, 32);
4078 __ clrldi(dp, dp, 32);
4079
4080 // Load constant vec registers that need to be loaded from memory
4081 __ load_const_optimized(const_ptr, (address)&const_block, tmp_reg);
// NOTE(review): bitposLUT shares VSR2 with table_64_79, which the P10
// branch below reloads, and the pre-P10 branch loads bitposLUT again, so
// this first load looks redundant - confirm before removing.
4082 __ lxv(bitposLUT, BLK_OFFSETOF(bitposLUT_val), const_ptr);
4083 __ lxv(pack_rshift->to_vsr(), BLK_OFFSETOF(pack_rshift_val), const_ptr);
4084 __ lxv(pack_lshift->to_vsr(), BLK_OFFSETOF(pack_lshift_val), const_ptr);
4085 __ lxv(pack_permute, BLK_OFFSETOF(pack_permute_val), const_ptr);
4086
4087 // Splat the constants that can use xxspltib
4088 __ xxspltib(vec_0s->to_vsr(), 0);
4089 __ xxspltib(vec_8s->to_vsr(), 8);
4090 if (PowerArchitecturePPC64 >= 10) {
4091 // Using VALID_B64 for the offsets effectively strips the upper bit
4092 // of each byte that was selected from the table. Setting the upper
4093 // bit gives us a way to distinguish between the 6-bit value of 0
4094 // from an error code of 0, which will happen if the character is
4095 // outside the range of the lookup, or is an illegal Base64
4096 // character, such as %.
4097 __ xxspltib(offsets->to_vsr(), VALID_B64);
4098
4099 __ lxv(table_48_63, BLK_OFFSETOF(table_48_63_val), const_ptr);
4100 __ lxv(table_64_79, BLK_OFFSETOF(table_64_79_val), const_ptr);
4101 __ lxv(table_80_95, BLK_OFFSETOF(table_80_95_val), const_ptr);
4102 __ lxv(table_96_111, BLK_OFFSETOF(table_96_111_val), const_ptr);
4103 __ lxv(table_112_127, BLK_OFFSETOF(table_112_127_val), const_ptr);
4104 } else {
4105 __ xxspltib(vec_4s->to_vsr(), 4);
4106 __ xxspltib(vec_0xfs, 0xf);
4107 __ lxv(bitposLUT, BLK_OFFSETOF(bitposLUT_val), const_ptr);
4108 }
4109
4110 // The rest of the constants use different values depending on the
4111 // setting of isURL
4112 __ cmpwi(CR0, isURL, 0);
4113 __ beq(CR0, not_URL);
4114
4115 // isURL != 0 (true)
4116 if (PowerArchitecturePPC64 >= 10) {
4117 __ lxv(table_32_47, BLK_OFFSETOF(table_32_47_URL_val), const_ptr);
// Replaces the non-URL table_80_95 row loaded unconditionally above.
4118 __ lxv(table_80_95, BLK_OFFSETOF(table_80_95_URL_val), const_ptr);
4119 } else {
4120 __ lxv(offsetLUT, BLK_OFFSETOF(offsetLUT_URL_val), const_ptr);
4121 __ lxv(maskLUT, BLK_OFFSETOF(maskLUT_URL_val), const_ptr);
4122 __ xxspltib(vec_special_case_char->to_vsr(), '_');
4123 __ xxspltib(vec_special_case_offset, (unsigned char)US);
4124 }
4125 __ b(calculate_size);
4126
4127 // isURL = 0 (false)
4128 __ bind(not_URL);
4129 if (PowerArchitecturePPC64 >= 10) {
4130 __ lxv(table_32_47, BLK_OFFSETOF(table_32_47_val), const_ptr);
4131 __ lxv(table_80_95, BLK_OFFSETOF(table_80_95_val), const_ptr);
4132 } else {
4133 __ lxv(offsetLUT, BLK_OFFSETOF(offsetLUT_val), const_ptr);
4134 __ lxv(maskLUT, BLK_OFFSETOF(maskLUT_val), const_ptr);
4135 __ xxspltib(vec_special_case_char->to_vsr(), '/');
4136 __ xxspltib(vec_special_case_offset, (unsigned char)SLS);
4137 }
4138
4139 __ bind(calculate_size);
4140
// From here on const_ptr (R9) and tmp_reg (R10) are dead; their
// registers are reused for the moving out/in pointers.
4141 // out starts at d + dp
4142 __ add(out, d, dp);
4143
4144 // in starts at s + sp
4145 __ add(in, s, sp);
4146
4147 __ align(32);
4148 __ bind(loop_start);
4149 __ lxv(input->to_vsr(), 0, in); // offset=0
4150
4151 //
4152 // Lookup
4153 //
4154 if (PowerArchitecturePPC64 >= 10) {
4155 // Use xxpermx to do a lookup of each Base64 character in the
4156 // input vector and translate it to a 6-bit value + 0x80.
4157 // Characters which are not valid Base64 characters will result
4158 // in a zero in the corresponding byte.
4159 //
4160 // Note that due to align(32) call above, the xxpermx instructions do
4161 // not require align_prefix() calls, since the final xxpermx
4162 // prefix+opcode is at byte 24.
4163 __ xxpermx(xlate_a, table_32_47, table_48_63, input->to_vsr(), 1); // offset=4
4164 __ xxpermx(xlate_b, table_64_79, table_80_95, input->to_vsr(), 2); // offset=12
4165 __ xxlor(xlate_b, xlate_a, xlate_b); // offset=20
4166 __ xxpermx(xlate_a, table_96_111, table_112_127, input->to_vsr(), 3); // offset=24
4167 __ xxlor(input->to_vsr(), xlate_a, xlate_b);
4168 // Check for non-Base64 characters by comparing each byte to zero.
4169 __ vcmpequb_(non_match, input, vec_0s);
4170 } else {
4171 // Isolate the upper 4 bits of each character by shifting it right 4 bits
4172 __ vsrb(higher_nibble, input, vec_4s);
4173 // Isolate the lower 4 bits by masking
4174 __ xxland(lower_nibble, input->to_vsr(), vec_0xfs);
4175
4176 // Get the offset (the value to subtract from the byte) by using
4177 // a lookup table indexed by the upper 4 bits of the character
4178 __ xxperm(offsets->to_vsr(), offsetLUT, higher_nibble->to_vsr());
4179
4180 // Find out which elements are the special case character (isURL ? '_' : '/')
4181 __ vcmpequb(eq_special_case_char, input, vec_special_case_char);
4182
4183 // For each character in the input which is a special case
4184 // character, replace its offset with one that is special for that
4185 // character.
4186 __ xxsel(offsets->to_vsr(), offsets->to_vsr(), vec_special_case_offset, eq_special_case_char->to_vsr());
4187
4188 // Use the lower_nibble to select a mask "M" from the lookup table.
4189 __ xxperm(M, maskLUT, lower_nibble);
4190
4191 // "bit" is used to isolate which of the bits in M is relevant.
4192 __ xxperm(bit, bitposLUT, higher_nibble->to_vsr());
4193
4194 // Each element of non_match correspond to one each of the 16 input
4195 // characters. Those elements that become 0x00 after the xxland
4196 // instruction are invalid Base64 characters.
4197 __ xxland(non_match->to_vsr(), M, bit);
4198
4199 // Compare each element to zero
4200 //
4201 __ vcmpequb_(non_match, non_match, vec_0s);
4202 }
4203 // vcmpequb_ sets the EQ bit of CR6 if no elements compare equal.
4204 // Any element comparing equal to zero means there is an error in
4205 // that element. Note that the comparison result register
4206 // non_match is not referenced again. Only CR6-EQ matters.
4207 __ bne_predict_not_taken(CR6, loop_exit);
4208
4209 // The Base64 characters had no errors, so add the offsets, which in
4210 // the case of Power10 is a constant vector of all 0x80's (see earlier
4211 // comment where the offsets register is loaded).
4212 __ vaddubm(input, input, offsets);
4213
4214 // Pack
4215 //
4216 // In the tables below, b0, b1, .. b15 are the bytes of decoded
4217 // binary data, the first line of each of the cells (except for
4218 // the constants) uses the bit-field nomenclature from the
4219 // above-linked paper, whereas the second line is more specific
4220 // about which exact bits are present, and is constructed using the
4221 // Power ISA 3.x document style, where:
4222 //
4223 // * The specifier after the colon depicts which bits are there.
4224 // * The bit numbering is big endian style (bit 0 is the most
4225 // significant).
4226 // * || is a concatenate operator.
4227 // * Strings of 0's are a field of zeros with the shown length, and
4228 // likewise for strings of 1's.
4229
4230 // Note that only e12..e15 are shown here because the shifting
4231 // and OR'ing pattern replicates for e8..e11, e4..7, and
4232 // e0..e3.
4233 //
4234 // +======================+=================+======================+======================+=============+
4235 // | Vector | e12 | e13 | e14 | e15 |
4236 // | Element | | | | |
4237 // +======================+=================+======================+======================+=============+
4238 // | after vaddubm | 00dddddd | 00cccccc | 00bbbbbb | 00aaaaaa |
4239 // | | 00||b2:2..7 | 00||b1:4..7||b2:0..1 | 00||b0:6..7||b1:0..3 | 00||b0:0..5 |
4240 // +----------------------+-----------------+----------------------+----------------------+-------------+
4241 // | pack_lshift | | << 6 | << 4 | << 2 |
4242 // +----------------------+-----------------+----------------------+----------------------+-------------+
4243 // | l after vslb | 00dddddd | cc000000 | bbbb0000 | aaaaaa00 |
4244 // | | 00||b2:2..7 | b2:0..1||000000 | b1:0..3||0000 | b0:0..5||00 |
4245 // +----------------------+-----------------+----------------------+----------------------+-------------+
4246 // | l after vslo | cc000000 | bbbb0000 | aaaaaa00 | 00000000 |
4247 // | | b2:0..1||000000 | b1:0..3||0000 | b0:0..5||00 | 00000000 |
4248 // +----------------------+-----------------+----------------------+----------------------+-------------+
4249 // | pack_rshift | | >> 2 | >> 4 | |
4250 // +----------------------+-----------------+----------------------+----------------------+-------------+
4251 // | r after vsrb | 00dddddd | 0000cccc | 000000bb | 00aaaaaa |
4252 // | | 00||b2:2..7 | 0000||b1:4..7 | 000000||b0:6..7 | 00||b0:0..5 |
4253 // +----------------------+-----------------+----------------------+----------------------+-------------+
4254 // | gathered after xxlor | ccdddddd | bbbbcccc | aaaaaabb | 00aaaaaa |
4255 // | | b2:0..7 | b1:0..7 | b0:0..7 | 00||b0:0..5 |
4256 // +======================+=================+======================+======================+=============+
4257 //
4258 // Note: there is a typo in the above-linked paper that shows the result of the gathering process is:
4259 // [ddddddcc|bbbbcccc|aaaaaabb]
4260 // but should be:
4261 // [ccdddddd|bbbbcccc|aaaaaabb]
4262 //
4263 __ vslb(l, input, pack_lshift);
4264 // vslo of vec_8s shifts the vector by one octet toward lower
4265 // element numbers, discarding element 0. This means it actually
4266 // shifts to the right (not left) according to the order of the
4267 // table above.
4268 __ vslo(l, l, vec_8s);
4269 __ vsrb(r, input, pack_rshift);
4270 __ xxlor(gathered->to_vsr(), l->to_vsr(), r->to_vsr());
4271
4272 // Final rearrangement of bytes into their correct positions.
4273 // +==============+======+======+======+======+=====+=====+====+====+====+====+=====+=====+=====+=====+=====+=====+
4274 // | Vector | e0 | e1 | e2 | e3 | e4 | e5 | e6 | e7 | e8 | e9 | e10 | e11 | e12 | e13 | e14 | e15 |
4275 // | Elements | | | | | | | | | | | | | | | | |
4276 // +==============+======+======+======+======+=====+=====+====+====+====+====+=====+=====+=====+=====+=====+=====+
4277 // | after xxlor | b11 | b10 | b9 | xx | b8 | b7 | b6 | xx | b5 | b4 | b3 | xx | b2 | b1 | b0 | xx |
4278 // +--------------+------+------+------+------+-----+-----+----+----+----+----+-----+-----+-----+-----+-----+-----+
4279 // | pack_permute | 0 | 0 | 0 | 0 | 0 | 1 | 2 | 4 | 5 | 6 | 8 | 9 | 10 | 12 | 13 | 14 |
4280 // +--------------+------+------+------+------+-----+-----+----+----+----+----+-----+-----+-----+-----+-----+-----+
4281 // | after xxperm | b11* | b11* | b11* | b11* | b11 | b10 | b9 | b8 | b7 | b6 | b5 | b4 | b3 | b2 | b1 | b0 |
4282 // +==============+======+======+======+======+=====+=====+====+====+====+====+=====+=====+=====+=====+=====+=====+
4283 // xx bytes are not used to form the final data
4284 // b0..b15 are the decoded and reassembled 8-bit bytes of data
4285 // b11 with asterisk is a "don't care", because these bytes will be
4286 // overwritten on the next iteration.
4287 __ xxperm(gathered->to_vsr(), gathered->to_vsr(), pack_permute);
4288
4289 // We cannot use a static displacement on the store, since it's a
4290 // multiple of 12, not 16. Note that this stxv instruction actually
4291 // writes 16 bytes, even though only the first 12 are valid data.
4292 __ stxv(gathered->to_vsr(), 0, out);
4293 __ addi(out, out, 12);
4294 __ addi(in, in, 16);
4295 __ bdnz(loop_start);
4296
// Reached both when CTR runs out and when an invalid character was
// detected. In the latter case the failing block has not been stored and
// 'out' has not been advanced for it.
4297 __ bind(loop_exit);
4298
4299 // Return the number of out bytes produced, which is (out - (d + dp)) == out - d - dp;
4300 __ sub(R3_RET, out, d);
4301 __ sub(R3_RET, R3_RET, dp);
4302
4303 __ blr();
4304
// Too little input for even one pass: report zero bytes decoded.
4305 __ bind(return_zero);
4306 __ li(R3_RET, 0);
4307 __ blr();
4308
4309 return start;
4310 }
4311
// The character-group offset macros are used by the decodeBlock stub
// above; undefine them so they do not leak into the rest of the file.
4312 #undef UC
4313 #undef LC
4314 #undef DIG
4315 #undef PLS
4316 #undef HYP
4317 #undef SLS
4318 #undef US
4319
4320 // This algorithm is based on the methods described in this paper:
4321 // http://0x80.pl/notesen/2016-01-12-sse-base64-encoding.html
4322 //
4323 // The details of this implementation vary from the paper due to the
4324 // difference in the ISA between SSE and AltiVec, especially in the
4325 // splitting bytes section where there is no need on Power to mask after
4326 // the shift because the shift is byte-wise rather than across an entire
4327 // 128-bit word.
4328 //
4329 // For the lookup part of the algorithm, different logic is used than
4330 // described in the paper because of the availability of vperm, which can
4331 // do a 64-byte table lookup in four instructions, while preserving the
4332 // branchless nature.
4333 //
4334 // Description of the ENCODE_CORE macro
4335 //
4336 // Expand first 12 x 8-bit data bytes into 16 x 6-bit bytes (upper 2
4337 // bits of each byte are zeros)
4338 //
4339 // (Note: e7..e0 are not shown because they follow the same pattern as
4340 // e8..e15)
4341 //
4342 // In the table below, b0, b1, .. b15 are the bytes of unencoded
4343 // binary data, the first line of each of the cells (except for
4344 // the constants) uses the bit-field nomenclature from the
4345 // above-linked paper, whereas the second line is more specific
4346 // about which exact bits are present, and is constructed using the
4347 // Power ISA 3.x document style, where:
4348 //
4349 // * The specifier after the colon depicts which bits are there.
4350 // * The bit numbering is big endian style (bit 0 is the most
4351 // significant).
4352 // * || is a concatenate operator.
4353 // * Strings of 0's are a field of zeros with the shown length, and
4354 // likewise for strings of 1's.
4355 //
4356 // +==========================+=============+======================+======================+=============+=============+======================+======================+=============+
4357 // | Vector | e8 | e9 | e10 | e11 | e12 | e13 | e14 | e15 |
4358 // | Element | | | | | | | | |
4359 // +==========================+=============+======================+======================+=============+=============+======================+======================+=============+
4360 // | after lxv | jjjjkkkk | iiiiiijj | gghhhhhh | ffffgggg | eeeeeeff | ccdddddd | bbbbcccc | aaaaaabb |
4361 // | | b7 | b6 | b5 | b4 | b3 | b2 | b1 | b0 |
4362 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
4363 // | xxperm indexes | 0 | 10 | 11 | 12 | 0 | 13 | 14 | 15 |
4364 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
4365 // | (1) after xxperm | | gghhhhhh | ffffgggg | eeeeeeff | | ccdddddd | bbbbcccc | aaaaaabb |
4366 // | | (b15) | b5 | b4 | b3 | (b15) | b2 | b1 | b0 |
4367 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
4368 // | rshift_amount | 0 | 6 | 4 | 2 | 0 | 6 | 4 | 2 |
4369 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
4370 // | after vsrb | | 000000gg | 0000ffff | 00eeeeee | | 000000cc | 0000bbbb | 00aaaaaa |
4371 // | | (b15) | 000000||b5:0..1 | 0000||b4:0..3 | 00||b3:0..5 | (b15) | 000000||b2:0..1 | 0000||b1:0..3 | 00||b0:0..5 |
4372 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
4373 // | rshift_mask | 00000000 | 000000||11 | 0000||1111 | 00||111111 | 00000000 | 000000||11 | 0000||1111 | 00||111111 |
4374 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
4375 // | rshift after vand | 00000000 | 000000gg | 0000ffff | 00eeeeee | 00000000 | 000000cc | 0000bbbb | 00aaaaaa |
4376 // | | 00000000 | 000000||b5:0..1 | 0000||b4:0..3 | 00||b3:0..5 | 00000000 | 000000||b2:0..1 | 0000||b1:0..3 | 00||b0:0..5 |
4377 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
4378 // | 1 octet lshift (1) | gghhhhhh | ffffgggg | eeeeeeff | | ccdddddd | bbbbcccc | aaaaaabb | 00000000 |
4379 // | | b5 | b4 | b3 | (b15) | b2 | b1 | b0 | 00000000 |
4380 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
4381 // | lshift_amount | 0 | 2 | 4 | 0 | 0 | 2 | 4 | 0 |
4382 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
4383 // | after vslb | gghhhhhh | ffgggg00 | eeff0000 | | ccdddddd | bbcccc00 | aabb0000 | 00000000 |
4384 // | | b5 | b4:2..7||00 | b3:4..7||0000 | (b15) | b2:0..7 | b1:2..7||00 | b0:4..7||0000 | 00000000 |
4385 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
4386 // | lshift_mask | 00||111111 | 00||1111||00 | 00||11||0000 | 00000000 | 00||111111 | 00||1111||00 | 00||11||0000 | 00000000 |
4387 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
4388 // | lshift after vand | 00hhhhhh | 00gggg00 | 00ff0000 | 00000000 | 00dddddd | 00cccc00 | 00bb0000 | 00000000 |
4389 // | | 00||b5:2..7 | 00||b4:4..7||00 | 00||b3:6..7||0000 | 00000000 | 00||b2:2..7 | 00||b1:4..7||00 | 00||b0:6..7||0000 | 00000000 |
4390 // +--------------------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
4391 // | after vor lshift, rshift | 00hhhhhh | 00gggggg | 00ffffff | 00eeeeee | 00dddddd | 00cccccc | 00bbbbbb | 00aaaaaa |
4392 // | | 00||b5:2..7 | 00||b4:4..7||b5:0..1 | 00||b3:6..7||b4:0..3 | 00||b3:0..5 | 00||b2:2..7 | 00||b1:4..7||b2:0..1 | 00||b0:6..7||b1:0..3 | 00||b0:0..5 |
4393 // +==========================+=============+======================+======================+=============+=============+======================+======================+=============+
4394 //
4395 // Expand the first 12 bytes into 16 bytes, leaving every 4th byte
4396 // blank for now.
4397 // __ xxperm(input->to_vsr(), input->to_vsr(), expand_permute);
4398 //
4399 // Generate two bit-shifted pieces - rshift and lshift - that will
4400 // later be OR'd together.
4401 //
4402 // First the right-shifted piece
4403 // __ vsrb(rshift, input, expand_rshift);
4404 // __ vand(rshift, rshift, expand_rshift_mask);
4405 //
4406 // Now the left-shifted piece, which is done by octet shifting
4407 // the input one byte to the left, then doing a variable shift,
4408 // followed by a mask operation.
4409 //
4410 // __ vslo(lshift, input, vec_8s);
4411 // __ vslb(lshift, lshift, expand_lshift);
4412 // __ vand(lshift, lshift, expand_lshift_mask);
4413 //
4414 // Combine the two pieces by OR'ing
4415 // __ vor(expanded, rshift, lshift);
4416 //
4417 // At this point, expanded is a vector containing a 6-bit value in each
4418 // byte. These values are used as indexes into a 64-byte lookup table that
4419 // is contained in four vector registers. The lookup operation is done
4420 // using vperm instructions with the same indexes for the lower 32 and
4421 // upper 32 bytes. To figure out which of the two looked-up bytes to use
4422 // at each location, all values in expanded are compared to 31. Using
4423 // vsel, values higher than 31 use the results from the upper 32 bytes of
4424 // the lookup operation, while values less than or equal to 31 use the
4425 // lower 32 bytes of the lookup operation.
4426 //
4427 // Note: it's tempting to use a xxpermx,xxpermx,vor sequence here on
4428 // Power10 (or later), but experiments doing so on Power10 yielded a slight
4429 // performance drop, perhaps due to the need for xxpermx instruction
4430 // prefixes.
4431
// ENCODE_CORE emits the instruction sequence that turns the first 12 data
// bytes held in 'input' into 16 Base64 characters, left in 'expanded'.
//
// The macro is purely textual, so the expanding scope must provide:
//   input, rshift, lshift, expanded, encoded_00_31, encoded_32_63, gt_31
//     - working vector registers (all except 'input' are overwritten)
//   expand_permute, expand_rshift, expand_rshift_mask, expand_lshift,
//   expand_lshift_mask, vec_8s, vec_31s
//     - preloaded constants for the expand step and the compare against 31
//   vec_base64_00_15, vec_base64_16_31, vec_base64_32_47, vec_base64_48_63
//     - the four 16-byte rows of the 64-entry character table
//
// The order of the statements is the order of the emitted instructions.
4432 #define ENCODE_CORE \
4433 __ xxperm(input->to_vsr(), input->to_vsr(), expand_permute); \
4434 __ vsrb(rshift, input, expand_rshift); \
4435 __ vand(rshift, rshift, expand_rshift_mask); \
4436 __ vslo(lshift, input, vec_8s); \
4437 __ vslb(lshift, lshift, expand_lshift); \
4438 __ vand(lshift, lshift, expand_lshift_mask); \
4439 __ vor(expanded, rshift, lshift); \
4440 __ vperm(encoded_00_31, vec_base64_00_15, vec_base64_16_31, expanded); \
4441 __ vperm(encoded_32_63, vec_base64_32_47, vec_base64_48_63, expanded); \
4442 __ vcmpgtub(gt_31, expanded, vec_31s); \
4443 __ vsel(expanded, encoded_00_31, encoded_32_63, gt_31);
4444
4445 // Intrinsic function prototype in Base64.java:
4446 // private void encodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL) {
4447
4448 address generate_base64_encodeBlock() {
// Generates the Base64 encodeBlock stub and returns its entry address (start).
// Flow of the emitted code: load the permute/shift/mask and alphabet constants,
// pick the last 16 alphabet characters according to isURL, run a counted loop
// that consumes 12 source bytes and stores 16 output bytes per iteration, then
// handle the tail with length-limited lxvl/stxvl and append '=' padding when
// the number of bytes written is not a multiple of 4.
4449 __ align(CodeEntryAlignment);
4450 StubId stub_id = StubId::stubgen_base64_encodeBlock_id;
4451 StubCodeMark mark(this, stub_id);
4452 address start = __ function_entry();
4453
// Layout of the constant pool read by the lxv instructions below. Every member
// is exactly 16 bytes, so with alignas(16) on the instance each member starts
// on a 16-byte boundary.
4454 typedef struct {
4455 unsigned char expand_permute_val[16];
4456 unsigned char expand_rshift_val[16];
4457 unsigned char expand_rshift_mask_val[16];
4458 unsigned char expand_lshift_val[16];
4459 unsigned char expand_lshift_mask_val[16];
4460 unsigned char base64_00_15_val[16];
4461 unsigned char base64_16_31_val[16];
4462 unsigned char base64_32_47_val[16];
4463 unsigned char base64_48_63_val[16];
4464 unsigned char base64_48_63_URL_val[16];
4465 } constant_block;
4466
4467 alignas(16) static const constant_block const_block = {
4468 .expand_permute_val = {
4469 ARRAY_TO_LXV_ORDER(
4470 0, 4, 5, 6,
4471 0, 7, 8, 9,
4472 0, 10, 11, 12,
4473 0, 13, 14, 15 ) },
4474
4475 .expand_rshift_val = {
4476 ARRAY_TO_LXV_ORDER(
4477 0, 6, 4, 2,
4478 0, 6, 4, 2,
4479 0, 6, 4, 2,
4480 0, 6, 4, 2 ) },
4481
4482 .expand_rshift_mask_val = {
4483 ARRAY_TO_LXV_ORDER(
4484 0b00000000, 0b00000011, 0b00001111, 0b00111111,
4485 0b00000000, 0b00000011, 0b00001111, 0b00111111,
4486 0b00000000, 0b00000011, 0b00001111, 0b00111111,
4487 0b00000000, 0b00000011, 0b00001111, 0b00111111 ) },
4488
4489 .expand_lshift_val = {
4490 ARRAY_TO_LXV_ORDER(
4491 0, 2, 4, 0,
4492 0, 2, 4, 0,
4493 0, 2, 4, 0,
4494 0, 2, 4, 0 ) },
4495
4496 .expand_lshift_mask_val = {
4497 ARRAY_TO_LXV_ORDER(
4498 0b00111111, 0b00111100, 0b00110000, 0b00000000,
4499 0b00111111, 0b00111100, 0b00110000, 0b00000000,
4500 0b00111111, 0b00111100, 0b00110000, 0b00000000,
4501 0b00111111, 0b00111100, 0b00110000, 0b00000000 ) },
4502
4503 .base64_00_15_val = {
4504 ARRAY_TO_LXV_ORDER(
4505 'A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P' ) },
4506
4507 .base64_16_31_val = {
4508 ARRAY_TO_LXV_ORDER(
4509 'Q','R','S','T','U','V','W','X','Y','Z','a','b','c','d','e','f' ) },
4510
4511 .base64_32_47_val = {
4512 ARRAY_TO_LXV_ORDER(
4513 'g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v' ) },
4514
4515 .base64_48_63_val = {
4516 ARRAY_TO_LXV_ORDER(
4517 'w','x','y','z','0','1','2','3','4','5','6','7','8','9','+','/' ) },
4518
// Same as base64_48_63_val except for the last two characters
// ('-','_' instead of '+','/'); selected when isURL is non-zero.
4519 .base64_48_63_URL_val = {
4520 ARRAY_TO_LXV_ORDER(
4521 'w','x','y','z','0','1','2','3','4','5','6','7','8','9','-','_' ) }
4522 };
4523
4524 // Number of bytes to process in each pass through the main loop.
4525 // 12 of the 16 bytes from each lxv are encoded to 16 Base64 bytes.
4526 const unsigned block_size = 12;
4527
4528 // According to the ELF V2 ABI, registers r3-r12 are volatile and available for use without save/restore
4529 Register src = R3_ARG1; // base address of the source data to be encoded (in = src + sp)
4530 Register sp = R4_ARG2; // source starting position
4531 Register sl = R5_ARG3; // source limit: end position of the data to encode (size is derived from sl - sp)
4532 Register dst = R6_ARG4; // destination address
4533 Register dp = R7_ARG5; // destination starting position
4534 Register isURL = R8_ARG6; // boolean, if non-zero indicates use of RFC 4648 base64url encoding
4535
4536 // Local variables
4537 Register const_ptr = R12; // used for loading constants
4538 Register tmp_reg = R9; // used for speeding up load_constant()
4539
4540 Register size = R9; // number of bytes to process (reuses tmp_reg's register)
4541 Register blocked_size = R10; // number of bytes to process a block at a time
4542 Register block_modulo = R12; // == block_size (reuse const_ptr)
4543 Register remaining = R12; // bytes remaining to process after the blocks are completed (reuse block_modulo's reg)
4544 Register in = R4; // current input (source) pointer (reuse sp's register)
4545 Register num_blocks = R11; // number of blocks to be processed by the loop
4546 Register out = R8; // current output (destination) pointer (reuse isURL's register)
4547 Register three = R9; // constant divisor (reuse size's register)
4548 Register bytes_to_write = R10; // number of bytes to write with the stxvl instr (reused blocked_size's register)
4549 Register tmp1 = R7; // temp register for lxvl length (reuse dp's register)
4550 Register modulo_chars = R7; // number of bytes written during the final write % 4 (reuse tmp1's register)
4551 Register pad_char = R6; // literal '=' (reuse dst's register)
4552
4553 // Volatile VSRS are 0..13, 32..51 (VR0..VR13)
4554 // VR Constants
4555 VectorRegister vec_8s = VR0;
4556 VectorRegister vec_31s = VR1;
4557 VectorRegister vec_base64_00_15 = VR2;
4558 VectorRegister vec_base64_16_31 = VR3;
4559 VectorRegister vec_base64_32_47 = VR4;
4560 VectorRegister vec_base64_48_63 = VR5;
4561 VectorRegister expand_rshift = VR6;
4562 VectorRegister expand_rshift_mask = VR7;
4563 VectorRegister expand_lshift = VR8;
4564 VectorRegister expand_lshift_mask = VR9;
4565
4566 // VR variables for expand
4567 VectorRegister input = VR10;
4568 VectorRegister rshift = VR11;
4569 VectorRegister lshift = VR12;
4570 VectorRegister expanded = VR13;
4571
4572 // VR variables for lookup
4573 VectorRegister encoded_00_31 = VR10; // (reuse input)
4574 VectorRegister encoded_32_63 = VR11; // (reuse rshift)
4575 VectorRegister gt_31 = VR12; // (reuse lshift)
4576
4577 // VSR Constants
4578 VectorSRegister expand_permute = VSR0;
4579
4580 Label not_URL, calculate_size, calculate_blocked_size, skip_loop;
4581 Label loop_start, le_16_to_write, no_pad, one_pad_char;
4582
4583 // The upper 32 bits of the non-pointer parameter registers are not
4584 // guaranteed to be zero, so mask off those upper bits.
4585 __ clrldi(sp, sp, 32);
4586 __ clrldi(sl, sl, 32);
4587 __ clrldi(dp, dp, 32);
4588 __ clrldi(isURL, isURL, 32);
4589
4590 // load up the constants
4591 __ load_const_optimized(const_ptr, (address)&const_block, tmp_reg);
4592 __ lxv(expand_permute, BLK_OFFSETOF(expand_permute_val), const_ptr);
4593 __ lxv(expand_rshift->to_vsr(), BLK_OFFSETOF(expand_rshift_val), const_ptr);
4594 __ lxv(expand_rshift_mask->to_vsr(), BLK_OFFSETOF(expand_rshift_mask_val), const_ptr);
4595 __ lxv(expand_lshift->to_vsr(), BLK_OFFSETOF(expand_lshift_val), const_ptr);
4596 __ lxv(expand_lshift_mask->to_vsr(), BLK_OFFSETOF(expand_lshift_mask_val), const_ptr);
4597 __ lxv(vec_base64_00_15->to_vsr(), BLK_OFFSETOF(base64_00_15_val), const_ptr);
4598 __ lxv(vec_base64_16_31->to_vsr(), BLK_OFFSETOF(base64_16_31_val), const_ptr);
4599 __ lxv(vec_base64_32_47->to_vsr(), BLK_OFFSETOF(base64_32_47_val), const_ptr);
4600
4601 // Splat the constants that can use xxspltib
4602 __ xxspltib(vec_8s->to_vsr(), 8);
4603 __ xxspltib(vec_31s->to_vsr(), 31);
4604
4605
4606 // Use a different translation lookup table depending on the
4607 // setting of isURL
4608 __ cmpdi(CR0, isURL, 0);
4609 __ beq(CR0, not_URL);
4610 __ lxv(vec_base64_48_63->to_vsr(), BLK_OFFSETOF(base64_48_63_URL_val), const_ptr);
4611 __ b(calculate_size);
4612
4613 __ bind(not_URL);
4614 __ lxv(vec_base64_48_63->to_vsr(), BLK_OFFSETOF(base64_48_63_val), const_ptr);
4615
4616 __ bind(calculate_size);
4617
4618 // size = sl - sp - 4 (*)
4619 // (*) Don't process the last four bytes in the main loop because
4620 // we don't want the lxv instruction to read past the end of the src
4621 // data, in case those four bytes are on the start of an unmapped or
4622 // otherwise inaccessible page.
4623 //
4624 __ sub(size, sl, sp);
4625 __ subi(size, size, 4);
4626 __ cmpdi(CR7, size, block_size);
4627 __ bgt(CR7, calculate_blocked_size);
// Short input (size <= block_size): no main-loop iterations, everything is
// handled by the tail code at skip_loop.
4628 __ mr(remaining, size);
4629 // Add the 4 back into remaining again
4630 __ addi(remaining, remaining, 4);
4631 // make "in" point to the beginning of the source data: in = src + sp
4632 __ add(in, src, sp);
4633 // out = dst + dp
4634 __ add(out, dst, dp);
4635 __ b(skip_loop);
4636
4637 __ bind(calculate_blocked_size);
4638 __ li(block_modulo, block_size);
4639 // num_blocks = size / block_modulo
4640 __ divwu(num_blocks, size, block_modulo);
4641 // blocked_size = num_blocks * block_modulo
4642 __ mullw(blocked_size, num_blocks, block_modulo);
4643 // remaining = size - blocked_size
4644 __ sub(remaining, size, blocked_size);
// CTR holds the iteration count consumed by bdnz in the main loop.
4645 __ mtctr(num_blocks);
4646
4647 // Add the 4 back in to remaining again
4648 __ addi(remaining, remaining, 4);
4649
4650 // make "in" point to the beginning of the source data: in = src + sp
4651 __ add(in, src, sp);
4652
4653 // out = dst + dp
4654 __ add(out, dst, dp);
4655
// Main loop: each pass loads 16 bytes, encodes the block, stores 16 output
// bytes, and advances in by 12 and out by 16.
4656 __ align(32);
4657 __ bind(loop_start);
4658
4659 __ lxv(input->to_vsr(), 0, in);
4660
4661 ENCODE_CORE
4662
4663 __ stxv(expanded->to_vsr(), 0, out);
4664 __ addi(in, in, 12);
4665 __ addi(out, out, 16);
4666 __ bdnz(loop_start);
4667
4668 __ bind(skip_loop);
4669
4670 // When there are less than 16 bytes left, we need to be careful not to
4671 // read beyond the end of the src buffer, which might be in an unmapped
4672 // page.
4673 // Load the remaining bytes using lxvl.
// The byte count is shifted into the upper 8 bits of tmp1, the same form
// that is prepared for stxvl further below.
4674 __ rldicr(tmp1, remaining, 56, 7);
4675 __ lxvl(input->to_vsr(), in, tmp1);
4676
4677 ENCODE_CORE
4678
4679 // bytes_to_write = ((remaining * 4) + 2) / 3
4680 __ li(three, 3);
4681 __ rlwinm(bytes_to_write, remaining, 2, 0, 29); // remaining * 4
4682 __ addi(bytes_to_write, bytes_to_write, 2);
4683 __ divwu(bytes_to_write, bytes_to_write, three);
4684
4685 __ cmpwi(CR7, bytes_to_write, 16);
4686 __ ble_predict_taken(CR7, le_16_to_write);
4687 __ stxv(expanded->to_vsr(), 0, out);
4688
4689 // We've processed 12 of the 13-15 data bytes, so advance the pointers,
4690 // and do one final pass for the remaining 1-3 bytes.
// NOTE(review): on the short path above remaining can be as large as 16
// (size == block_size), so "13-15" / "1-3" may understate the range — confirm.
4691 __ addi(in, in, 12);
4692 __ addi(out, out, 16);
4693 __ subi(remaining, remaining, 12);
4694 __ subi(bytes_to_write, bytes_to_write, 16);
// NOTE(review): the lxvl length below is derived from bytes_to_write (the
// encoded byte count) rather than from remaining (the input byte count) —
// confirm that loading the extra source byte(s) is intended.
4695 __ rldicr(tmp1, bytes_to_write, 56, 7);
4696 __ lxvl(input->to_vsr(), in, tmp1);
4697
4698 ENCODE_CORE
4699
4700 __ bind(le_16_to_write);
4701 // shift bytes_to_write into the upper 8 bits of tmp1 for use by stxvl
4702 __ rldicr(tmp1, bytes_to_write, 56, 7);
4703 __ stxvl(expanded->to_vsr(), out, tmp1);
4704 __ add(out, out, bytes_to_write);
4705
4706 __ li(pad_char, '=');
4707 __ rlwinm_(modulo_chars, bytes_to_write, 0, 30, 31); // bytes_to_write % 4, set CR0
4708 // Examples:
4709 // remaining bytes_to_write modulo_chars num pad chars
4710 // 0 0 0 0
4711 // 1 2 2 2
4712 // 2 3 3 1
4713 // 3 4 0 0
4714 // 4 6 2 2
4715 // 5 7 3 1
4716 // ...
4717 // 12 16 0 0
4718 // 13 18 2 2
4719 // 14 19 3 1
4720 // 15 20 0 0
4721 __ beq(CR0, no_pad);
4722 __ cmpwi(CR7, modulo_chars, 3);
4723 __ beq(CR7, one_pad_char);
4724
4725 // two pad chars
// The first '=' is stored here; execution then falls through to
// one_pad_char, which stores the second one.
4726 __ stb(pad_char, out);
4727 __ addi(out, out, 1);
4728
4729 __ bind(one_pad_char);
4730 __ stb(pad_char, out);
4731
4732 __ bind(no_pad);
4733
4734 __ blr();
4735 return start;
4736 }
4737
4738 #endif // VM_LITTLE_ENDIAN
4739
4740 void generate_lookup_secondary_supers_table_stub() {
4741 StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_id;
4742 StubCodeMark mark(this, stub_id);
4743
4744 const Register
4745 r_super_klass = R4_ARG2,
4746 r_array_base = R3_ARG1,
4747 r_array_length = R7_ARG5,
4748 r_array_index = R6_ARG4,
4749 r_sub_klass = R5_ARG3,
4750 r_bitmap = R11_scratch1,
4751 result = R8_ARG6;
4752
4753 for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
4754 StubRoutines::_lookup_secondary_supers_table_stubs[slot] = __ pc();
4755 __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass,
4756 r_array_base, r_array_length, r_array_index,
4757 r_bitmap, result, slot);
4758 __ blr();
4759 }
4760 }
4761
4762 // Slow path implementation for UseSecondarySupersTable.
4763 address generate_lookup_secondary_supers_table_slow_path_stub() {
4764 StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_slow_path_id;
4765 StubCodeMark mark(this, stub_id);
4766
4767 address start = __ pc();
4768 const Register
4769 r_super_klass = R4_ARG2,
4770 r_array_base = R3_ARG1,
4771 temp1 = R7_ARG5,
4772 r_array_index = R6_ARG4,
4773 r_bitmap = R11_scratch1,
4774 result = R8_ARG6;
4775
4776 __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, result, temp1);
4777 __ blr();
4778
4779 return start;
4780 }
4781
4782 address generate_cont_thaw(StubId stub_id) {
4783 if (!Continuations::enabled()) return nullptr;
4784
4785 Continuation::thaw_kind kind;
4786 bool return_barrier;
4787 bool return_barrier_exception;
4788
4789 switch (stub_id) {
4790 case StubId::stubgen_cont_thaw_id:
4791 kind = Continuation::thaw_top;
4792 return_barrier = false;
4793 return_barrier_exception = false;
4794 break;
4795 case StubId::stubgen_cont_returnBarrier_id:
4796 kind = Continuation::thaw_return_barrier;
4797 return_barrier = true;
4798 return_barrier_exception = false;
4799 break;
4800 case StubId::stubgen_cont_returnBarrierExc_id:
4801 kind = Continuation::thaw_return_barrier_exception;
4802 return_barrier = true;
4803 return_barrier_exception = true;
4804 break;
4805 default:
4806 ShouldNotReachHere();
4807 }
4808 StubCodeMark mark(this, stub_id);
4809
4810 Register tmp1 = R10_ARG8;
4811 Register tmp2 = R9_ARG7;
4812 Register tmp3 = R8_ARG6;
4813 Register nvtmp = R15_esp; // nonvolatile tmp register
4814 FloatRegister nvftmp = F20; // nonvolatile fp tmp register
4815
4816 address start = __ pc();
4817
4818 if (kind == Continuation::thaw_top) {
4819 __ clobber_nonvolatile_registers(); // Except R16_thread and R29_TOC
4820 }
4821
4822 if (return_barrier) {
4823 assert(!InlineTypeReturnedAsFields, "unsupported");
4824 __ mr(nvtmp, R3_RET); __ fmr(nvftmp, F1_RET); // preserve possible return value from a method returning to the return barrier
4825 DEBUG_ONLY(__ ld_ptr(tmp1, _abi0(callers_sp), R1_SP);)
4826 __ ld_ptr(R1_SP, JavaThread::cont_entry_offset(), R16_thread);
4827 #ifdef ASSERT
4828 __ ld_ptr(tmp2, _abi0(callers_sp), R1_SP);
4829 __ cmpd(CR0, tmp1, tmp2);
4830 __ asm_assert_eq(FILE_AND_LINE ": callers sp is corrupt");
4831 #endif
4832 }
4833 #ifdef ASSERT
4834 __ ld_ptr(tmp1, JavaThread::cont_entry_offset(), R16_thread);
4835 __ cmpd(CR0, R1_SP, tmp1);
4836 __ asm_assert_eq(FILE_AND_LINE ": incorrect R1_SP");
4837 #endif
4838
4839 __ li(R4_ARG2, return_barrier ? 1 : 0);
4840 __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), R16_thread, R4_ARG2);
4841
4842 #ifdef ASSERT
4843 DEBUG_ONLY(__ ld_ptr(tmp1, JavaThread::cont_entry_offset(), R16_thread));
4844 DEBUG_ONLY(__ cmpd(CR0, R1_SP, tmp1));
4845 __ asm_assert_eq(FILE_AND_LINE ": incorrect R1_SP");
4846 #endif
4847
4848 // R3_RET contains the size of the frames to thaw, 0 if overflow or no more frames
4849 Label thaw_success;
4850 __ cmpdi(CR0, R3_RET, 0);
4851 __ bne(CR0, thaw_success);
4852 __ load_const_optimized(tmp1, (SharedRuntime::throw_StackOverflowError_entry()), R0);
4853 __ mtctr(tmp1); __ bctr();
4854 __ bind(thaw_success);
4855
4856 __ addi(R3_RET, R3_RET, frame::native_abi_reg_args_size); // Large abi required for C++ calls.
4857 __ neg(R3_RET, R3_RET);
4858 // align down resulting in a smaller negative offset
4859 __ clrrdi(R3_RET, R3_RET, exact_log2(frame::alignment_in_bytes));
4860 DEBUG_ONLY(__ mr(tmp1, R1_SP);)
4861 __ resize_frame(R3_RET, tmp2); // make room for the thawed frames
4862
4863 __ li(R4_ARG2, kind);
4864 __ call_VM_leaf(Continuation::thaw_entry(), R16_thread, R4_ARG2);
4865 __ mr(R1_SP, R3_RET); // R3_RET contains the SP of the thawed top frame
4866
4867 if (return_barrier) {
4868 assert(!InlineTypeReturnedAsFields, "unsupported");
4869 // we're now in the caller of the frame that returned to the barrier
4870 __ mr(R3_RET, nvtmp); __ fmr(F1_RET, nvftmp); // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
4871 } else {
4872 // we're now on the yield frame (which is in an address above us b/c rsp has been pushed down)
4873 __ li(R3_RET, 0); // return 0 (success) from doYield
4874 }
4875
4876 if (return_barrier_exception) {
4877 Register ex_pc = R17_tos; // nonvolatile register
4878 __ ld(ex_pc, _abi0(lr), R1_SP); // LR
4879 __ mr(nvtmp, R3_RET); // save return value containing the exception oop
4880 // The thawed top frame has got a frame::java_abi. This is not sufficient for the runtime call.
4881 __ push_frame_reg_args(0, tmp1);
4882 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), R16_thread, ex_pc);
4883 __ mtlr(R3_RET); // the exception handler
4884 __ pop_frame();
4885 // See OptoRuntime::generate_exception_blob for register arguments
4886 __ mr(R3_ARG1, nvtmp); // exception oop
4887 __ mr(R4_ARG2, ex_pc); // exception pc
4888 } else {
4889 // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
4890 __ ld(R0, _abi0(lr), R1_SP); // LR
4891 __ mtlr(R0);
4892 }
4893 __ blr();
4894
4895 return start;
4896 }
4897
4898 address generate_cont_thaw() {
4899 return generate_cont_thaw(StubId::stubgen_cont_thaw_id);
4900 }
4901
4902 // TODO: will probably need multiple return barriers depending on return type
4903
4904 address generate_cont_returnBarrier() {
4905 return generate_cont_thaw(StubId::stubgen_cont_returnBarrier_id);
4906 }
4907
4908 address generate_cont_returnBarrier_exception() {
4909 return generate_cont_thaw(StubId::stubgen_cont_returnBarrierExc_id);
4910 }
4911
4912 address generate_cont_preempt_stub() {
4913 if (!Continuations::enabled()) return nullptr;
4914 StubId stub_id = StubId::stubgen_cont_preempt_id;
4915 StubCodeMark mark(this, stub_id);
4916 address start = __ pc();
4917
4918 __ clobber_nonvolatile_registers(); // Except R16_thread and R29_TOC
4919
4920 __ reset_last_Java_frame(false /*check_last_java_sp*/);
4921
4922 // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap.
4923 __ ld_ptr(R1_SP, JavaThread::cont_entry_offset(), R16_thread);
4924
4925 Label preemption_cancelled;
4926 __ lbz(R11_scratch1, in_bytes(JavaThread::preemption_cancelled_offset()), R16_thread);
4927 __ cmpwi(CR0, R11_scratch1, 0);
4928 __ bne(CR0, preemption_cancelled);
4929
4930 // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount.
4931 SharedRuntime::continuation_enter_cleanup(_masm);
4932 __ pop_frame();
4933 __ restore_LR(R11_scratch1);
4934 __ blr();
4935
4936 // We acquired the monitor after freezing the frames so call thaw to continue execution.
4937 __ bind(preemption_cancelled);
4938 __ li(R11_scratch1, 0); // false
4939 __ stb(R11_scratch1, in_bytes(JavaThread::preemption_cancelled_offset()), R16_thread);
4940 int simm16_offs = __ load_const_optimized(R11_scratch1, ContinuationEntry::thaw_call_pc_address(), R0, true);
4941 __ ld(R11_scratch1, simm16_offs, R11_scratch1);
4942 __ mtctr(R11_scratch1);
4943 __ bctr();
4944
4945 return start;
4946 }
4947
4948 // exception handler for upcall stubs
4949 address generate_upcall_stub_exception_handler() {
4950 StubId stub_id = StubId::stubgen_upcall_stub_exception_handler_id;
4951 StubCodeMark mark(this, stub_id);
4952 address start = __ pc();
4953
4954 // Native caller has no idea how to handle exceptions,
4955 // so we just crash here. Up to callee to catch exceptions.
4956 __ verify_oop(R3_ARG1);
4957 __ load_const_optimized(R12_scratch2, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception), R0);
4958 __ call_c(R12_scratch2);
4959 __ should_not_reach_here();
4960
4961 return start;
4962 }
4963
4964 // load Method* target of MethodHandle
4965 // R3_ARG1 = jobject receiver
4966 // R19_method = result Method*
4967 address generate_upcall_stub_load_target() {
4968
4969 StubId stub_id = StubId::stubgen_upcall_stub_load_target_id;
4970 StubCodeMark mark(this, stub_id);
4971 address start = __ pc();
4972
4973 __ resolve_global_jobject(R3_ARG1, R22_tmp2, R23_tmp3, MacroAssembler::PRESERVATION_FRAME_LR_GP_FP_REGS);
4974 // Load target method from receiver
4975 __ load_heap_oop(R19_method, java_lang_invoke_MethodHandle::form_offset(), R3_ARG1,
4976 R22_tmp2, R23_tmp3, MacroAssembler::PRESERVATION_FRAME_LR_GP_FP_REGS, IS_NOT_NULL);
4977 __ load_heap_oop(R19_method, java_lang_invoke_LambdaForm::vmentry_offset(), R19_method,
4978 R22_tmp2, R23_tmp3, MacroAssembler::PRESERVATION_FRAME_LR_GP_FP_REGS, IS_NOT_NULL);
4979 __ load_heap_oop(R19_method, java_lang_invoke_MemberName::method_offset(), R19_method,
4980 R22_tmp2, R23_tmp3, MacroAssembler::PRESERVATION_FRAME_LR_GP_FP_REGS, IS_NOT_NULL);
4981 __ ld(R19_method, java_lang_invoke_ResolvedMethodName::vmtarget_offset(), R19_method);
4982 __ std(R19_method, in_bytes(JavaThread::callee_target_offset()), R16_thread); // just in case callee is deoptimized
4983
4984 __ blr();
4985
4986 return start;
4987 }
4988
4989 // Initialization
// Intentionally empty: ppc does not need any preuniverse stubs.
void generate_preuniverse_stubs() {
}
4993
4994 void generate_initial_stubs() {
4995 // Generates all stubs and initializes the entry points
4996
4997 // Entry points that exist in all platforms.
4998 // Note: This is code that could be shared among different platforms - however the
4999 // benefit seems to be smaller than the disadvantage of having a
5000 // much more complicated generator structure. See also comment in
5001 // stubRoutines.hpp.
5002
5003 StubRoutines::_forward_exception_entry = generate_forward_exception();
5004 StubRoutines::_call_stub_entry = generate_call_stub(StubRoutines::_call_stub_return_address);
5005 StubRoutines::_catch_exception_entry = generate_catch_exception();
5006
5007 if (UnsafeMemoryAccess::_table == nullptr) {
5008 UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
5009 }
5010
5011 // CRC32 Intrinsics.
5012 if (UseCRC32Intrinsics) {
5013 StubRoutines::_updateBytesCRC32 = generate_CRC32_updateBytes(StubId::stubgen_updateBytesCRC32_id);
5014 }
5015
5016 // CRC32C Intrinsics.
5017 if (UseCRC32CIntrinsics) {
5018 StubRoutines::_updateBytesCRC32C = generate_CRC32_updateBytes(StubId::stubgen_updateBytesCRC32C_id);
5019 }
5020
5021 if (VM_Version::supports_float16()) {
5022 // For results consistency both intrinsics should be enabled.
5023 StubRoutines::_hf2f = generate_float16ToFloat();
5024 StubRoutines::_f2hf = generate_floatToFloat16();
5025 }
5026 }
5027
5028 void generate_continuation_stubs() {
5029 // Continuation stubs:
5030 StubRoutines::_cont_thaw = generate_cont_thaw();
5031 StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
5032 StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
5033 StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub();
5034 }
5035
5036 void generate_final_stubs() {
5037 // Generates all stubs and initializes the entry points
5038
5039 // support for verify_oop (must happen after universe_init)
5040 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
5041
5042 // nmethod entry barriers for concurrent class unloading
5043 StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
5044
5045 // arraycopy stubs used by compilers
5046 generate_arraycopy_stubs();
5047
5048 #ifdef COMPILER2
5049 if (UseSecondarySupersTable) {
5050 StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
5051 if (!InlineSecondarySupersTest) {
5052 generate_lookup_secondary_supers_table_stub();
5053 }
5054 }
5055 #endif // COMPILER2
5056
5057 StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
5058 StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();
5059 }
5060
// Generates the intrinsic stubs of the compiler blob. Everything here is
// compiled in only for COMPILER2 builds, and each stub is generated only when
// its flag or capability check is enabled.
void generate_compiler_stubs() {
#ifdef COMPILER2

  // BigInteger arithmetic.
  if (UseMultiplyToLenIntrinsic) {
    StubRoutines::_multiplyToLen = generate_multiplyToLen();
  }
  if (UseSquareToLenIntrinsic) {
    StubRoutines::_squareToLen = generate_squareToLen();
  }
  if (UseMulAddIntrinsic) {
    StubRoutines::_mulAdd = generate_mulAdd();
  }

  // The Montgomery entries point at SharedRuntime functions; no stub code is
  // generated here for them.
  if (UseMontgomeryMultiplyIntrinsic) {
    StubRoutines::_montgomeryMultiply =
        CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_multiply);
  }
  if (UseMontgomerySquareIntrinsic) {
    StubRoutines::_montgomerySquare =
        CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square);
  }

  // Data cache line writeback.
  if (VM_Version::supports_data_cache_line_flush()) {
    StubRoutines::_data_cache_writeback      = generate_data_cache_writeback();
    StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
  }

  // Crypto.
  if (UseGHASHIntrinsics) {
    StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
  }

  if (UseAESIntrinsics) {
    StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
    StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
  }

  // Each SHA family gets a single-block and a multi-block variant.
  if (UseSHA256Intrinsics) {
    StubRoutines::_sha256_implCompress =
        generate_sha256_implCompress(StubId::stubgen_sha256_implCompress_id);
    StubRoutines::_sha256_implCompressMB =
        generate_sha256_implCompress(StubId::stubgen_sha256_implCompressMB_id);
  }
  if (UseSHA512Intrinsics) {
    StubRoutines::_sha512_implCompress =
        generate_sha512_implCompress(StubId::stubgen_sha512_implCompress_id);
    StubRoutines::_sha512_implCompressMB =
        generate_sha512_implCompress(StubId::stubgen_sha512_implCompressMB_id);
  }

#ifdef VM_LITTLE_ENDIAN
  // Currently supported on PPC64LE only
  if (UseBASE64Intrinsics) {
    StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
    StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
  }
#endif
#endif // COMPILER2
}
5115
5116 public:
5117 StubGenerator(CodeBuffer* code, BlobId blob_id, AOTStubData *stub_data) : StubCodeGenerator(code, blob_id, stub_data) {
5118 switch(blob_id) {
5119 case BlobId::stubgen_preuniverse_id:
5120 generate_preuniverse_stubs();
5121 break;
5122 case BlobId::stubgen_initial_id:
5123 generate_initial_stubs();
5124 break;
5125 case BlobId::stubgen_continuation_id:
5126 generate_continuation_stubs();
5127 break;
5128 case BlobId::stubgen_compiler_id:
5129 generate_compiler_stubs();
5130 break;
5131 case BlobId::stubgen_final_id:
5132 generate_final_stubs();
5133 break;
5134 default:
5135 fatal("unexpected blob id: %s", StubInfo::name(blob_id));
5136 break;
5137 };
5138 }
5139 };
5140
5141 void StubGenerator_generate(CodeBuffer* code, BlobId blob_id, AOTStubData *stub_data) {
5142 StubGenerator g(code, blob_id, stub_data);
5143 }
5144