1 /*
2 * Copyright (c) 2020, 2026, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25 #include "../../share/runtime/globals.hpp"
26 #include "asm/assembler.hpp"
27 #include "asm/assembler.inline.hpp"
28 #include "gc/shared/barrierSet.hpp"
29 #include "gc/shared/barrierSetAssembler.hpp"
30 #include "oops/methodData.hpp"
31 #include "opto/c2_MacroAssembler.hpp"
32 #include "opto/intrinsicnode.hpp"
33 #include "opto/output.hpp"
34 #include "opto/opcodes.hpp"
35 #include "opto/subnode.hpp"
36 #include "runtime/globals.hpp"
37 #include "runtime/objectMonitor.hpp"
38 #include "runtime/objectMonitorTable.hpp"
39 #include "runtime/stubRoutines.hpp"
40 #include "runtime/synchronizer.hpp"
41 #include "utilities/checkedCast.hpp"
42 #include "utilities/globalDefinitions.hpp"
43 #include "utilities/powerOfTwo.hpp"
44 #include "utilities/sizes.hpp"
45
// BLOCK_COMMENT(str): forwards to block_comment() in non-product builds and
// expands to nothing in product builds.
// STOP(error): emits stop(error); non-product builds additionally pass the
// same text to block_comment() first.
#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
// Wrapped in do/while(0) so that both calls expand as a single statement, even
// when STOP is used as the unbraced body of an if/else.
#define STOP(error) do { block_comment(error); stop(error); } while (0)
#endif
53
54 // C2 compiled method's prolog code.
55 void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {
56 assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
57
58 assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
59 // Remove word for return addr
60 framesize -= wordSize;
61 stack_bang_size -= wordSize;
62
63 // Calls to C2R adapters often do not accept exceptional returns.
64 // We require that their callers must bang for them. But be careful, because
65 // some VM calls (such as call site linkage) can use several kilobytes of
66 // stack. But the stack safety zone should account for that.
67 // See bugs 4446381, 4468289, 4497237.
68 if (stack_bang_size > 0) {
69 generate_stack_overflow_check(stack_bang_size);
70
71 // We always push rbp, so that on return to interpreter rbp, will be
72 // restored correctly and we can correct the stack.
73 push(rbp);
74 // Save caller's stack pointer into RBP if the frame pointer is preserved.
75 if (PreserveFramePointer) {
76 mov(rbp, rsp);
77 }
78 // Remove word for ebp
79 framesize -= wordSize;
80
81 // Create frame
82 if (framesize) {
83 subptr(rsp, framesize);
84 }
85 } else {
86 subptr(rsp, framesize);
87
88 // Save RBP register now.
89 framesize -= wordSize;
90 movptr(Address(rsp, framesize), rbp);
91 // Save caller's stack pointer into RBP if the frame pointer is preserved.
92 if (PreserveFramePointer) {
93 movptr(rbp, rsp);
94 if (framesize > 0) {
95 addptr(rbp, framesize);
96 }
97 }
98 }
99
100 if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
101 framesize -= wordSize;
102 movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
103 }
104
105 #ifdef ASSERT
106 if (VerifyStackAtCalls) {
107 Label L;
108 push(rax);
109 mov(rax, rsp);
110 andptr(rax, StackAlignmentInBytes-1);
111 cmpptr(rax, StackAlignmentInBytes-wordSize);
112 pop(rax);
113 jcc(Assembler::equal, L);
114 STOP("Stack is not properly aligned!");
115 bind(L);
116 }
117 #endif
118
119 if (!is_stub) {
120 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
121 // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
122 Label dummy_slow_path;
123 Label dummy_continuation;
124 Label* slow_path = &dummy_slow_path;
125 Label* continuation = &dummy_continuation;
126 if (!Compile::current()->output()->in_scratch_emit_size()) {
127 // Use real labels from actual stub when not emitting code for the purpose of measuring its size
128 C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
129 Compile::current()->output()->add_stub(stub);
130 slow_path = &stub->entry();
131 continuation = &stub->continuation();
132 }
133 bs->nmethod_entry_barrier(this, slow_path, continuation);
134 }
135 }
136
137 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
138 switch (vlen_in_bytes) {
139 case 4: // fall-through
140 case 8: // fall-through
141 case 16: return Assembler::AVX_128bit;
142 case 32: return Assembler::AVX_256bit;
143 case 64: return Assembler::AVX_512bit;
144
145 default: {
146 ShouldNotReachHere();
147 return Assembler::AVX_NoVec;
148 }
149 }
150 }
151
152 // fast_lock and fast_unlock used by C2
153
154 // Because the transitions from emitted code to the runtime
155 // monitorenter/exit helper stubs are so slow it's critical that
156 // we inline both the lock-stack fast path and the inflated fast path.
157 //
158 // See also: cmpFastLock and cmpFastUnlock.
159 //
160 // What follows is a specialized inline transliteration of the code
161 // in enter() and exit(). If we're concerned about I$ bloat another
162 // option would be to emit TrySlowEnter and TrySlowExit methods
163 // at startup-time. These methods would accept arguments as
// (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
165 // indications in the icc.ZFlag. fast_lock and fast_unlock would simply
166 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
167 // In practice, however, the # of lock sites is bounded and is usually small.
168 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
169 // if the processor uses simple bimodal branch predictors keyed by EIP
170 // Since the helper routines would be called from multiple synchronization
171 // sites.
172 //
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods. That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads. We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) explicit barriers or fence operations.
181 //
182 // TODO:
183 //
184 // * Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
185 // This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
186 // Given TLAB allocation, Self is usually manifested in a register, so passing it into
187 // the lock operators would typically be faster than reifying Self.
188 //
189 // * Ideally I'd define the primitives as:
190 // fast_lock (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
191 // fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
192 // Unfortunately ADLC bugs prevent us from expressing the ideal form.
193 // Instead, we're stuck with a rather awkward and brittle register assignments below.
194 // Furthermore the register assignments are overconstrained, possibly resulting in
195 // sub-optimal code near the synchronization site.
196 //
197 // * Eliminate the sp-proximity tests and just use "== Self" tests instead.
198 // Alternately, use a better sp-proximity test.
199 //
200 // * Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
201 // Either one is sufficient to uniquely identify a thread.
202 // TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
203 //
204 // * Intrinsify notify() and notifyAll() for the common cases where the
205 // object is locked by the calling thread but the waitlist is empty.
206 // avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
207 //
208 // * use jccb and jmpb instead of jcc and jmp to improve code density.
209 // But beware of excessive branch density on AMD Opterons.
210 //
211 // * Both fast_lock and fast_unlock set the ICC.ZF to indicate success
212 // or failure of the fast path. If the fast path fails then we pass
213 // control to the slow path, typically in C. In fast_lock and
214 // fast_unlock we often branch to DONE_LABEL, just to find that C2
215 // will emit a conditional branch immediately after the node.
216 // So we have branches to branches and lots of ICC.ZF games.
217 // Instead, it might be better to have C2 pass a "FailureLabel"
218 // into fast_lock and fast_unlock. In the case of success, control
219 // will drop through the node. ICC.ZF is undefined at exit.
220 // In the case of failure, the node will branch directly to the
221 // FailureLabel
222
// Emits the C2 inline fast path for monitor enter.
//
// Two cases are handled inline:
//  - the mark word has no monitor bit: obj is pushed on the thread's
//    lock-stack, either as a recursive entry or after a successful CAS of the
//    mark word;
//  - the mark word has the monitor bit: the ObjectMonitor is located and its
//    owner field is CAS'ed from null to the thread's monitor owner id, or its
//    recursion count is incremented when this thread already owns it.
// Every other situation ends at slow_path.
//
// On exit ZF == 1 means the lock was taken and ZF == 0 means the slow path
// must be taken; C2 uses ZF to pick the continuation.
//
// obj: object to lock
// box: on-stack box address -- KILLED
// rax: tmp -- KILLED
// t  : tmp -- KILLED
// thread: base register for the JavaThread fields accessed below
void C2_MacroAssembler::fast_lock(Register obj, Register box, Register rax_reg,
                                  Register t, Register thread) {
  assert(rax_reg == rax, "Used for CAS");
  assert_different_registers(obj, box, rax_reg, t, thread);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. ZF value is irrelevant.
  Label locked;
  // Finish fast lock unsuccessfully. MUST jump with ZF == 0
  Label slow_path;

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds or we need to take the slow-path.
    movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0);
  }

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    // Value-based classes always go to the slow path. testb leaves ZF == 0
    // when the flag is set, as slow_path requires.
    load_klass(rax_reg, obj, t);
    testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
    jcc(Assembler::notZero, slow_path);
  }

  // mark shares register t with monitor and hash further down.
  const Register mark = t;

  { // Fast Lock

    Label push;

    // With UseObjectMonitorTable box must stay intact (the inflated path
    // stores the monitor through it), so rax holds top instead.
    const Register top = UseObjectMonitorTable ? rax_reg : box;

    // Load the mark.
    movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Prefetch top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    // Check for monitor (0b10).
    testptr(mark, markWord::monitor_value);
    jcc(Assembler::notZero, inflated);

    // Check if lock-stack is full.
    // 'greater' implies the operands differ, so ZF == 0 on the branch.
    cmpl(top, LockStack::end_offset() - 1);
    jcc(Assembler::greater, slow_path);

    // Check if recursive.
    // Compares obj with the entry just below top, i.e. the newest entry.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    jccb(Assembler::equal, push);

    // Try to lock. Transition lock bits 0b01 => 0b00
    // rax = expected (unlocked) mark, mark = new (locked) mark.
    movptr(rax_reg, mark);
    orptr(rax_reg, markWord::unlocked_value);
    andptr(mark, ~(int32_t)markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    jcc(Assembler::notEqual, slow_path);

    if (UseObjectMonitorTable) {
      // Need to reload top, clobbered by CAS.
      movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
    }
    bind(push);
    // After successful lock, push object on lock-stack.
    movptr(Address(thread, top), obj);
    addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
    jmp(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    const Register monitor = t;

    if (!UseObjectMonitorTable) {
      // The mark word itself is the (tagged) monitor pointer; the tag is
      // compensated for by monitor_tag below.
      assert(mark == monitor, "should be the same here");
    } else {
      // hash reuses t as well; it is consumed before monitor is written.
      const Register hash = t;
      Label monitor_found;

      // Look for the monitor in the om_cache.

      // Unrolled at code-generation time: one load/compare/branch triple per
      // cache slot, with cache_offset advanced to the next slot each round.
      ByteSize cache_offset   = JavaThread::om_cache_oops_offset();
      ByteSize monitor_offset = OMCache::oop_to_monitor_difference();
      const int num_unrolled = OMCache::CAPACITY;
      for (int i = 0; i < num_unrolled; i++) {
        movptr(monitor, Address(thread, cache_offset + monitor_offset));
        cmpptr(obj, Address(thread, cache_offset));
        jccb(Assembler::equal, monitor_found);
        cache_offset = cache_offset + OMCache::oop_to_oop_difference();
      }

      if (UseCompactObjectHeaders) {
        // TODO: The fast-path table lookup currently doesn't work with Lilliput's
        // compact identity-hashcode implementation.
        // See: https://bugs.openjdk.org/browse/JDK-8380981
        // NOTE(review): reached after the last cmpptr above compared not-equal,
        // so ZF == 0 here as slow_path requires.
        jmp(slow_path);
      } else {
        // Look for the monitor in the table.

        // Get the hash code.
        movptr(hash, Address(obj, oopDesc::mark_offset_in_bytes()));
        shrq(hash, markWord::hash_shift);
        andq(hash, markWord::hash_mask);

        // Get the table and calculate the bucket's address.
        lea(rax_reg, ExternalAddress(ObjectMonitorTable::current_table_address()));
        movptr(rax_reg, Address(rax_reg));
        andq(hash, Address(rax_reg, ObjectMonitorTable::table_capacity_mask_offset()));
        movptr(rax_reg, Address(rax_reg, ObjectMonitorTable::table_buckets_offset()));

        // Read the monitor from the bucket.
        movptr(monitor, Address(rax_reg, hash, Address::times_ptr));

        // Check if the monitor in the bucket is special (empty, tombstone or removed)
        // 'below' implies the operands differ, so ZF == 0 on the branch.
        cmpptr(monitor, ObjectMonitorTable::SpecialPointerValues::below_is_special);
        jcc(Assembler::below, slow_path);

        // Check if object matches.
        movptr(rax_reg, Address(monitor, ObjectMonitor::object_offset()));
        BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
        bs_asm->try_peek_weak_handle_in_nmethod(this, rax_reg, rax_reg, slow_path);
        cmpptr(rax_reg, obj);
        jcc(Assembler::notEqual, slow_path);
      }
      bind(monitor_found);
    }
    // Without the table, monitor still carries the markWord::monitor_value
    // tag; subtracting it from the field offsets yields the real addresses.
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag);
    const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag);

    Label monitor_locked;
    // Lock the monitor.

    if (UseObjectMonitorTable) {
      // Cache the monitor for unlock before trashing box. On failure to acquire
      // the lock, the slow path will reset the entry accordingly (see CacheSetter).
      movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor);
    }

    // Try to CAS owner (no owner => current thread's _monitor_owner_id).
    // rax = 0 is the expected value; on failure cmpxchg leaves the current
    // owner in rax.
    xorptr(rax_reg, rax_reg);
    movptr(box, Address(thread, JavaThread::monitor_owner_id_offset()));
    lock(); cmpxchgptr(box, owner_address);
    jccb(Assembler::equal, monitor_locked);

    // Check if recursive.
    // box holds this thread's owner id, rax the owner observed by the CAS.
    cmpptr(box, rax_reg);
    jccb(Assembler::notEqual, slow_path);

    // Recursive.
    increment(recursions_address);

    bind(monitor_locked);
  }

  bind(locked);
  // Set ZF = 1
  xorl(rax_reg, rax_reg);

#ifdef ASSERT
  // Check that locked label is reached with ZF set.
  Label zf_correct;
  Label zf_bad_zero;
  jcc(Assembler::zero, zf_correct);
  jmp(zf_bad_zero);
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with ZF not set.
  jcc(Assembler::notZero, zf_correct);
  stop("Fast Lock ZF != 0");
  bind(zf_bad_zero);
  stop("Fast Lock ZF != 1");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}
404
// obj: object to unlock
// rax: tmp -- KILLED
// t  : tmp - cannot be obj nor rax -- KILLED
408 //
409 // Some commentary on balanced locking:
410 //
411 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
412 // Methods that don't have provably balanced locking are forced to run in the
413 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
414 // The interpreter provides two properties:
415 // I1: At return-time the interpreter automatically and quietly unlocks any
416 // objects acquired in the current activation (frame). Recall that the
417 // interpreter maintains an on-stack list of locks currently held by
418 // a frame.
419 // I2: If a method attempts to unlock an object that is not held by the
420 // frame the interpreter throws IMSX.
421 //
422 // Lets say A(), which has provably balanced locking, acquires O and then calls B().
423 // B() doesn't have provably balanced locking so it runs in the interpreter.
424 // Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
425 // is still locked by A().
426 //
427 // The only other source of unbalanced locking would be JNI. The "Java Native Interface
428 // Specification" states that an object locked by JNI's MonitorEnter should not be
429 // unlocked by "normal" java-level locking and vice-versa. The specification doesn't
430 // specify what will occur if a program engages in such mixed-mode locking, however.
431 // Arguably given that the spec legislates the JNI case as undefined our implementation
432 // could reasonably *avoid* checking owner in fast_unlock().
433 // In the interest of performance we elide m->Owner==Self check in unlock.
434 // A perfectly viable alternative is to elide the owner check except when
435 // Xcheck:jni is enabled.
436
// Emits the C2 inline fast path for monitor exit.
//
// Two cases are handled inline:
//  - obj is the newest lock-stack entry: the entry is popped and, unless the
//    entry below it is obj as well (recursive), the mark word is CAS'ed back
//    to unlocked;
//  - otherwise the lock is treated as inflated: the recursion count is
//    decremented, or the owner is cleared and entry_list/succ are inspected
//    to decide whether the slow path has to run.
//
// On exit ZF == 1 means the unlock is complete and ZF == 0 means the slow
// path must be taken; C2 uses ZF to pick the continuation.
void C2_MacroAssembler::fast_unlock(Register obj, Register reg_rax, Register t, Register thread) {
  assert(reg_rax == rax, "Used for CAS");
  assert_different_registers(obj, reg_rax, t);

  // Handle inflated monitor.
  Label inflated, inflated_check_lock_stack;
  // Finish fast unlock successfully.  MUST jump with ZF == 1
  Label unlocked, slow_path;

  // mark and monitor share t. top shares t as well when UseObjectMonitorTable
  // is set, which keeps rax (box) intact until the inflated path reads the
  // cached monitor through it; otherwise top lives in rax.
  const Register mark = t;
  const Register monitor = t;
  const Register top = UseObjectMonitorTable ? t : reg_rax;
  const Register box = reg_rax;

  Label dummy;
  C2FastUnlockStub* stub = nullptr;

  // A stub is only created when in_scratch_emit_size() is false; otherwise
  // the dummy label stands in for the stub's label.
  if (!Compile::current()->output()->in_scratch_emit_size()) {
    stub = new (Compile::current()->comp_arena()) C2FastUnlockStub(obj, mark, reg_rax, thread);
    Compile::current()->output()->add_stub(stub);
  }

  // NOTE(review): going by its name, this stub entry re-pushes obj on the
  // lock-stack before entering the slow path -- confirm in C2FastUnlockStub.
  Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();

  { // Fast Unlock

    // Load top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    if (!UseObjectMonitorTable) {
      // Prefetch mark.
      // (Not possible with the table: mark and top would share t.)
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }

    // Check if obj is top of lock-stack.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    // Top of lock stack was not obj. Must be monitor.
    jcc(Assembler::notEqual, inflated_check_lock_stack);

    // Pop lock-stack.
    DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
    subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);

    // Check if recursive.
    // The top register still holds the pre-pop value, so -2 * oopSize
    // addresses the entry that is now the newest one. An equal compare leaves
    // ZF == 1, as 'unlocked' requires.
    cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
    jcc(Assembler::equal, unlocked);

    // We elide the monitor check, let the CAS fail instead.

    if (UseObjectMonitorTable) {
      // Load mark.
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }

    // Try to unlock. Transition lock bits 0b00 => 0b01
    // rax = expected (locked) mark, mark = new (unlocked) mark.
    movptr(reg_rax, mark);
    andptr(reg_rax, ~(int32_t)markWord::lock_mask);
    orptr(mark, markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    jcc(Assembler::notEqual, push_and_slow_path);
    jmp(unlocked);
  }


  { // Handle inflated monitor.
    bind(inflated_check_lock_stack);
#ifdef ASSERT
    // Debug-only: walk the lock-stack downwards from top and stop the VM if
    // obj is found on it; then verify that the mark word has the monitor bit.
    Label check_done;
    subl(top, oopSize);
    cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
    jcc(Assembler::below, check_done);
    cmpptr(obj, Address(thread, top));
    jcc(Assembler::notEqual, inflated_check_lock_stack);
    stop("Fast Unlock lock on stack");
    bind(check_done);
    if (UseObjectMonitorTable) {
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }
    testptr(mark, markWord::monitor_value);
    jcc(Assembler::notZero, inflated);
    stop("Fast Unlock not monitor");
#endif

    bind(inflated);

    if (!UseObjectMonitorTable) {
      // The mark word itself is the (tagged) monitor pointer; the tag is
      // compensated for by monitor_tag below.
      assert(mark == monitor, "should be the same here");
    } else {
      // Uses ObjectMonitorTable.  Look for the monitor in our BasicLock on the stack.
      movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
      // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*)
      cmpptr(monitor, alignof(ObjectMonitor*));
      jcc(Assembler::below, slow_path);
    }
    // Without the table, monitor still carries the markWord::monitor_value
    // tag; subtracting it from the field offsets yields the real addresses.
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag};
    const Address succ_address{monitor, ObjectMonitor::succ_offset() - monitor_tag};
    const Address entry_list_address{monitor, ObjectMonitor::entry_list_offset() - monitor_tag};
    const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag};

    Label recursive;

    // Check if recursive.
    cmpptr(recursions_address, 0);
    jcc(Assembler::notZero, recursive);

    // Set owner to null.
    // Release to satisfy the JMM
    movptr(owner_address, NULL_WORD);
    // We need a full fence after clearing owner to avoid stranding.
    // StoreLoad achieves this.
    membar(StoreLoad);

    // Check if the entry_list is empty.
    cmpptr(entry_list_address, NULL_WORD);
    jcc(Assembler::zero, unlocked);    // If so we are done.

    // Check if there is a successor.
    cmpptr(succ_address, NULL_WORD);
    jcc(Assembler::notZero, unlocked); // If so we are done.

    // Save the monitor pointer in the current thread, so we can try to
    // reacquire the lock in SharedRuntime::monitor_exit_helper().
    if (!UseObjectMonitorTable) {
      // Strip the tag so that an untagged ObjectMonitor pointer is stored.
      andptr(monitor, ~(int32_t)markWord::monitor_value);
    }
    movptr(Address(thread, JavaThread::unlocked_inflated_monitor_offset()), monitor);

    orl(t, 1); // Fast Unlock ZF = 0
    jmpb(slow_path);

    // Recursive unlock.
    bind(recursive);
    decrement(recursions_address);
  }

  bind(unlocked);
  xorl(t, t); // Fast Unlock ZF = 1

#ifdef ASSERT
  // Check that unlocked label is reached with ZF set.
  Label zf_correct;
  Label zf_bad_zero;
  jcc(Assembler::zero, zf_correct);
  jmp(zf_bad_zero);
#endif

  bind(slow_path);
  if (stub != nullptr) {
    // The stub's continuation label is bound to the same position as slow_path.
    bind(stub->slow_path_continuation());
  }
#ifdef ASSERT
  // Check that stub->continuation() label is reached with ZF not set.
  jcc(Assembler::notZero, zf_correct);
  stop("Fast Unlock ZF != 0");
  bind(zf_bad_zero);
  stop("Fast Unlock ZF != 1");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}
598
// Failure handler for the check emitted by verify_int_in_range(), which calls
// it with the arguments in c_rarg0..c_rarg3. Reports idx, the observed value
// and the expected [lo, hi] bounds through fatal().
static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) {
  fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi);
}
602
603 static void reconstruct_frame_pointer_helper(MacroAssembler* masm, Register dst) {
604 const int framesize = Compile::current()->output()->frame_size_in_bytes();
605 masm->movptr(dst, rsp);
606 if (framesize > 2 * wordSize) {
607 masm->addptr(dst, framesize - 2 * wordSize);
608 }
609 }
610
611 void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) {
612 if (PreserveFramePointer) {
613 // frame pointer is valid
614 #ifdef ASSERT
615 // Verify frame pointer value in rbp.
616 reconstruct_frame_pointer_helper(this, rtmp);
617 Label L_success;
618 cmpq(rbp, rtmp);
619 jccb(Assembler::equal, L_success);
620 STOP("frame pointer mismatch");
621 bind(L_success);
622 #endif // ASSERT
623 } else {
624 reconstruct_frame_pointer_helper(this, rbp);
625 }
626 }
627
628 void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register val) {
629 jint lo = t->_lo;
630 jint hi = t->_hi;
631 assert(lo < hi, "type should not be empty or constant, idx: %u, lo: %d, hi: %d", idx, lo, hi);
632 if (t == TypeInt::INT) {
633 return;
634 }
635
636 BLOCK_COMMENT("CastII {");
637 Label fail;
638 Label succeed;
639
640 if (lo != min_jint) {
641 cmpl(val, lo);
642 jccb(Assembler::less, fail);
643 }
644 if (hi != max_jint) {
645 cmpl(val, hi);
646 jccb(Assembler::greater, fail);
647 }
648 jmpb(succeed);
649
650 bind(fail);
651 movl(c_rarg0, idx);
652 movl(c_rarg1, val);
653 movl(c_rarg2, lo);
654 movl(c_rarg3, hi);
655 reconstruct_frame_pointer(rscratch1);
656 call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_int_in_range)));
657 hlt();
658 bind(succeed);
659 BLOCK_COMMENT("} // CastII");
660 }
661
// Failure handler for the check emitted by verify_long_in_range(), which calls
// it with the arguments in c_rarg0..c_rarg3. Reports idx, the observed value
// and the expected [lo, hi] bounds through fatal().
static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) {
  fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi);
}
665
666 void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register val, Register tmp) {
667 jlong lo = t->_lo;
668 jlong hi = t->_hi;
669 assert(lo < hi, "type should not be empty or constant, idx: %u, lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, lo, hi);
670 if (t == TypeLong::LONG) {
671 return;
672 }
673
674 BLOCK_COMMENT("CastLL {");
675 Label fail;
676 Label succeed;
677
678 auto cmp_val = [&](jlong bound) {
679 if (is_simm32(bound)) {
680 cmpq(val, checked_cast<int>(bound));
681 } else {
682 mov64(tmp, bound);
683 cmpq(val, tmp);
684 }
685 };
686
687 if (lo != min_jlong) {
688 cmp_val(lo);
689 jccb(Assembler::less, fail);
690 }
691 if (hi != max_jlong) {
692 cmp_val(hi);
693 jccb(Assembler::greater, fail);
694 }
695 jmpb(succeed);
696
697 bind(fail);
698 movl(c_rarg0, idx);
699 movq(c_rarg1, val);
700 mov64(c_rarg2, lo);
701 mov64(c_rarg3, hi);
702 reconstruct_frame_pointer(rscratch1);
703 call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_long_in_range)));
704 hlt();
705 bind(succeed);
706 BLOCK_COMMENT("} // CastLL");
707 }
708
709 //-------------------------------------------------------------------------------------------
710 // Generic instructions support for use in .ad files C2 code generation
711
712 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
713 if (dst != src) {
714 movdqu(dst, src);
715 }
716 if (opcode == Op_AbsVD) {
717 andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
718 } else {
719 assert((opcode == Op_NegVD),"opcode should be Op_NegD");
720 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
721 }
722 }
723
724 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
725 if (opcode == Op_AbsVD) {
726 vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
727 } else {
728 assert((opcode == Op_NegVD),"opcode should be Op_NegD");
729 vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
730 }
731 }
732
733 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
734 if (dst != src) {
735 movdqu(dst, src);
736 }
737 if (opcode == Op_AbsVF) {
738 andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
739 } else {
740 assert((opcode == Op_NegVF),"opcode should be Op_NegF");
741 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
742 }
743 }
744
745 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
746 if (opcode == Op_AbsVF) {
747 vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
748 } else {
749 assert((opcode == Op_NegVF),"opcode should be Op_NegF");
750 vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
751 }
752 }
753
754 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
755 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
756 assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
757
758 if (opcode == Op_MinV) {
759 if (elem_bt == T_BYTE) {
760 pminsb(dst, src);
761 } else if (elem_bt == T_SHORT) {
762 pminsw(dst, src);
763 } else if (elem_bt == T_INT) {
764 pminsd(dst, src);
765 } else {
766 assert(elem_bt == T_LONG, "required");
767 assert(tmp == xmm0, "required");
768 assert_different_registers(dst, src, tmp);
769 movdqu(xmm0, dst);
770 pcmpgtq(xmm0, src);
771 blendvpd(dst, src); // xmm0 as mask
772 }
773 } else { // opcode == Op_MaxV
774 if (elem_bt == T_BYTE) {
775 pmaxsb(dst, src);
776 } else if (elem_bt == T_SHORT) {
777 pmaxsw(dst, src);
778 } else if (elem_bt == T_INT) {
779 pmaxsd(dst, src);
780 } else {
781 assert(elem_bt == T_LONG, "required");
782 assert(tmp == xmm0, "required");
783 assert_different_registers(dst, src, tmp);
784 movdqu(xmm0, src);
785 pcmpgtq(xmm0, dst);
786 blendvpd(dst, src); // xmm0 as mask
787 }
788 }
789 }
790
791 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
792 XMMRegister src1, Address src2, int vlen_enc) {
793 assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
794 if (opcode == Op_UMinV) {
795 switch(elem_bt) {
796 case T_BYTE: vpminub(dst, src1, src2, vlen_enc); break;
797 case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
798 case T_INT: vpminud(dst, src1, src2, vlen_enc); break;
799 case T_LONG: evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
800 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
801 }
802 } else {
803 assert(opcode == Op_UMaxV, "required");
804 switch(elem_bt) {
805 case T_BYTE: vpmaxub(dst, src1, src2, vlen_enc); break;
806 case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
807 case T_INT: vpmaxud(dst, src1, src2, vlen_enc); break;
808 case T_LONG: evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
809 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
810 }
811 }
812 }
813
814 void C2_MacroAssembler::vpuminmaxq(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
815 // For optimality, leverage a full vector width of 512 bits
816 // for operations over smaller vector sizes on AVX512 targets.
817 if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
818 if (opcode == Op_UMaxV) {
819 evpmaxuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
820 } else {
821 assert(opcode == Op_UMinV, "required");
822 evpminuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
823 }
824 } else {
825 // T1 = -1
826 vpcmpeqq(xtmp1, xtmp1, xtmp1, vlen_enc);
827 // T1 = -1 << 63
828 vpsllq(xtmp1, xtmp1, 63, vlen_enc);
829 // Convert SRC2 to signed value i.e. T2 = T1 + SRC2
830 vpaddq(xtmp2, xtmp1, src2, vlen_enc);
831 // Convert SRC1 to signed value i.e. T1 = T1 + SRC1
832 vpaddq(xtmp1, xtmp1, src1, vlen_enc);
833 // Mask = T2 > T1
834 vpcmpgtq(xtmp1, xtmp2, xtmp1, vlen_enc);
835 if (opcode == Op_UMaxV) {
836 // Res = Mask ? Src2 : Src1
837 vpblendvb(dst, src1, src2, xtmp1, vlen_enc);
838 } else {
839 // Res = Mask ? Src1 : Src2
840 vpblendvb(dst, src2, src1, xtmp1, vlen_enc);
841 }
842 }
843 }
844
845 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
846 XMMRegister src1, XMMRegister src2, int vlen_enc) {
847 assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
848 if (opcode == Op_UMinV) {
849 switch(elem_bt) {
850 case T_BYTE: vpminub(dst, src1, src2, vlen_enc); break;
851 case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
852 case T_INT: vpminud(dst, src1, src2, vlen_enc); break;
853 case T_LONG: evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
854 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
855 }
856 } else {
857 assert(opcode == Op_UMaxV, "required");
858 switch(elem_bt) {
859 case T_BYTE: vpmaxub(dst, src1, src2, vlen_enc); break;
860 case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
861 case T_INT: vpmaxud(dst, src1, src2, vlen_enc); break;
862 case T_LONG: evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
863 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
864 }
865 }
866 }
867
868 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
869 XMMRegister dst, XMMRegister src1, XMMRegister src2,
870 int vlen_enc) {
871 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
872
873 if (opcode == Op_MinV) {
874 if (elem_bt == T_BYTE) {
875 vpminsb(dst, src1, src2, vlen_enc);
876 } else if (elem_bt == T_SHORT) {
877 vpminsw(dst, src1, src2, vlen_enc);
878 } else if (elem_bt == T_INT) {
879 vpminsd(dst, src1, src2, vlen_enc);
880 } else {
881 assert(elem_bt == T_LONG, "required");
882 if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
883 vpminsq(dst, src1, src2, vlen_enc);
884 } else {
885 assert_different_registers(dst, src1, src2);
886 vpcmpgtq(dst, src1, src2, vlen_enc);
887 vblendvpd(dst, src1, src2, dst, vlen_enc);
888 }
889 }
890 } else { // opcode == Op_MaxV
891 if (elem_bt == T_BYTE) {
892 vpmaxsb(dst, src1, src2, vlen_enc);
893 } else if (elem_bt == T_SHORT) {
894 vpmaxsw(dst, src1, src2, vlen_enc);
895 } else if (elem_bt == T_INT) {
896 vpmaxsd(dst, src1, src2, vlen_enc);
897 } else {
898 assert(elem_bt == T_LONG, "required");
899 if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
900 vpmaxsq(dst, src1, src2, vlen_enc);
901 } else {
902 assert_different_registers(dst, src1, src2);
903 vpcmpgtq(dst, src1, src2, vlen_enc);
904 vblendvpd(dst, src2, src1, dst, vlen_enc);
905 }
906 }
907 }
908 }
909
910 // Float/Double min max
911
912 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
913 XMMRegister dst, XMMRegister a, XMMRegister b,
914 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
915 int vlen_enc) {
916 assert(UseAVX > 0, "required");
917 assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
918 opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
919 assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
920 assert_different_registers(a, tmp, atmp, btmp);
921 assert_different_registers(b, tmp, atmp, btmp);
922
923 bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
924 bool is_double_word = is_double_word_type(elem_bt);
925
926 /* Note on 'non-obvious' assembly sequence:
927 *
928 * While there are vminps/vmaxps instructions, there are two important differences between hardware
929 * and Java on how they handle floats:
930 * a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
931 * b. NaN is not necesarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
932 *
933 * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
934 * a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
935 * (only useful when signs differ, noop otherwise)
936 * b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
937
938 * Following pseudo code describes the algorithm for max[FD] (Min algorithm is on similar lines):
939 * btmp = (b < +0.0) ? a : b
940 * atmp = (b < +0.0) ? b : a
941 * Tmp = Max_Float(atmp , btmp)
942 * Res = (atmp == NaN) ? atmp : Tmp
943 */
944
945 void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
946 void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
947 void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
948 XMMRegister mask;
949
950 if (!is_double_word && is_min) {
951 mask = a;
952 vblend = &MacroAssembler::vblendvps;
953 vmaxmin = &MacroAssembler::vminps;
954 vcmp = &MacroAssembler::vcmpps;
955 } else if (!is_double_word && !is_min) {
956 mask = b;
957 vblend = &MacroAssembler::vblendvps;
958 vmaxmin = &MacroAssembler::vmaxps;
959 vcmp = &MacroAssembler::vcmpps;
960 } else if (is_double_word && is_min) {
961 mask = a;
962 vblend = &MacroAssembler::vblendvpd;
963 vmaxmin = &MacroAssembler::vminpd;
964 vcmp = &MacroAssembler::vcmppd;
965 } else {
966 assert(is_double_word && !is_min, "sanity");
967 mask = b;
968 vblend = &MacroAssembler::vblendvpd;
969 vmaxmin = &MacroAssembler::vmaxpd;
970 vcmp = &MacroAssembler::vcmppd;
971 }
972
973 // Make sure EnableX86ECoreOpts isn't disabled on register overlaps
974 XMMRegister maxmin, scratch;
975 if (dst == btmp) {
976 maxmin = btmp;
977 scratch = tmp;
978 } else {
979 maxmin = tmp;
980 scratch = btmp;
981 }
982
983 bool precompute_mask = EnableX86ECoreOpts && UseAVX>1;
984 if (precompute_mask && !is_double_word) {
985 vpsrad(tmp, mask, 32, vlen_enc);
986 mask = tmp;
987 } else if (precompute_mask && is_double_word) {
988 vpxor(tmp, tmp, tmp, vlen_enc);
989 vpcmpgtq(tmp, tmp, mask, vlen_enc);
990 mask = tmp;
991 }
992
993 (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
994 (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
995 (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
996 (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
997 (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
998 }
999
1000 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
1001 XMMRegister dst, XMMRegister a, XMMRegister b,
1002 KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
1003 int vlen_enc) {
1004 assert(UseAVX > 2, "required");
1005 assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1006 opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1007 assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1008 assert_different_registers(dst, a, atmp, btmp);
1009 assert_different_registers(dst, b, atmp, btmp);
1010
1011 bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1012 bool is_double_word = is_double_word_type(elem_bt);
1013 bool merge = true;
1014
1015 if (!is_double_word && is_min) {
1016 evpmovd2m(ktmp, a, vlen_enc);
1017 evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1018 evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1019 vminps(dst, atmp, btmp, vlen_enc);
1020 evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1021 evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1022 } else if (!is_double_word && !is_min) {
1023 evpmovd2m(ktmp, b, vlen_enc);
1024 evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1025 evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1026 vmaxps(dst, atmp, btmp, vlen_enc);
1027 evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1028 evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1029 } else if (is_double_word && is_min) {
1030 evpmovq2m(ktmp, a, vlen_enc);
1031 evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1032 evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1033 vminpd(dst, atmp, btmp, vlen_enc);
1034 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1035 evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1036 } else {
1037 assert(is_double_word && !is_min, "sanity");
1038 evpmovq2m(ktmp, b, vlen_enc);
1039 evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1040 evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1041 vmaxpd(dst, atmp, btmp, vlen_enc);
1042 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1043 evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1044 }
1045 }
1046
1047 void C2_MacroAssembler::vminmax_fp_avx10_2(int opc, BasicType elem_bt, XMMRegister dst, KRegister mask,
1048 XMMRegister src1, XMMRegister src2, int vlen_enc) {
1049 assert(opc == Op_MinV || opc == Op_MinReductionV ||
1050 opc == Op_MaxV || opc == Op_MaxReductionV, "sanity");
1051
1052 int imm8 = (opc == Op_MinV || opc == Op_MinReductionV) ? AVX10_2_MINMAX_MIN_COMPARE_SIGN
1053 : AVX10_2_MINMAX_MAX_COMPARE_SIGN;
1054 if (elem_bt == T_FLOAT) {
1055 evminmaxps(dst, mask, src1, src2, true, imm8, vlen_enc);
1056 } else {
1057 assert(elem_bt == T_DOUBLE, "");
1058 evminmaxpd(dst, mask, src1, src2, true, imm8, vlen_enc);
1059 }
1060 }
1061
1062 void C2_MacroAssembler::sminmax_fp_avx10_2(int opc, BasicType elem_bt, XMMRegister dst, KRegister mask,
1063 XMMRegister src1, XMMRegister src2) {
1064 assert(opc == Op_MinF || opc == Op_MaxF ||
1065 opc == Op_MinD || opc == Op_MaxD, "sanity");
1066
1067 int imm8 = (opc == Op_MinF || opc == Op_MinD) ? AVX10_2_MINMAX_MIN_COMPARE_SIGN
1068 : AVX10_2_MINMAX_MAX_COMPARE_SIGN;
1069 if (elem_bt == T_FLOAT) {
1070 evminmaxss(dst, mask, src1, src2, true, imm8);
1071 } else {
1072 assert(elem_bt == T_DOUBLE, "");
1073 evminmaxsd(dst, mask, src1, src2, true, imm8);
1074 }
1075 }
1076
1077 // Float/Double signum
1078 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
1079 assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");
1080
1081 Label DONE_LABEL;
1082
1083 // Handle special cases +0.0/-0.0 and NaN, if argument is +0.0/-0.0 or NaN, return argument
1084 // If AVX10.2 (or newer) floating point comparison instructions used, SF=1 for equal and unordered cases
1085 // If other floating point comparison instructions used, ZF=1 for equal and unordered cases
1086 if (opcode == Op_SignumF) {
1087 if (VM_Version::supports_avx10_2()) {
1088 evucomxss(dst, zero);
1089 jcc(Assembler::negative, DONE_LABEL);
1090 } else {
1091 ucomiss(dst, zero);
1092 jcc(Assembler::equal, DONE_LABEL);
1093 }
1094 movflt(dst, one);
1095 jcc(Assembler::above, DONE_LABEL);
1096 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
1097 } else if (opcode == Op_SignumD) {
1098 if (VM_Version::supports_avx10_2()) {
1099 evucomxsd(dst, zero);
1100 jcc(Assembler::negative, DONE_LABEL);
1101 } else {
1102 ucomisd(dst, zero);
1103 jcc(Assembler::equal, DONE_LABEL);
1104 }
1105 movdbl(dst, one);
1106 jcc(Assembler::above, DONE_LABEL);
1107 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
1108 }
1109
1110 bind(DONE_LABEL);
1111 }
1112
1113 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1114 if (sign) {
1115 pmovsxbw(dst, src);
1116 } else {
1117 pmovzxbw(dst, src);
1118 }
1119 }
1120
1121 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1122 if (sign) {
1123 vpmovsxbw(dst, src, vector_len);
1124 } else {
1125 vpmovzxbw(dst, src, vector_len);
1126 }
1127 }
1128
1129 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1130 if (sign) {
1131 vpmovsxbd(dst, src, vector_len);
1132 } else {
1133 vpmovzxbd(dst, src, vector_len);
1134 }
1135 }
1136
1137 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1138 if (sign) {
1139 vpmovsxwd(dst, src, vector_len);
1140 } else {
1141 vpmovzxwd(dst, src, vector_len);
1142 }
1143 }
1144
1145 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1146 int shift, int vector_len) {
1147 if (opcode == Op_RotateLeftV) {
1148 if (etype == T_INT) {
1149 evprold(dst, src, shift, vector_len);
1150 } else {
1151 assert(etype == T_LONG, "expected type T_LONG");
1152 evprolq(dst, src, shift, vector_len);
1153 }
1154 } else {
1155 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1156 if (etype == T_INT) {
1157 evprord(dst, src, shift, vector_len);
1158 } else {
1159 assert(etype == T_LONG, "expected type T_LONG");
1160 evprorq(dst, src, shift, vector_len);
1161 }
1162 }
1163 }
1164
1165 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1166 XMMRegister shift, int vector_len) {
1167 if (opcode == Op_RotateLeftV) {
1168 if (etype == T_INT) {
1169 evprolvd(dst, src, shift, vector_len);
1170 } else {
1171 assert(etype == T_LONG, "expected type T_LONG");
1172 evprolvq(dst, src, shift, vector_len);
1173 }
1174 } else {
1175 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1176 if (etype == T_INT) {
1177 evprorvd(dst, src, shift, vector_len);
1178 } else {
1179 assert(etype == T_LONG, "expected type T_LONG");
1180 evprorvq(dst, src, shift, vector_len);
1181 }
1182 }
1183 }
1184
1185 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1186 if (opcode == Op_RShiftVI) {
1187 psrad(dst, shift);
1188 } else if (opcode == Op_LShiftVI) {
1189 pslld(dst, shift);
1190 } else {
1191 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1192 psrld(dst, shift);
1193 }
1194 }
1195
1196 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1197 switch (opcode) {
1198 case Op_RShiftVI: psrad(dst, shift); break;
1199 case Op_LShiftVI: pslld(dst, shift); break;
1200 case Op_URShiftVI: psrld(dst, shift); break;
1201
1202 default: assert(false, "%s", NodeClassNames[opcode]);
1203 }
1204 }
1205
1206 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1207 if (opcode == Op_RShiftVI) {
1208 vpsrad(dst, nds, shift, vector_len);
1209 } else if (opcode == Op_LShiftVI) {
1210 vpslld(dst, nds, shift, vector_len);
1211 } else {
1212 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1213 vpsrld(dst, nds, shift, vector_len);
1214 }
1215 }
1216
1217 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1218 switch (opcode) {
1219 case Op_RShiftVI: vpsrad(dst, src, shift, vlen_enc); break;
1220 case Op_LShiftVI: vpslld(dst, src, shift, vlen_enc); break;
1221 case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1222
1223 default: assert(false, "%s", NodeClassNames[opcode]);
1224 }
1225 }
1226
1227 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1228 switch (opcode) {
1229 case Op_RShiftVB: // fall-through
1230 case Op_RShiftVS: psraw(dst, shift); break;
1231
1232 case Op_LShiftVB: // fall-through
1233 case Op_LShiftVS: psllw(dst, shift); break;
1234
1235 case Op_URShiftVS: // fall-through
1236 case Op_URShiftVB: psrlw(dst, shift); break;
1237
1238 default: assert(false, "%s", NodeClassNames[opcode]);
1239 }
1240 }
1241
1242 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1243 switch (opcode) {
1244 case Op_RShiftVB: // fall-through
1245 case Op_RShiftVS: vpsraw(dst, src, shift, vlen_enc); break;
1246
1247 case Op_LShiftVB: // fall-through
1248 case Op_LShiftVS: vpsllw(dst, src, shift, vlen_enc); break;
1249
1250 case Op_URShiftVS: // fall-through
1251 case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1252
1253 default: assert(false, "%s", NodeClassNames[opcode]);
1254 }
1255 }
1256
1257 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1258 switch (opcode) {
1259 case Op_RShiftVL: psrlq(dst, shift); break; // using srl to implement sra on pre-avs512 systems
1260 case Op_LShiftVL: psllq(dst, shift); break;
1261 case Op_URShiftVL: psrlq(dst, shift); break;
1262
1263 default: assert(false, "%s", NodeClassNames[opcode]);
1264 }
1265 }
1266
1267 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1268 if (opcode == Op_RShiftVL) {
1269 psrlq(dst, shift); // using srl to implement sra on pre-avs512 systems
1270 } else if (opcode == Op_LShiftVL) {
1271 psllq(dst, shift);
1272 } else {
1273 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1274 psrlq(dst, shift);
1275 }
1276 }
1277
1278 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1279 switch (opcode) {
1280 case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1281 case Op_LShiftVL: vpsllq(dst, src, shift, vlen_enc); break;
1282 case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1283
1284 default: assert(false, "%s", NodeClassNames[opcode]);
1285 }
1286 }
1287
1288 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1289 if (opcode == Op_RShiftVL) {
1290 evpsraq(dst, nds, shift, vector_len);
1291 } else if (opcode == Op_LShiftVL) {
1292 vpsllq(dst, nds, shift, vector_len);
1293 } else {
1294 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1295 vpsrlq(dst, nds, shift, vector_len);
1296 }
1297 }
1298
1299 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1300 switch (opcode) {
1301 case Op_RShiftVB: // fall-through
1302 case Op_RShiftVS: // fall-through
1303 case Op_RShiftVI: vpsravd(dst, src, shift, vlen_enc); break;
1304
1305 case Op_LShiftVB: // fall-through
1306 case Op_LShiftVS: // fall-through
1307 case Op_LShiftVI: vpsllvd(dst, src, shift, vlen_enc); break;
1308
1309 case Op_URShiftVB: // fall-through
1310 case Op_URShiftVS: // fall-through
1311 case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1312
1313 default: assert(false, "%s", NodeClassNames[opcode]);
1314 }
1315 }
1316
1317 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1318 switch (opcode) {
1319 case Op_RShiftVB: // fall-through
1320 case Op_RShiftVS: evpsravw(dst, src, shift, vlen_enc); break;
1321
1322 case Op_LShiftVB: // fall-through
1323 case Op_LShiftVS: evpsllvw(dst, src, shift, vlen_enc); break;
1324
1325 case Op_URShiftVB: // fall-through
1326 case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1327
1328 default: assert(false, "%s", NodeClassNames[opcode]);
1329 }
1330 }
1331
1332 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1333 assert(UseAVX >= 2, "required");
1334 switch (opcode) {
1335 case Op_RShiftVL: {
1336 if (UseAVX > 2) {
1337 assert(tmp == xnoreg, "not used");
1338 if (!VM_Version::supports_avx512vl()) {
1339 vlen_enc = Assembler::AVX_512bit;
1340 }
1341 evpsravq(dst, src, shift, vlen_enc);
1342 } else {
1343 vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1344 vpsrlvq(dst, src, shift, vlen_enc);
1345 vpsrlvq(tmp, tmp, shift, vlen_enc);
1346 vpxor(dst, dst, tmp, vlen_enc);
1347 vpsubq(dst, dst, tmp, vlen_enc);
1348 }
1349 break;
1350 }
1351 case Op_LShiftVL: {
1352 assert(tmp == xnoreg, "not used");
1353 vpsllvq(dst, src, shift, vlen_enc);
1354 break;
1355 }
1356 case Op_URShiftVL: {
1357 assert(tmp == xnoreg, "not used");
1358 vpsrlvq(dst, src, shift, vlen_enc);
1359 break;
1360 }
1361 default: assert(false, "%s", NodeClassNames[opcode]);
1362 }
1363 }
1364
1365 // Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst
1366 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1367 assert(opcode == Op_LShiftVB ||
1368 opcode == Op_RShiftVB ||
1369 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1370 bool sign = (opcode != Op_URShiftVB);
1371 assert(vector_len == 0, "required");
1372 vextendbd(sign, dst, src, 1);
1373 vpmovzxbd(vtmp, shift, 1);
1374 varshiftd(opcode, dst, dst, vtmp, 1);
1375 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
1376 vextracti128_high(vtmp, dst);
1377 vpackusdw(dst, dst, vtmp, 0);
1378 }
1379
1380 // Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst
1381 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1382 assert(opcode == Op_LShiftVB ||
1383 opcode == Op_RShiftVB ||
1384 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1385 bool sign = (opcode != Op_URShiftVB);
1386 int ext_vector_len = vector_len + 1;
1387 vextendbw(sign, dst, src, ext_vector_len);
1388 vpmovzxbw(vtmp, shift, ext_vector_len);
1389 varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1390 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
1391 if (vector_len == 0) {
1392 vextracti128_high(vtmp, dst);
1393 vpackuswb(dst, dst, vtmp, vector_len);
1394 } else {
1395 vextracti64x4_high(vtmp, dst);
1396 vpackuswb(dst, dst, vtmp, vector_len);
1397 vpermq(dst, dst, 0xD8, vector_len);
1398 }
1399 }
1400
1401 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1402 switch(typ) {
1403 case T_BYTE:
1404 pinsrb(dst, val, idx);
1405 break;
1406 case T_SHORT:
1407 pinsrw(dst, val, idx);
1408 break;
1409 case T_INT:
1410 pinsrd(dst, val, idx);
1411 break;
1412 case T_LONG:
1413 pinsrq(dst, val, idx);
1414 break;
1415 default:
1416 assert(false,"Should not reach here.");
1417 break;
1418 }
1419 }
1420
1421 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1422 switch(typ) {
1423 case T_BYTE:
1424 vpinsrb(dst, src, val, idx);
1425 break;
1426 case T_SHORT:
1427 vpinsrw(dst, src, val, idx);
1428 break;
1429 case T_INT:
1430 vpinsrd(dst, src, val, idx);
1431 break;
1432 case T_LONG:
1433 vpinsrq(dst, src, val, idx);
1434 break;
1435 default:
1436 assert(false,"Should not reach here.");
1437 break;
1438 }
1439 }
1440
1441 void C2_MacroAssembler::vgather8b_masked(BasicType elem_bt, XMMRegister dst,
1442 Register base, Register idx_base,
1443 Register mask, Register mask_idx,
1444 Register rtmp, int vlen_enc) {
1445 vpxor(dst, dst, dst, vlen_enc);
1446 if (elem_bt == T_SHORT) {
1447 for (int i = 0; i < 4; i++) {
1448 // dst[i] = mask[i] ? src[idx_base[i]] : 0
1449 Label skip_load;
1450 btq(mask, mask_idx);
1451 jccb(Assembler::carryClear, skip_load);
1452 movl(rtmp, Address(idx_base, i * 4));
1453 pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1454 bind(skip_load);
1455 incq(mask_idx);
1456 }
1457 } else {
1458 assert(elem_bt == T_BYTE, "");
1459 for (int i = 0; i < 8; i++) {
1460 // dst[i] = mask[i] ? src[idx_base[i]] : 0
1461 Label skip_load;
1462 btq(mask, mask_idx);
1463 jccb(Assembler::carryClear, skip_load);
1464 movl(rtmp, Address(idx_base, i * 4));
1465 pinsrb(dst, Address(base, rtmp), i);
1466 bind(skip_load);
1467 incq(mask_idx);
1468 }
1469 }
1470 }
1471
1472 void C2_MacroAssembler::vgather8b(BasicType elem_bt, XMMRegister dst,
1473 Register base, Register idx_base,
1474 Register rtmp, int vlen_enc) {
1475 vpxor(dst, dst, dst, vlen_enc);
1476 if (elem_bt == T_SHORT) {
1477 for (int i = 0; i < 4; i++) {
1478 // dst[i] = src[idx_base[i]]
1479 movl(rtmp, Address(idx_base, i * 4));
1480 pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1481 }
1482 } else {
1483 assert(elem_bt == T_BYTE, "");
1484 for (int i = 0; i < 8; i++) {
1485 // dst[i] = src[idx_base[i]]
1486 movl(rtmp, Address(idx_base, i * 4));
1487 pinsrb(dst, Address(base, rtmp), i);
1488 }
1489 }
1490 }
1491
1492 /*
1493 * Gather using hybrid algorithm, first partially unroll scalar loop
1494 * to accumulate values from gather indices into a quad-word(64bit) slice.
1495 * A slice may hold 8 bytes or 4 short values. This is followed by a vector
1496 * permutation to place the slice into appropriate vector lane
1497 * locations in destination vector. Following pseudo code describes the
1498 * algorithm in detail:
1499 *
1500 * DST_VEC = ZERO_VEC
1501 * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..}
1502 * TWO_VEC = {2, 2, 2, 2, 2, 2, 2, 2, 2..}
1503 * FOREACH_ITER:
1504 * TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES
1505 * TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX
1506 * DST_VEC = DST_VEC OR TEMP_PERM_VEC
1507 * PERM_INDEX = PERM_INDEX - TWO_VEC
1508 *
1509 * With each iteration, doubleword permute indices (0,1) corresponding
1510 * to gathered quadword gets right shifted by two lane positions.
1511 *
1512 */
1513 void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst,
1514 Register base, Register idx_base,
1515 Register mask, XMMRegister xtmp1,
1516 XMMRegister xtmp2, XMMRegister temp_dst,
1517 Register rtmp, Register mask_idx,
1518 Register length, int vector_len, int vlen_enc) {
1519 Label GATHER8_LOOP;
1520 assert(is_subword_type(elem_ty), "");
1521 movl(length, vector_len);
1522 vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...}
1523 vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...}
1524 vallones(xtmp2, vlen_enc);
1525 vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc);
1526 vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...}
1527 load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...}
1528
1529 bind(GATHER8_LOOP);
1530 // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES
1531 if (mask == noreg) {
1532 vgather8b(elem_ty, temp_dst, base, idx_base, rtmp, vlen_enc);
1533 } else {
1534 vgather8b_masked(elem_ty, temp_dst, base, idx_base, mask, mask_idx, rtmp, vlen_enc);
1535 }
1536 // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1)
1537 vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? vlen_enc : Assembler::AVX_256bit);
1538 // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2)
1539 vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc);
1540 // DST_VEC = DST_VEC OR TEMP_PERM_VEC
1541 vpor(dst, dst, temp_dst, vlen_enc);
1542 addptr(idx_base, 32 >> (type2aelembytes(elem_ty) - 1));
1543 subl(length, 8 >> (type2aelembytes(elem_ty) - 1));
1544 jcc(Assembler::notEqual, GATHER8_LOOP);
1545 }
1546
1547 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1548 switch(typ) {
1549 case T_INT:
1550 vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1551 break;
1552 case T_FLOAT:
1553 vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1554 break;
1555 case T_LONG:
1556 vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1557 break;
1558 case T_DOUBLE:
1559 vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1560 break;
1561 default:
1562 assert(false,"Should not reach here.");
1563 break;
1564 }
1565 }
1566
1567 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1568 switch(typ) {
1569 case T_INT:
1570 evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1571 break;
1572 case T_FLOAT:
1573 evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1574 break;
1575 case T_LONG:
1576 evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1577 break;
1578 case T_DOUBLE:
1579 evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1580 break;
1581 default:
1582 assert(false,"Should not reach here.");
1583 break;
1584 }
1585 }
1586
1587 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1588 switch(typ) {
1589 case T_INT:
1590 evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1591 break;
1592 case T_FLOAT:
1593 evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1594 break;
1595 case T_LONG:
1596 evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1597 break;
1598 case T_DOUBLE:
1599 evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1600 break;
1601 default:
1602 assert(false,"Should not reach here.");
1603 break;
1604 }
1605 }
1606
1607 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
1608 if (vlen_in_bytes <= 16) {
1609 pxor (dst, dst);
1610 psubb(dst, src);
1611 switch (elem_bt) {
1612 case T_BYTE: /* nothing to do */ break;
1613 case T_SHORT: pmovsxbw(dst, dst); break;
1614 case T_INT: pmovsxbd(dst, dst); break;
1615 case T_FLOAT: pmovsxbd(dst, dst); break;
1616 case T_LONG: pmovsxbq(dst, dst); break;
1617 case T_DOUBLE: pmovsxbq(dst, dst); break;
1618
1619 default: assert(false, "%s", type2name(elem_bt));
1620 }
1621 } else {
1622 assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
1623 int vlen_enc = vector_length_encoding(vlen_in_bytes);
1624
1625 vpxor (dst, dst, dst, vlen_enc);
1626 vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
1627
1628 switch (elem_bt) {
1629 case T_BYTE: /* nothing to do */ break;
1630 case T_SHORT: vpmovsxbw(dst, dst, vlen_enc); break;
1631 case T_INT: vpmovsxbd(dst, dst, vlen_enc); break;
1632 case T_FLOAT: vpmovsxbd(dst, dst, vlen_enc); break;
1633 case T_LONG: vpmovsxbq(dst, dst, vlen_enc); break;
1634 case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1635
1636 default: assert(false, "%s", type2name(elem_bt));
1637 }
1638 }
1639 }
1640
1641 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
1642 if (novlbwdq) {
1643 vpmovsxbd(xtmp, src, vlen_enc);
1644 evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
1645 Assembler::eq, true, vlen_enc, noreg);
1646 } else {
1647 vpxor(xtmp, xtmp, xtmp, vlen_enc);
1648 vpsubb(xtmp, xtmp, src, vlen_enc);
1649 evpmovb2m(dst, xtmp, vlen_enc);
1650 }
1651 }
1652
1653 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, Address src, int vlen_in_bytes) {
1654 if (is_integral_type(bt)) {
1655 switch (vlen_in_bytes) {
1656 case 4: movdl(dst, src); break;
1657 case 8: movq(dst, src); break;
1658 case 16: movdqu(dst, src); break;
1659 case 32: vmovdqu(dst, src); break;
1660 case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
1661 default: ShouldNotReachHere();
1662 }
1663 } else {
1664 switch (vlen_in_bytes) {
1665 case 4: movflt(dst, src); break;
1666 case 8: movdbl(dst, src); break;
1667 case 16: movups(dst, src); break;
1668 case 32: vmovups(dst, src, Assembler::AVX_256bit); break;
1669 case 64: vmovups(dst, src, Assembler::AVX_512bit); break;
1670 default: ShouldNotReachHere();
1671 }
1672 }
1673 }
1674
1675 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
1676 assert(rscratch != noreg || always_reachable(src), "missing");
1677
1678 if (reachable(src)) {
1679 load_vector(bt, dst, as_Address(src), vlen_in_bytes);
1680 } else {
1681 lea(rscratch, src);
1682 load_vector(bt, dst, Address(rscratch, 0), vlen_in_bytes);
1683 }
1684 }
1685
1686 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
1687 int vlen_enc = vector_length_encoding(vlen);
1688 if (VM_Version::supports_avx()) {
1689 if (bt == T_LONG) {
1690 if (VM_Version::supports_avx2()) {
1691 vpbroadcastq(dst, src, vlen_enc);
1692 } else {
1693 vmovddup(dst, src, vlen_enc);
1694 }
1695 } else if (bt == T_DOUBLE) {
1696 if (vlen_enc != Assembler::AVX_128bit) {
1697 vbroadcastsd(dst, src, vlen_enc, noreg);
1698 } else {
1699 vmovddup(dst, src, vlen_enc);
1700 }
1701 } else {
1702 if (VM_Version::supports_avx2() && is_integral_type(bt)) {
1703 vpbroadcastd(dst, src, vlen_enc);
1704 } else {
1705 vbroadcastss(dst, src, vlen_enc);
1706 }
1707 }
1708 } else if (VM_Version::supports_sse3()) {
1709 movddup(dst, src);
1710 } else {
1711 load_vector(bt, dst, src, vlen);
1712 }
1713 }
1714
1715 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
1716 int entry_idx = vector_iota_entry_index(bt);
1717 ExternalAddress addr(StubRoutines::x86::vector_iota_indices(entry_idx));
1718 load_vector(T_BYTE, dst, addr, vlen_in_bytes);
1719 }
1720
1721 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
1722
1723 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1724 int vector_len = Assembler::AVX_128bit;
1725
1726 switch (opcode) {
1727 case Op_AndReductionV: pand(dst, src); break;
1728 case Op_OrReductionV: por (dst, src); break;
1729 case Op_XorReductionV: pxor(dst, src); break;
1730 case Op_MinReductionV:
1731 switch (typ) {
1732 case T_BYTE: pminsb(dst, src); break;
1733 case T_SHORT: pminsw(dst, src); break;
1734 case T_INT: pminsd(dst, src); break;
1735 case T_LONG: assert(UseAVX > 2, "required");
1736 vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1737 default: assert(false, "wrong type");
1738 }
1739 break;
1740 case Op_MaxReductionV:
1741 switch (typ) {
1742 case T_BYTE: pmaxsb(dst, src); break;
1743 case T_SHORT: pmaxsw(dst, src); break;
1744 case T_INT: pmaxsd(dst, src); break;
1745 case T_LONG: assert(UseAVX > 2, "required");
1746 vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
1747 default: assert(false, "wrong type");
1748 }
1749 break;
1750 case Op_UMinReductionV:
1751 switch (typ) {
1752 case T_BYTE: vpminub(dst, dst, src, Assembler::AVX_128bit); break;
1753 case T_SHORT: vpminuw(dst, dst, src, Assembler::AVX_128bit); break;
1754 case T_INT: vpminud(dst, dst, src, Assembler::AVX_128bit); break;
1755 case T_LONG: evpminuq(dst, k0, dst, src, true, Assembler::AVX_128bit); break;
1756 default: assert(false, "wrong type");
1757 }
1758 break;
1759 case Op_UMaxReductionV:
1760 switch (typ) {
1761 case T_BYTE: vpmaxub(dst, dst, src, Assembler::AVX_128bit); break;
1762 case T_SHORT: vpmaxuw(dst, dst, src, Assembler::AVX_128bit); break;
1763 case T_INT: vpmaxud(dst, dst, src, Assembler::AVX_128bit); break;
1764 case T_LONG: evpmaxuq(dst, k0, dst, src, true, Assembler::AVX_128bit); break;
1765 default: assert(false, "wrong type");
1766 }
1767 break;
1768 case Op_AddReductionVF: addss(dst, src); break;
1769 case Op_AddReductionVD: addsd(dst, src); break;
1770 case Op_AddReductionVI:
1771 switch (typ) {
1772 case T_BYTE: paddb(dst, src); break;
1773 case T_SHORT: paddw(dst, src); break;
1774 case T_INT: paddd(dst, src); break;
1775 default: assert(false, "wrong type");
1776 }
1777 break;
1778 case Op_AddReductionVL: paddq(dst, src); break;
1779 case Op_MulReductionVF: mulss(dst, src); break;
1780 case Op_MulReductionVD: mulsd(dst, src); break;
1781 case Op_MulReductionVI:
1782 switch (typ) {
1783 case T_SHORT: pmullw(dst, src); break;
1784 case T_INT: pmulld(dst, src); break;
1785 default: assert(false, "wrong type");
1786 }
1787 break;
1788 case Op_MulReductionVL: assert(UseAVX > 2, "required");
1789 evpmullq(dst, dst, src, vector_len); break;
1790 default: assert(false, "wrong opcode");
1791 }
1792 }
1793
1794 void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1795 switch (opcode) {
1796 case Op_AddReductionVF: addps(dst, src); break;
1797 case Op_AddReductionVD: addpd(dst, src); break;
1798 case Op_MulReductionVF: mulps(dst, src); break;
1799 case Op_MulReductionVD: mulpd(dst, src); break;
1800 default: assert(false, "%s", NodeClassNames[opcode]);
1801 }
1802 }
1803
1804 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
1805 int vector_len = Assembler::AVX_256bit;
1806
1807 switch (opcode) {
1808 case Op_AndReductionV: vpand(dst, src1, src2, vector_len); break;
1809 case Op_OrReductionV: vpor (dst, src1, src2, vector_len); break;
1810 case Op_XorReductionV: vpxor(dst, src1, src2, vector_len); break;
1811 case Op_MinReductionV:
1812 switch (typ) {
1813 case T_BYTE: vpminsb(dst, src1, src2, vector_len); break;
1814 case T_SHORT: vpminsw(dst, src1, src2, vector_len); break;
1815 case T_INT: vpminsd(dst, src1, src2, vector_len); break;
1816 case T_LONG: assert(UseAVX > 2, "required");
1817 vpminsq(dst, src1, src2, vector_len); break;
1818 default: assert(false, "wrong type");
1819 }
1820 break;
1821 case Op_MaxReductionV:
1822 switch (typ) {
1823 case T_BYTE: vpmaxsb(dst, src1, src2, vector_len); break;
1824 case T_SHORT: vpmaxsw(dst, src1, src2, vector_len); break;
1825 case T_INT: vpmaxsd(dst, src1, src2, vector_len); break;
1826 case T_LONG: assert(UseAVX > 2, "required");
1827 vpmaxsq(dst, src1, src2, vector_len); break;
1828 default: assert(false, "wrong type");
1829 }
1830 break;
1831 case Op_UMinReductionV:
1832 switch (typ) {
1833 case T_BYTE: vpminub(dst, src1, src2, vector_len); break;
1834 case T_SHORT: vpminuw(dst, src1, src2, vector_len); break;
1835 case T_INT: vpminud(dst, src1, src2, vector_len); break;
1836 case T_LONG: evpminuq(dst, k0, src1, src2, true, vector_len); break;
1837 default: assert(false, "wrong type");
1838 }
1839 break;
1840 case Op_UMaxReductionV:
1841 switch (typ) {
1842 case T_BYTE: vpmaxub(dst, src1, src2, vector_len); break;
1843 case T_SHORT: vpmaxuw(dst, src1, src2, vector_len); break;
1844 case T_INT: vpmaxud(dst, src1, src2, vector_len); break;
1845 case T_LONG: evpmaxuq(dst, k0, src1, src2, true, vector_len); break;
1846 default: assert(false, "wrong type");
1847 }
1848 break;
1849 case Op_AddReductionVI:
1850 switch (typ) {
1851 case T_BYTE: vpaddb(dst, src1, src2, vector_len); break;
1852 case T_SHORT: vpaddw(dst, src1, src2, vector_len); break;
1853 case T_INT: vpaddd(dst, src1, src2, vector_len); break;
1854 default: assert(false, "wrong type");
1855 }
1856 break;
1857 case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
1858 case Op_MulReductionVI:
1859 switch (typ) {
1860 case T_SHORT: vpmullw(dst, src1, src2, vector_len); break;
1861 case T_INT: vpmulld(dst, src1, src2, vector_len); break;
1862 default: assert(false, "wrong type");
1863 }
1864 break;
1865 case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break;
1866 default: assert(false, "wrong opcode");
1867 }
1868 }
1869
1870 void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
1871 int vector_len = Assembler::AVX_256bit;
1872
1873 switch (opcode) {
1874 case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break;
1875 case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break;
1876 case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break;
1877 case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break;
1878 default: assert(false, "%s", NodeClassNames[opcode]);
1879 }
1880 }
1881
1882 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1883 XMMRegister dst, XMMRegister src,
1884 XMMRegister vtmp1, XMMRegister vtmp2) {
1885 switch (opcode) {
1886 case Op_AddReductionVF:
1887 case Op_MulReductionVF:
1888 reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1889 break;
1890
1891 case Op_AddReductionVD:
1892 case Op_MulReductionVD:
1893 reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1894 break;
1895
1896 default: assert(false, "wrong opcode");
1897 }
1898 }
1899
1900 void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen,
1901 XMMRegister dst, XMMRegister src,
1902 XMMRegister vtmp1, XMMRegister vtmp2) {
1903 switch (opcode) {
1904 case Op_AddReductionVF:
1905 case Op_MulReductionVF:
1906 unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1907 break;
1908
1909 case Op_AddReductionVD:
1910 case Op_MulReductionVD:
1911 unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1912 break;
1913
1914 default: assert(false, "%s", NodeClassNames[opcode]);
1915 }
1916 }
1917
1918 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1919 Register dst, Register src1, XMMRegister src2,
1920 XMMRegister vtmp1, XMMRegister vtmp2) {
1921 switch (vlen) {
1922 case 8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1923 case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1924 case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1925 case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1926
1927 default: assert(false, "wrong vector length");
1928 }
1929 }
1930
1931 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
1932 Register dst, Register src1, XMMRegister src2,
1933 XMMRegister vtmp1, XMMRegister vtmp2) {
1934 switch (vlen) {
1935 case 8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1936 case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1937 case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1938 case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1939
1940 default: assert(false, "wrong vector length");
1941 }
1942 }
1943
1944 void C2_MacroAssembler::reduceS(int opcode, int vlen,
1945 Register dst, Register src1, XMMRegister src2,
1946 XMMRegister vtmp1, XMMRegister vtmp2) {
1947 switch (vlen) {
1948 case 4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1949 case 8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1950 case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1951 case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1952
1953 default: assert(false, "wrong vector length");
1954 }
1955 }
1956
1957 void C2_MacroAssembler::reduceI(int opcode, int vlen,
1958 Register dst, Register src1, XMMRegister src2,
1959 XMMRegister vtmp1, XMMRegister vtmp2) {
1960 switch (vlen) {
1961 case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1962 case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1963 case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1964 case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1965
1966 default: assert(false, "wrong vector length");
1967 }
1968 }
1969
1970 void C2_MacroAssembler::reduceL(int opcode, int vlen,
1971 Register dst, Register src1, XMMRegister src2,
1972 XMMRegister vtmp1, XMMRegister vtmp2) {
1973 switch (vlen) {
1974 case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1975 case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1976 case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1977
1978 default: assert(false, "wrong vector length");
1979 }
1980 }
1981
1982 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1983 switch (vlen) {
1984 case 2:
1985 assert(vtmp2 == xnoreg, "");
1986 reduce2F(opcode, dst, src, vtmp1);
1987 break;
1988 case 4:
1989 assert(vtmp2 == xnoreg, "");
1990 reduce4F(opcode, dst, src, vtmp1);
1991 break;
1992 case 8:
1993 reduce8F(opcode, dst, src, vtmp1, vtmp2);
1994 break;
1995 case 16:
1996 reduce16F(opcode, dst, src, vtmp1, vtmp2);
1997 break;
1998 default: assert(false, "wrong vector length");
1999 }
2000 }
2001
2002 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2003 switch (vlen) {
2004 case 2:
2005 assert(vtmp2 == xnoreg, "");
2006 reduce2D(opcode, dst, src, vtmp1);
2007 break;
2008 case 4:
2009 reduce4D(opcode, dst, src, vtmp1, vtmp2);
2010 break;
2011 case 8:
2012 reduce8D(opcode, dst, src, vtmp1, vtmp2);
2013 break;
2014 default: assert(false, "wrong vector length");
2015 }
2016 }
2017
2018 void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2019 switch (vlen) {
2020 case 2:
2021 assert(vtmp1 == xnoreg, "");
2022 assert(vtmp2 == xnoreg, "");
2023 unorderedReduce2F(opcode, dst, src);
2024 break;
2025 case 4:
2026 assert(vtmp2 == xnoreg, "");
2027 unorderedReduce4F(opcode, dst, src, vtmp1);
2028 break;
2029 case 8:
2030 unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2);
2031 break;
2032 case 16:
2033 unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2);
2034 break;
2035 default: assert(false, "wrong vector length");
2036 }
2037 }
2038
2039 void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2040 switch (vlen) {
2041 case 2:
2042 assert(vtmp1 == xnoreg, "");
2043 assert(vtmp2 == xnoreg, "");
2044 unorderedReduce2D(opcode, dst, src);
2045 break;
2046 case 4:
2047 assert(vtmp2 == xnoreg, "");
2048 unorderedReduce4D(opcode, dst, src, vtmp1);
2049 break;
2050 case 8:
2051 unorderedReduce8D(opcode, dst, src, vtmp1, vtmp2);
2052 break;
2053 default: assert(false, "wrong vector length");
2054 }
2055 }
2056
2057 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2058 if (opcode == Op_AddReductionVI) {
2059 if (vtmp1 != src2) {
2060 movdqu(vtmp1, src2);
2061 }
2062 phaddd(vtmp1, vtmp1);
2063 } else {
2064 pshufd(vtmp1, src2, 0x1);
2065 reduce_operation_128(T_INT, opcode, vtmp1, src2);
2066 }
2067 movdl(vtmp2, src1);
2068 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2069 movdl(dst, vtmp1);
2070 }
2071
2072 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2073 if (opcode == Op_AddReductionVI) {
2074 if (vtmp1 != src2) {
2075 movdqu(vtmp1, src2);
2076 }
2077 phaddd(vtmp1, src2);
2078 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2079 } else {
2080 pshufd(vtmp2, src2, 0xE);
2081 reduce_operation_128(T_INT, opcode, vtmp2, src2);
2082 reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2083 }
2084 }
2085
2086 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2087 if (opcode == Op_AddReductionVI) {
2088 vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
2089 vextracti128_high(vtmp2, vtmp1);
2090 vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
2091 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2092 } else {
2093 vextracti128_high(vtmp1, src2);
2094 reduce_operation_128(T_INT, opcode, vtmp1, src2);
2095 reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2096 }
2097 }
2098
2099 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2100 vextracti64x4_high(vtmp2, src2);
2101 reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
2102 reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2103 }
2104
2105 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2106 pshufd(vtmp2, src2, 0x1);
2107 reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2108 movdqu(vtmp1, vtmp2);
2109 psrldq(vtmp1, 2);
2110 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2111 movdqu(vtmp2, vtmp1);
2112 psrldq(vtmp2, 1);
2113 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2114 movdl(vtmp2, src1);
2115 if (opcode == Op_UMinReductionV || opcode == Op_UMaxReductionV) {
2116 pmovzxbd(vtmp1, vtmp1);
2117 } else {
2118 pmovsxbd(vtmp1, vtmp1);
2119 }
2120 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2121 pextrb(dst, vtmp1, 0x0);
2122 movsbl(dst, dst);
2123 }
2124
2125 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2126 pshufd(vtmp1, src2, 0xE);
2127 reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
2128 reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2129 }
2130
2131 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2132 vextracti128_high(vtmp2, src2);
2133 reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2134 reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2135 }
2136
2137 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2138 vextracti64x4_high(vtmp1, src2);
2139 reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
2140 reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2141 }
2142
2143 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2144 pmovsxbw(vtmp2, src2);
2145 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2146 }
2147
2148 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2149 if (UseAVX > 1) {
2150 int vector_len = Assembler::AVX_256bit;
2151 vpmovsxbw(vtmp1, src2, vector_len);
2152 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2153 } else {
2154 pmovsxbw(vtmp2, src2);
2155 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2156 pshufd(vtmp2, src2, 0xe);
2157 pmovsxbw(vtmp2, vtmp2);
2158 reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2159 }
2160 }
2161
2162 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2163 if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
2164 int vector_len = Assembler::AVX_512bit;
2165 vpmovsxbw(vtmp1, src2, vector_len);
2166 reduce32S(opcode, dst, src1, vtmp1, vtmp2, vtmp1);
2167 } else {
2168 assert(UseAVX >= 2,"Should not reach here.");
2169 mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
2170 vextracti128_high(vtmp2, src2);
2171 mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2172 }
2173 }
2174
2175 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2176 mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
2177 vextracti64x4_high(vtmp2, src2);
2178 mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2179 }
2180
2181 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2182 if (opcode == Op_AddReductionVI) {
2183 if (vtmp1 != src2) {
2184 movdqu(vtmp1, src2);
2185 }
2186 phaddw(vtmp1, vtmp1);
2187 phaddw(vtmp1, vtmp1);
2188 } else {
2189 pshufd(vtmp2, src2, 0x1);
2190 reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2191 movdqu(vtmp1, vtmp2);
2192 psrldq(vtmp1, 2);
2193 reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
2194 }
2195 movdl(vtmp2, src1);
2196 if (opcode == Op_UMinReductionV || opcode == Op_UMaxReductionV) {
2197 pmovzxwd(vtmp1, vtmp1);
2198 } else {
2199 pmovsxwd(vtmp1, vtmp1);
2200 }
2201 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2202 pextrw(dst, vtmp1, 0x0);
2203 movswl(dst, dst);
2204 }
2205
2206 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2207 if (opcode == Op_AddReductionVI) {
2208 if (vtmp1 != src2) {
2209 movdqu(vtmp1, src2);
2210 }
2211 phaddw(vtmp1, src2);
2212 } else {
2213 assert_different_registers(src2, vtmp1);
2214 pshufd(vtmp1, src2, 0xE);
2215 reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
2216 }
2217 reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2218 }
2219
2220 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2221 if (opcode == Op_AddReductionVI) {
2222 int vector_len = Assembler::AVX_256bit;
2223 vphaddw(vtmp2, src2, src2, vector_len);
2224 vpermq(vtmp2, vtmp2, 0xD8, vector_len);
2225 } else {
2226 assert_different_registers(src2, vtmp2);
2227 vextracti128_high(vtmp2, src2);
2228 reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2229 }
2230 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2231 }
2232
2233 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2234 assert_different_registers(src2, vtmp1);
2235 int vector_len = Assembler::AVX_256bit;
2236 vextracti64x4_high(vtmp1, src2);
2237 reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2238 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2239 }
2240
2241 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2242 pshufd(vtmp2, src2, 0xE);
2243 reduce_operation_128(T_LONG, opcode, vtmp2, src2);
2244 movdq(vtmp1, src1);
2245 reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
2246 movdq(dst, vtmp1);
2247 }
2248
2249 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2250 vextracti128_high(vtmp1, src2);
2251 reduce_operation_128(T_LONG, opcode, vtmp1, src2);
2252 reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2253 }
2254
2255 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2256 vextracti64x4_high(vtmp2, src2);
2257 reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
2258 reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2259 }
2260
2261 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
2262 mov64(temp, -1L);
2263 bzhiq(temp, temp, len);
2264 kmovql(dst, temp);
2265 }
2266
2267 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2268 reduce_operation_128(T_FLOAT, opcode, dst, src);
2269 pshufd(vtmp, src, 0x1);
2270 reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2271 }
2272
2273 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2274 reduce2F(opcode, dst, src, vtmp);
2275 pshufd(vtmp, src, 0x2);
2276 reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2277 pshufd(vtmp, src, 0x3);
2278 reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2279 }
2280
2281 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2282 reduce4F(opcode, dst, src, vtmp2);
2283 vextractf128_high(vtmp2, src);
2284 reduce4F(opcode, dst, vtmp2, vtmp1);
2285 }
2286
2287 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2288 reduce8F(opcode, dst, src, vtmp1, vtmp2);
2289 vextracti64x4_high(vtmp1, src);
2290 reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
2291 }
2292
2293 void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) {
2294 pshufd(dst, src, 0x1);
2295 reduce_operation_128(T_FLOAT, opcode, dst, src);
2296 }
2297
2298 void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2299 pshufd(vtmp, src, 0xE);
2300 unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src);
2301 unorderedReduce2F(opcode, dst, vtmp);
2302 }
2303
2304 void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2305 vextractf128_high(vtmp1, src);
2306 unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src);
2307 unorderedReduce4F(opcode, dst, vtmp1, vtmp2);
2308 }
2309
2310 void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2311 vextractf64x4_high(vtmp2, src);
2312 unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src);
2313 unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2);
2314 }
2315
2316 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2317 reduce_operation_128(T_DOUBLE, opcode, dst, src);
2318 pshufd(vtmp, src, 0xE);
2319 reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
2320 }
2321
2322 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2323 reduce2D(opcode, dst, src, vtmp2);
2324 vextractf128_high(vtmp2, src);
2325 reduce2D(opcode, dst, vtmp2, vtmp1);
2326 }
2327
2328 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2329 reduce4D(opcode, dst, src, vtmp1, vtmp2);
2330 vextracti64x4_high(vtmp1, src);
2331 reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
2332 }
2333
2334 void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) {
2335 pshufd(dst, src, 0xE);
2336 reduce_operation_128(T_DOUBLE, opcode, dst, src);
2337 }
2338
2339 void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2340 vextractf128_high(vtmp, src);
2341 unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src);
2342 unorderedReduce2D(opcode, dst, vtmp);
2343 }
2344
2345 void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2346 vextractf64x4_high(vtmp2, src);
2347 unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src);
2348 unorderedReduce4D(opcode, dst, vtmp2, vtmp1);
2349 }
2350
2351 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
2352 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2353 }
2354
2355 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
2356 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2357 }
2358
2359 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len) {
2360 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2361 }
2362
2363 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
2364 int vec_enc) {
2365 switch(elem_bt) {
2366 case T_INT:
2367 case T_FLOAT:
2368 vmaskmovps(dst, src, mask, vec_enc);
2369 break;
2370 case T_LONG:
2371 case T_DOUBLE:
2372 vmaskmovpd(dst, src, mask, vec_enc);
2373 break;
2374 default:
2375 fatal("Unsupported type %s", type2name(elem_bt));
2376 break;
2377 }
2378 }
2379
2380 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
2381 int vec_enc) {
2382 switch(elem_bt) {
2383 case T_INT:
2384 case T_FLOAT:
2385 vmaskmovps(dst, src, mask, vec_enc);
2386 break;
2387 case T_LONG:
2388 case T_DOUBLE:
2389 vmaskmovpd(dst, src, mask, vec_enc);
2390 break;
2391 default:
2392 fatal("Unsupported type %s", type2name(elem_bt));
2393 break;
2394 }
2395 }
2396
2397 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
2398 XMMRegister dst, XMMRegister src,
2399 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2400 XMMRegister xmm_0, XMMRegister xmm_1) {
2401 const int permconst[] = {1, 14};
2402 XMMRegister wsrc = src;
2403 XMMRegister wdst = xmm_0;
2404 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2405
2406 int vlen_enc = Assembler::AVX_128bit;
2407 if (vlen == 16) {
2408 vlen_enc = Assembler::AVX_256bit;
2409 }
2410
2411 for (int i = log2(vlen) - 1; i >=0; i--) {
2412 if (i == 0 && !is_dst_valid) {
2413 wdst = dst;
2414 }
2415 if (i == 3) {
2416 vextracti64x4_high(wtmp, wsrc);
2417 } else if (i == 2) {
2418 vextracti128_high(wtmp, wsrc);
2419 } else { // i = [0,1]
2420 vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
2421 }
2422
2423 if (VM_Version::supports_avx10_2()) {
2424 vminmax_fp_avx10_2(opcode, T_FLOAT, wdst, k0, wtmp, wsrc, vlen_enc);
2425 } else {
2426 vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2427 }
2428 wsrc = wdst;
2429 vlen_enc = Assembler::AVX_128bit;
2430 }
2431 if (is_dst_valid) {
2432 if (VM_Version::supports_avx10_2()) {
2433 vminmax_fp_avx10_2(opcode, T_FLOAT, dst, k0, wdst, dst, Assembler::AVX_128bit);
2434 } else {
2435 vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2436 }
2437 }
2438 }
2439
2440 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
2441 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2442 XMMRegister xmm_0, XMMRegister xmm_1) {
2443 XMMRegister wsrc = src;
2444 XMMRegister wdst = xmm_0;
2445 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2446 int vlen_enc = Assembler::AVX_128bit;
2447 if (vlen == 8) {
2448 vlen_enc = Assembler::AVX_256bit;
2449 }
2450 for (int i = log2(vlen) - 1; i >=0; i--) {
2451 if (i == 0 && !is_dst_valid) {
2452 wdst = dst;
2453 }
2454 if (i == 1) {
2455 vextracti128_high(wtmp, wsrc);
2456 } else if (i == 2) {
2457 vextracti64x4_high(wtmp, wsrc);
2458 } else {
2459 assert(i == 0, "%d", i);
2460 vpermilpd(wtmp, wsrc, 1, vlen_enc);
2461 }
2462
2463 if (VM_Version::supports_avx10_2()) {
2464 vminmax_fp_avx10_2(opcode, T_DOUBLE, wdst, k0, wtmp, wsrc, vlen_enc);
2465 } else {
2466 vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2467 }
2468
2469 wsrc = wdst;
2470 vlen_enc = Assembler::AVX_128bit;
2471 }
2472
2473 if (is_dst_valid) {
2474 if (VM_Version::supports_avx10_2()) {
2475 vminmax_fp_avx10_2(opcode, T_DOUBLE, dst, k0, wdst, dst, Assembler::AVX_128bit);
2476 } else {
2477 vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2478 }
2479 }
2480 }
2481
2482 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2483 switch (bt) {
2484 case T_BYTE: pextrb(dst, src, idx); break;
2485 case T_SHORT: pextrw(dst, src, idx); break;
2486 case T_INT: pextrd(dst, src, idx); break;
2487 case T_LONG: pextrq(dst, src, idx); break;
2488
2489 default:
2490 assert(false,"Should not reach here.");
2491 break;
2492 }
2493 }
2494
2495 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2496 int esize = type2aelembytes(typ);
2497 int elem_per_lane = 16/esize;
2498 int lane = elemindex / elem_per_lane;
2499 int eindex = elemindex % elem_per_lane;
2500
2501 if (lane >= 2) {
2502 assert(UseAVX > 2, "required");
2503 vextractf32x4(dst, src, lane & 3);
2504 return dst;
2505 } else if (lane > 0) {
2506 assert(UseAVX > 0, "required");
2507 vextractf128(dst, src, lane);
2508 return dst;
2509 } else {
2510 return src;
2511 }
2512 }
2513
2514 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) {
2515 if (typ == T_BYTE) {
2516 movsbl(dst, dst);
2517 } else if (typ == T_SHORT) {
2518 movswl(dst, dst);
2519 }
2520 }
2521
2522 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2523 int esize = type2aelembytes(typ);
2524 int elem_per_lane = 16/esize;
2525 int eindex = elemindex % elem_per_lane;
2526 assert(is_integral_type(typ),"required");
2527
2528 if (eindex == 0) {
2529 if (typ == T_LONG) {
2530 movq(dst, src);
2531 } else {
2532 movdl(dst, src);
2533 movsxl(typ, dst);
2534 }
2535 } else {
2536 extract(typ, dst, src, eindex);
2537 movsxl(typ, dst);
2538 }
2539 }
2540
2541 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
2542 int esize = type2aelembytes(typ);
2543 int elem_per_lane = 16/esize;
2544 int eindex = elemindex % elem_per_lane;
2545 assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
2546
2547 if (eindex == 0) {
2548 movq(dst, src);
2549 } else {
2550 if (typ == T_FLOAT) {
2551 if (UseAVX == 0) {
2552 movdqu(dst, src);
2553 shufps(dst, dst, eindex);
2554 } else {
2555 vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2556 }
2557 } else {
2558 if (UseAVX == 0) {
2559 movdqu(dst, src);
2560 psrldq(dst, eindex*esize);
2561 } else {
2562 vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2563 }
2564 movq(dst, dst);
2565 }
2566 }
2567 // Zero upper bits
2568 if (typ == T_FLOAT) {
2569 if (UseAVX == 0) {
2570 assert(vtmp != xnoreg, "required.");
2571 movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
2572 pand(dst, vtmp);
2573 } else {
2574 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
2575 }
2576 }
2577 }
2578
2579 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2580 switch(typ) {
2581 case T_BYTE:
2582 case T_BOOLEAN:
2583 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2584 break;
2585 case T_SHORT:
2586 case T_CHAR:
2587 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2588 break;
2589 case T_INT:
2590 case T_FLOAT:
2591 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2592 break;
2593 case T_LONG:
2594 case T_DOUBLE:
2595 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2596 break;
2597 default:
2598 assert(false,"Should not reach here.");
2599 break;
2600 }
2601 }
2602
2603 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
2604 assert(rscratch != noreg || always_reachable(src2), "missing");
2605
2606 switch(typ) {
2607 case T_BOOLEAN:
2608 case T_BYTE:
2609 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2610 break;
2611 case T_CHAR:
2612 case T_SHORT:
2613 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2614 break;
2615 case T_INT:
2616 case T_FLOAT:
2617 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2618 break;
2619 case T_LONG:
2620 case T_DOUBLE:
2621 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2622 break;
2623 default:
2624 assert(false,"Should not reach here.");
2625 break;
2626 }
2627 }
2628
2629 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2630 switch(typ) {
2631 case T_BYTE:
2632 evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2633 break;
2634 case T_SHORT:
2635 evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2636 break;
2637 case T_INT:
2638 case T_FLOAT:
2639 evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2640 break;
2641 case T_LONG:
2642 case T_DOUBLE:
2643 evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2644 break;
2645 default:
2646 assert(false,"Should not reach here.");
2647 break;
2648 }
2649 }
2650
2651 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) {
2652 assert(vlen_in_bytes <= 32, "");
2653 int esize = type2aelembytes(bt);
2654 if (vlen_in_bytes == 32) {
2655 assert(vtmp == xnoreg, "required.");
2656 if (esize >= 4) {
2657 vtestps(src1, src2, AVX_256bit);
2658 } else {
2659 vptest(src1, src2, AVX_256bit);
2660 }
2661 return;
2662 }
2663 if (vlen_in_bytes < 16) {
2664 // Duplicate the lower part to fill the whole register,
2665 // Don't need to do so for src2
2666 assert(vtmp != xnoreg, "required");
2667 int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04;
2668 pshufd(vtmp, src1, shuffle_imm);
2669 } else {
2670 assert(vtmp == xnoreg, "required");
2671 vtmp = src1;
2672 }
2673 if (esize >= 4 && VM_Version::supports_avx()) {
2674 vtestps(vtmp, src2, AVX_128bit);
2675 } else {
2676 ptest(vtmp, src2);
2677 }
2678 }
2679
2680 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
2681 #ifdef ASSERT
2682 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2683 bool is_bw_supported = VM_Version::supports_avx512bw();
2684 if (is_bw && !is_bw_supported) {
2685 assert(vlen_enc != Assembler::AVX_512bit, "required");
2686 assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
2687 "XMM register should be 0-15");
2688 }
2689 #endif // ASSERT
2690 switch (elem_bt) {
2691 case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
2692 case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
2693 case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
2694 case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
2695 case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
2696 case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
2697 default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2698 }
2699 }
2700
2701 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
2702 assert(UseAVX >= 2, "required");
2703 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2704 bool is_vl = vlen_enc != Assembler::AVX_512bit;
2705 if ((UseAVX > 2) &&
2706 (!is_bw || VM_Version::supports_avx512bw()) &&
2707 (!is_vl || VM_Version::supports_avx512vl())) {
2708 switch (elem_bt) {
2709 case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
2710 case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
2711 case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
2712 case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
2713 default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2714 }
2715 } else {
2716 assert(vlen_enc != Assembler::AVX_512bit, "required");
2717 assert((dst->encoding() < 16),"XMM register should be 0-15");
2718 switch (elem_bt) {
2719 case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
2720 case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
2721 case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
2722 case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
2723 case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
2724 case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
2725 default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2726 }
2727 }
2728 }
2729
2730 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
2731 switch (to_elem_bt) {
2732 case T_SHORT:
2733 vpmovsxbw(dst, src, vlen_enc);
2734 break;
2735 case T_INT:
2736 vpmovsxbd(dst, src, vlen_enc);
2737 break;
2738 case T_FLOAT:
2739 vpmovsxbd(dst, src, vlen_enc);
2740 vcvtdq2ps(dst, dst, vlen_enc);
2741 break;
2742 case T_LONG:
2743 vpmovsxbq(dst, src, vlen_enc);
2744 break;
2745 case T_DOUBLE: {
2746 int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
2747 vpmovsxbd(dst, src, mid_vlen_enc);
2748 vcvtdq2pd(dst, dst, vlen_enc);
2749 break;
2750 }
2751 default:
2752 fatal("Unsupported type %s", type2name(to_elem_bt));
2753 break;
2754 }
2755 }
2756
2757 //-------------------------------------------------------------------------------------------
2758
// IndexOf for constant substrings with size >= 8 chars
// which don't need to be loaded through stack.
//
// Emits a pcmpestri-based search for the substring at 'str2' inside the string at
// 'str1'.
//
//   str1, cnt1 - scanned string address and its length in elements; cnt1 must be rdx.
//   str2, cnt2 - substring address and length register; cnt2 must be rax and is
//                loaded from int_cnt2 by this code.
//   int_cnt2   - compile-time substring length; must be >= stride
//                (16 elements for LL, 8 for UU/UL), asserted.
//   result     - on exit: element index of the match, or -1 if there is none.
//   vec        - vector register that holds the current 16-byte piece of the substring.
//   tmp        - must be rcx; receives the match index produced by pcmpestri.
//   ae         - StrIntrinsicNode encoding (LL, UU or UL; LU is rejected).
//
// cnt1, cnt2, tmp, vec and result are overwritten by the emitted code.
void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
                                         Register cnt1, Register cnt2,
                                         int int_cnt2, Register result,
                                         XMMRegister vec, Register tmp,
                                         int ae) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");

  // This method uses the pcmpestri instruction with bound registers
  //   inputs:
  //     xmm - substring
  //     rax - substring length (elements count)
  //     mem - scanned string
  //     rdx - string length (elements count)
  //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
  //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
  //   outputs:
  //     rcx - matched index in string
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
  int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
  // scale1 is the element scale of the scanned string, scale2 that of the substring
  // (the substring is byte-sized for UL even though the string is not).
  Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
  Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;

  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
        RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
        MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;
  assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");

  // Load substring.
  if (ae == StrIntrinsicNode::UL) {
    // Byte substring compared against a 16-bit string: zero-extend bytes to shorts.
    pmovzxbw(vec, Address(str2, 0));
  } else {
    movdqu(vec, Address(str2, 0));
  }
  movl(cnt2, int_cnt2);
  movptr(result, str1); // string addr

  if (int_cnt2 > stride) {
    jmpb(SCAN_TO_SUBSTR);

    // Reload substr for rescan, this code
    // is executed only for large substrings (> 8 chars)
    bind(RELOAD_SUBSTR);
    if (ae == StrIntrinsicNode::UL) {
      pmovzxbw(vec, Address(str2, 0));
    } else {
      movdqu(vec, Address(str2, 0));
    }
    negptr(cnt2); // Jumped here with negative cnt2, convert to positive

    bind(RELOAD_STR);
    // We came here after the beginning of the substring was
    // matched but the rest of it was not so we need to search
    // again. Start from the next element after the previous match.

    // cnt2 is number of substring remaining elements and
    // cnt1 is number of string remaining elements when cmp failed.
    // Restored cnt1 = cnt1 - cnt2 + int_cnt2
    subl(cnt1, cnt2);
    addl(cnt1, int_cnt2);
    movl(cnt2, int_cnt2); // Now restore cnt2

    decrementl(cnt1);     // Shift to next element
    cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring

    addptr(result, (1<<scale1));

  } // (int_cnt2 > 8)

  // Scan string for start of substr in 16-byte vectors
  bind(SCAN_TO_SUBSTR);
  pcmpestri(vec, Address(result, 0), mode);
  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
  subl(cnt1, stride);
  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
  cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
  addptr(result, 16);
  jmpb(SCAN_TO_SUBSTR);

  // Found a potential substr
  bind(FOUND_CANDIDATE);
  // Matched whole vector if first element matched (tmp(rcx) == 0).
  if (int_cnt2 == stride) {
    jccb(Assembler::overflow, RET_FOUND);    // OF == 1
  } else { // int_cnt2 > 8
    jccb(Assembler::overflow, FOUND_SUBSTR);
  }
  // After pcmpestri tmp(rcx) contains matched element index
  // Compute start addr of substr
  lea(result, Address(result, tmp, scale1));

  // Make sure string is still long enough
  subl(cnt1, tmp);
  cmpl(cnt1, cnt2);
  if (int_cnt2 == stride) {
    jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
  } else { // int_cnt2 > 8
    jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
  }
  // Left less than substring.

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmp(EXIT);

  if (int_cnt2 > stride) {
    // This code is optimized for the case when whole substring
    // is matched if its head is matched.
    bind(MATCH_SUBSTR_HEAD);
    pcmpestri(vec, Address(result, 0), mode);
    // Reload only string if does not match
    jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0

    Label CONT_SCAN_SUBSTR;
    // Compare the rest of substring (> 8 chars).
    bind(FOUND_SUBSTR);
    // First 8 chars are already matched.
    // From here on cnt2 is kept negative: minus the number of substring elements
    // that still have to be compared.
    negptr(cnt2);
    addptr(cnt2, stride);

    bind(SCAN_SUBSTR);
    subl(cnt1, stride);
    cmpl(cnt2, -stride); // Do not read beyond substring
    jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
    // Back-up strings to avoid reading beyond substring:
    // cnt1 = cnt1 - cnt2 + 8
    addl(cnt1, cnt2); // cnt2 is negative
    addl(cnt1, stride);
    movl(cnt2, stride); negptr(cnt2);
    bind(CONT_SCAN_SUBSTR);
    if (int_cnt2 < (int)G) {
      // Small enough length: fold the (scaled) substring length into the
      // displacement and index with the negative cnt2.
      int tail_off1 = int_cnt2<<scale1;
      int tail_off2 = int_cnt2<<scale2;
      if (ae == StrIntrinsicNode::UL) {
        pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
      } else {
        movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
      }
      pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
    } else {
      // calculate index in register to avoid integer overflow (int_cnt2*2)
      movl(tmp, int_cnt2);
      addptr(tmp, cnt2);
      if (ae == StrIntrinsicNode::UL) {
        pmovzxbw(vec, Address(str2, tmp, scale2, 0));
      } else {
        movdqu(vec, Address(str2, tmp, scale2, 0));
      }
      pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
    }
    // Need to reload strings pointers if not matched whole vector
    jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
    addptr(cnt2, stride);
    jcc(Assembler::negative, SCAN_SUBSTR);
    // Fall through if found full substring

  } // (int_cnt2 > 8)

  bind(RET_FOUND);
  // Found result if we matched full small substring.
  // Compute substr offset
  subptr(result, str1);
  if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
    // Byte offset -> element index for 16-bit strings.
    shrl(result, 1); // index
  }
  bind(EXIT);

} // string_indexofC8
2937
// Small strings are loaded through stack if they cross page boundary.
//
// Emits a pcmpestri-based search for the substring at 'str2' inside the string at
// 'str1', for substrings that are either short compile-time constants or of
// unknown length.
//
//   str1, cnt1 - scanned string address and its length in elements; cnt1 must be rdx.
//   str2, cnt2 - substring address and length in elements; cnt2 must be rax.
//   int_cnt2   - constant substring length (0 < int_cnt2 < stride), or -1 when the
//                length is only known at run time in cnt2.
//   result     - on exit: element index of the match, or -1 if there is none.
//   vec        - vector register that holds the current 16-byte piece of the substring.
//   tmp        - must be rcx; first holds the saved stack pointer, later the match
//                index produced by pcmpestri.
//   ae         - StrIntrinsicNode encoding (LL, UU or UL; LU is rejected).
//
// The emitted code saves rsp on entry, may carve out temporary stack buffers, and
// restores rsp at CLEANUP.
void C2_MacroAssembler::string_indexof(Register str1, Register str2,
                                       Register cnt1, Register cnt2,
                                       int int_cnt2, Register result,
                                       XMMRegister vec, Register tmp,
                                       int ae) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");

  //
  // int_cnt2 is length of small (< 8 chars) constant substring
  // or (-1) for non constant substring in which case its length
  // is in cnt2 register.
  //
  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;
  //
  int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
  assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
  // This method uses the pcmpestri instruction with bound registers
  //   inputs:
  //     xmm - substring
  //     rax - substring length (elements count)
  //     mem - scanned string
  //     rdx - string length (elements count)
  //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
  //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
  //   outputs:
  //     rcx - matched index in string
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
  // scale1 is the element scale of the scanned string, scale2 that of the substring
  // (the substring is byte-sized for UL even though the string is not).
  Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
  Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;

  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
        RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
        FOUND_CANDIDATE;

  { //========================================================
    // We don't know where these strings are located
    // and we can't read beyond them. Load them through stack.
    Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;

    movptr(tmp, rsp); // save old SP

    if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
      // The (N>>scale2) comparisons below express a substring size of N bytes as an
      // element count for the substring's element size.
      if (int_cnt2 == (1>>scale2)) { // One byte
        assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
        load_unsigned_byte(result, Address(str2, 0));
        movdl(vec, result); // move 32 bits
      } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
        // Not enough header space in 32-bit VM: 12+3 = 15.
        // Load 4 bytes starting one byte before the substring and shift that byte out.
        movl(result, Address(str2, -1));
        shrl(result, 8);
        movdl(vec, result); // move 32 bits
      } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
        load_unsigned_short(result, Address(str2, 0));
        movdl(vec, result); // move 32 bits
      } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
        movdl(vec, Address(str2, 0)); // move 32 bits
      } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
        movq(vec, Address(str2, 0));  // move 64 bits
      } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7})
        // Array header size is 12 bytes in 32-bit VM
        // + 6 bytes for 3 chars == 18 bytes,
        // enough space to load vec and shift.
        assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
        // Load a full vector that ends at the end of the substring, then shift the
        // bytes that precede the substring out of the register.
        if (ae == StrIntrinsicNode::UL) {
          int tail_off = int_cnt2-8;
          pmovzxbw(vec, Address(str2, tail_off));
          psrldq(vec, -2*tail_off);
        }
        else {
          int tail_off = int_cnt2*(1<<scale2);
          movdqu(vec, Address(str2, tail_off-16));
          psrldq(vec, 16-tail_off);
        }
      }
    } else { // not constant substring
      cmpl(cnt2, stride);
      jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough

      // We can read beyond string if str+16 does not cross page boundary
      // since heaps are aligned and mapped by pages.
      assert(os::vm_page_size() < (int)G, "default page should be small");
      movl(result, str2); // We need only low 32 bits
      andl(result, ((int)os::vm_page_size()-1));
      cmpl(result, ((int)os::vm_page_size()-16));
      jccb(Assembler::belowEqual, CHECK_STR);

      // Move small strings to stack to allow load 16 bytes into vec.
      subptr(rsp, 16);
      // cnt2 counts down from the substring length to 1 in the copy loop, so the
      // offset skips the word pushed below and turns the 1-based counter into a
      // 0-based element offset.
      int stk_offset = wordSize-(1<<scale2);
      push(cnt2);

      bind(COPY_SUBSTR);
      if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
        load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
        movb(Address(rsp, cnt2, scale2, stk_offset), result);
      } else if (ae == StrIntrinsicNode::UU) {
        load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
        movw(Address(rsp, cnt2, scale2, stk_offset), result);
      }
      decrement(cnt2);
      jccb(Assembler::notZero, COPY_SUBSTR);

      pop(cnt2);
      movptr(str2, rsp);  // New substring address
    } // non constant

    bind(CHECK_STR);
    cmpl(cnt1, stride);
    jccb(Assembler::aboveEqual, BIG_STRINGS);

    // Check cross page boundary.
    movl(result, str1); // We need only low 32 bits
    andl(result, ((int)os::vm_page_size()-1));
    cmpl(result, ((int)os::vm_page_size()-16));
    jccb(Assembler::belowEqual, BIG_STRINGS);

    subptr(rsp, 16);
    // Same 1-based-counter adjustment as for the substring copy; the extra word is
    // only added when cnt2 is saved on the stack below.
    int stk_offset = -(1<<scale1);
    if (int_cnt2 < 0) { // not constant
      push(cnt2);
      stk_offset += wordSize;
    }
    // cnt2 is reused as the copy-loop counter.
    movl(cnt2, cnt1);

    bind(COPY_STR);
    if (ae == StrIntrinsicNode::LL) {
      load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
      movb(Address(rsp, cnt2, scale1, stk_offset), result);
    } else {
      load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
      movw(Address(rsp, cnt2, scale1, stk_offset), result);
    }
    decrement(cnt2);
    jccb(Assembler::notZero, COPY_STR);

    if (int_cnt2 < 0) { // not constant
      pop(cnt2);
    }
    movptr(str1, rsp);  // New string address

    bind(BIG_STRINGS);
    // Load substring.
    if (int_cnt2 < 0) { // -1
      if (ae == StrIntrinsicNode::UL) {
        pmovzxbw(vec, Address(str2, 0));
      } else {
        movdqu(vec, Address(str2, 0));
      }
      push(cnt2);       // substr count
      push(str2);       // substr addr
      push(str1);       // string addr
    } else {
      // Small (< 8 chars) constant substrings are loaded already.
      movl(cnt2, int_cnt2);
    }
    push(tmp);  // original SP

  } // Finished loading

  //========================================================
  // Start search
  //

  movptr(result, str1); // string addr

  if (int_cnt2  < 0) {  // Only for non constant substring
    jmpb(SCAN_TO_SUBSTR);

    // SP saved at sp+0
    // String saved at sp+1*wordSize
    // Substr saved at sp+2*wordSize
    // Substr count saved at sp+3*wordSize

    // Reload substr for rescan, this code
    // is executed only for large substrings (> 8 chars)
    bind(RELOAD_SUBSTR);
    movptr(str2, Address(rsp, 2*wordSize));
    movl(cnt2, Address(rsp, 3*wordSize));
    if (ae == StrIntrinsicNode::UL) {
      pmovzxbw(vec, Address(str2, 0));
    } else {
      movdqu(vec, Address(str2, 0));
    }
    // We came here after the beginning of the substring was
    // matched but the rest of it was not so we need to search
    // again. Start from the next element after the previous match.
    // str1 was advanced from 'result' while comparing the rest of the substring;
    // the distance (converted to elements) is added back to cnt1.
    subptr(str1, result); // Restore counter
    if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
      shrl(str1, 1);
    }
    addl(cnt1, str1);
    decrementl(cnt1);   // Shift to next element
    cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring

    addptr(result, (1<<scale1));
  } // non constant

  // Scan string for start of substr in 16-byte vectors
  bind(SCAN_TO_SUBSTR);
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  pcmpestri(vec, Address(result, 0), mode);
  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
  subl(cnt1, stride);
  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
  cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
  addptr(result, 16);

  bind(ADJUST_STR);
  cmpl(cnt1, stride); // Do not read beyond string
  jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
  // Back-up string to avoid reading beyond string.
  lea(result, Address(result, cnt1, scale1, -16));
  movl(cnt1, stride);
  jmpb(SCAN_TO_SUBSTR);

  // Found a potential substr
  bind(FOUND_CANDIDATE);
  // After pcmpestri tmp(rcx) contains matched element index

  // Make sure string is still long enough
  subl(cnt1, tmp);
  cmpl(cnt1, cnt2);
  jccb(Assembler::greaterEqual, FOUND_SUBSTR);
  // Left less than substring.

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmp(CLEANUP);

  bind(FOUND_SUBSTR);
  // Compute start addr of substr
  lea(result, Address(result, tmp, scale1));
  if (int_cnt2 > 0) { // Constant substring
    // Repeat search for small substring (< 8 chars)
    // from new point without reloading substring.
    // Have to check that we don't read beyond string.
    cmpl(tmp, stride-int_cnt2);
    jccb(Assembler::greater, ADJUST_STR);
    // Fall through if matched whole substring.
  } else { // non constant
    assert(int_cnt2 == -1, "should be != 0");

    addl(tmp, cnt2);
    // Found result if we matched whole substring.
    cmpl(tmp, stride);
    jcc(Assembler::lessEqual, RET_FOUND);

    // Repeat search for small substring (<= 8 chars)
    // from new point 'str1' without reloading substring.
    cmpl(cnt2, stride);
    // Have to check that we don't read beyond string.
    jccb(Assembler::lessEqual, ADJUST_STR);

    Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
    // Compare the rest of substring (> 8 chars).
    // str1 becomes the moving cursor in the string; 'result' keeps the candidate start.
    movptr(str1, result);

    cmpl(tmp, cnt2);
    // First 8 chars are already matched.
    jccb(Assembler::equal, CHECK_NEXT);

    bind(SCAN_SUBSTR);
    pcmpestri(vec, Address(str1, 0), mode);
    // Need to reload strings pointers if not matched whole vector
    jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0

    bind(CHECK_NEXT);
    subl(cnt2, stride);
    jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
    addptr(str1, 16);
    if (ae == StrIntrinsicNode::UL) {
      // A vector of 8 substring elements is only 8 bytes in memory for UL.
      addptr(str2, 8);
    } else {
      addptr(str2, 16);
    }
    subl(cnt1, stride);
    cmpl(cnt2, stride); // Do not read beyond substring
    jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
    // Back-up strings to avoid reading beyond substring.

    if (ae == StrIntrinsicNode::UL) {
      lea(str2, Address(str2, cnt2, scale2, -8));
      lea(str1, Address(str1, cnt2, scale1, -16));
    } else {
      lea(str2, Address(str2, cnt2, scale2, -16));
      lea(str1, Address(str1, cnt2, scale1, -16));
    }
    subl(cnt1, cnt2);
    movl(cnt2, stride);
    addl(cnt1, stride);
    bind(CONT_SCAN_SUBSTR);
    if (ae == StrIntrinsicNode::UL) {
      pmovzxbw(vec, Address(str2, 0));
    } else {
      movdqu(vec, Address(str2, 0));
    }
    jmp(SCAN_SUBSTR);

    bind(RET_FOUND_LONG);
    // Reload the string address saved on the stack (sp+1*wordSize) so the offset
    // computation below is relative to the string start.
    movptr(str1, Address(rsp, wordSize));
  } // non constant

  bind(RET_FOUND);
  // Compute substr offset
  subptr(result, str1);
  if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
    // Byte offset -> element index for 16-bit strings.
    shrl(result, 1); // index
  }
  bind(CLEANUP);
  pop(rsp); // restore SP

} // string_indexof
3258
3259 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3260 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3261 ShortBranchVerifier sbv(this);
3262 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3263
3264 int stride = 8;
3265
3266 Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
3267 SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
3268 RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
3269 FOUND_SEQ_CHAR, DONE_LABEL;
3270
3271 movptr(result, str1);
3272 if (UseAVX >= 2) {
3273 cmpl(cnt1, stride);
3274 jcc(Assembler::less, SCAN_TO_CHAR);
3275 cmpl(cnt1, 2*stride);
3276 jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
3277 movdl(vec1, ch);
3278 vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
3279 vpxor(vec2, vec2);
3280 movl(tmp, cnt1);
3281 andl(tmp, 0xFFFFFFF0); //vector count (in chars)
3282 andl(cnt1,0x0000000F); //tail count (in chars)
3283
3284 bind(SCAN_TO_16_CHAR_LOOP);
3285 vmovdqu(vec3, Address(result, 0));
3286 vpcmpeqw(vec3, vec3, vec1, 1);
3287 vptest(vec2, vec3);
3288 jcc(Assembler::carryClear, FOUND_CHAR);
3289 addptr(result, 32);
3290 subl(tmp, 2*stride);
3291 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
3292 jmp(SCAN_TO_8_CHAR);
3293 bind(SCAN_TO_8_CHAR_INIT);
3294 movdl(vec1, ch);
3295 pshuflw(vec1, vec1, 0x00);
3296 pshufd(vec1, vec1, 0);
3297 pxor(vec2, vec2);
3298 }
3299 bind(SCAN_TO_8_CHAR);
3300 cmpl(cnt1, stride);
3301 jcc(Assembler::less, SCAN_TO_CHAR);
3302 if (UseAVX < 2) {
3303 movdl(vec1, ch);
3304 pshuflw(vec1, vec1, 0x00);
3305 pshufd(vec1, vec1, 0);
3306 pxor(vec2, vec2);
3307 }
3308 movl(tmp, cnt1);
3309 andl(tmp, 0xFFFFFFF8); //vector count (in chars)
3310 andl(cnt1,0x00000007); //tail count (in chars)
3311
3312 bind(SCAN_TO_8_CHAR_LOOP);
3313 movdqu(vec3, Address(result, 0));
3314 pcmpeqw(vec3, vec1);
3315 ptest(vec2, vec3);
3316 jcc(Assembler::carryClear, FOUND_CHAR);
3317 addptr(result, 16);
3318 subl(tmp, stride);
3319 jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
3320 bind(SCAN_TO_CHAR);
3321 testl(cnt1, cnt1);
3322 jcc(Assembler::zero, RET_NOT_FOUND);
3323 bind(SCAN_TO_CHAR_LOOP);
3324 load_unsigned_short(tmp, Address(result, 0));
3325 cmpl(ch, tmp);
3326 jccb(Assembler::equal, FOUND_SEQ_CHAR);
3327 addptr(result, 2);
3328 subl(cnt1, 1);
3329 jccb(Assembler::zero, RET_NOT_FOUND);
3330 jmp(SCAN_TO_CHAR_LOOP);
3331
3332 bind(RET_NOT_FOUND);
3333 movl(result, -1);
3334 jmpb(DONE_LABEL);
3335
3336 bind(FOUND_CHAR);
3337 if (UseAVX >= 2) {
3338 vpmovmskb(tmp, vec3);
3339 } else {
3340 pmovmskb(tmp, vec3);
3341 }
3342 bsfl(ch, tmp);
3343 addptr(result, ch);
3344
3345 bind(FOUND_SEQ_CHAR);
3346 subptr(result, str1);
3347 shrl(result, 1);
3348
3349 bind(DONE_LABEL);
3350 } // string_indexof_char
3351
// Emits code that searches a string of 8-bit (Latin-1) elements for the value in 'ch'.
//
//   str1   - address of the first byte of the string.
//   cnt1   - number of elements in the string; overwritten by the emitted code.
//   ch     - value to look for; overwritten when a vector match is found
//            (it is reused to hold the byte offset of the match).
//   result - on exit: index of the first match, or -1 if there is none.
//   vec1   - scratch vector: 'ch' broadcast to every byte.
//   vec2   - scratch vector: all zeros, used as the ptest/vptest operand.
//   vec3   - scratch vector: current piece of the string / compare result.
//   tmp    - scratch general-purpose register.
//
// The search proceeds in up to three phases: 32-byte vectors (only when
// UseAVX >= 2 and at least 32 elements are present), 16-byte vectors, and finally
// a byte-by-byte loop for the remaining tail.
void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");

  // Number of byte elements in one 16-byte vector.
  int stride = 16;

  Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
        SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
        RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
        FOUND_SEQ_CHAR, DONE_LABEL;

  // 'result' is the moving cursor; it is turned into an index at the end.
  movptr(result, str1);
  if (UseAVX >= 2) {
    cmpl(cnt1, stride);
    jcc(Assembler::less, SCAN_TO_CHAR_INIT);
    cmpl(cnt1, stride*2);
    jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
    movdl(vec1, ch);
    vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
    vpxor(vec2, vec2);
    movl(tmp, cnt1);
    andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
    andl(cnt1,0x0000001F);  //tail count (in chars)

    bind(SCAN_TO_32_CHAR_LOOP);
    vmovdqu(vec3, Address(result, 0));
    vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
    // vec2 is all zeros, so the carry flag is set only when vec3 is all zeros;
    // carry clear therefore means at least one element matched.
    vptest(vec2, vec3);
    jcc(Assembler::carryClear, FOUND_CHAR);
    addptr(result, 32);
    subl(tmp, stride*2);
    jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
    jmp(SCAN_TO_16_CHAR);

    bind(SCAN_TO_16_CHAR_INIT);
    // Fewer than 32 elements: set up the 128-bit broadcast here, since the 256-bit
    // setup above was skipped. pshufb with an all-zero control replicates byte 0.
    movdl(vec1, ch);
    pxor(vec2, vec2);
    pshufb(vec1, vec2);
  }

  bind(SCAN_TO_16_CHAR);
  cmpl(cnt1, stride);
  jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left
  if (UseAVX < 2) {
    // Without AVX2 nothing has initialized vec1/vec2 yet.
    movdl(vec1, ch);
    pxor(vec2, vec2);
    pshufb(vec1, vec2);
  }
  movl(tmp, cnt1);
  andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
  andl(cnt1,0x0000000F);  //tail count (in bytes)

  bind(SCAN_TO_16_CHAR_LOOP);
  movdqu(vec3, Address(result, 0));
  pcmpeqb(vec3, vec1);
  ptest(vec2, vec3);
  jcc(Assembler::carryClear, FOUND_CHAR);
  addptr(result, 16);
  subl(tmp, stride);
  jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items...

  bind(SCAN_TO_CHAR_INIT);
  // Scalar tail: compare the remaining bytes one at a time.
  testl(cnt1, cnt1);
  jcc(Assembler::zero, RET_NOT_FOUND);
  bind(SCAN_TO_CHAR_LOOP);
  load_unsigned_byte(tmp, Address(result, 0));
  cmpl(ch, tmp);
  jccb(Assembler::equal, FOUND_SEQ_CHAR);
  addptr(result, 1);
  subl(cnt1, 1);
  jccb(Assembler::zero, RET_NOT_FOUND);
  jmp(SCAN_TO_CHAR_LOOP);

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmpb(DONE_LABEL);

  bind(FOUND_CHAR);
  // Turn the compare result into a byte mask; the lowest set bit is the byte
  // offset of the first match within the vector.
  if (UseAVX >= 2) {
    vpmovmskb(tmp, vec3);
  } else {
    pmovmskb(tmp, vec3);
  }
  bsfl(ch, tmp);
  addptr(result, ch);

  bind(FOUND_SEQ_CHAR);
  // Elements are one byte each, so the byte distance is already the index.
  subptr(result, str1);

  bind(DONE_LABEL);
} // stringL_indexof_char
3444
3445 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
3446 switch (eltype) {
3447 case T_BOOLEAN: return sizeof(jboolean);
3448 case T_BYTE: return sizeof(jbyte);
3449 case T_SHORT: return sizeof(jshort);
3450 case T_CHAR: return sizeof(jchar);
3451 case T_INT: return sizeof(jint);
3452 default:
3453 ShouldNotReachHere();
3454 return -1;
3455 }
3456 }
3457
3458 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
3459 switch (eltype) {
3460 // T_BOOLEAN used as surrogate for unsigned byte
3461 case T_BOOLEAN: movzbl(dst, src); break;
3462 case T_BYTE: movsbl(dst, src); break;
3463 case T_SHORT: movswl(dst, src); break;
3464 case T_CHAR: movzwl(dst, src); break;
3465 case T_INT: movl(dst, src); break;
3466 default:
3467 ShouldNotReachHere();
3468 }
3469 }
3470
3471 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) {
3472 load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3473 }
3474
3475 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) {
3476 load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3477 }
3478
3479 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) {
3480 const int vlen = Assembler::AVX_256bit;
3481 switch (eltype) {
3482 case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT); break;
3483 case T_BYTE: vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT); break;
3484 case T_SHORT: vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3485 case T_CHAR: vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3486 case T_INT:
3487 // do nothing
3488 break;
3489 default:
3490 ShouldNotReachHere();
3491 }
3492 }
3493
3494 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result,
3495 Register index, Register tmp2, Register tmp3, XMMRegister vnext,
3496 XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
3497 XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
3498 XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
3499 BasicType eltype) {
3500 ShortBranchVerifier sbv(this);
3501 assert(UseAVX >= 2, "AVX2 intrinsics are required");
3502 assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3);
3503 assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3);
3504
3505 Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN,
3506 SHORT_UNROLLED_LOOP_EXIT,
3507 UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME,
3508 UNROLLED_VECTOR_LOOP_BEGIN,
3509 END;
3510 switch (eltype) {
3511 case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
3512 case T_CHAR: BLOCK_COMMENT("arrays_hashcode(char) {"); break;
3513 case T_BYTE: BLOCK_COMMENT("arrays_hashcode(byte) {"); break;
3514 case T_SHORT: BLOCK_COMMENT("arrays_hashcode(short) {"); break;
3515 case T_INT: BLOCK_COMMENT("arrays_hashcode(int) {"); break;
3516 default: BLOCK_COMMENT("arrays_hashcode {"); break;
3517 }
3518
3519 // For "renaming" for readibility of the code
3520 const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
3521 vresult[] = { vresult0, vresult1, vresult2, vresult3 },
3522 vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };
3523
3524 const int elsize = arrays_hashcode_elsize(eltype);
3525
3526 /*
3527 if (cnt1 >= 2) {
3528 if (cnt1 >= 32) {
3529 UNROLLED VECTOR LOOP
3530 }
3531 UNROLLED SCALAR LOOP
3532 }
3533 SINGLE SCALAR
3534 */
3535
3536 cmpl(cnt1, 32);
3537 jcc(Assembler::less, SHORT_UNROLLED_BEGIN);
3538
3539 // cnt1 >= 32 && generate_vectorized_loop
3540 xorl(index, index);
3541
3542 // vresult = IntVector.zero(I256);
3543 for (int idx = 0; idx < 4; idx++) {
3544 vpxor(vresult[idx], vresult[idx]);
3545 }
3546 // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
3547 Register bound = tmp2;
3548 Register next = tmp3;
3549 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
3550 movl(next, Address(tmp2, 0));
3551 movdl(vnext, next);
3552 vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);
3553
3554 // index = 0;
3555 // bound = cnt1 & ~(32 - 1);
3556 movl(bound, cnt1);
3557 andl(bound, ~(32 - 1));
3558 // for (; index < bound; index += 32) {
3559 bind(UNROLLED_VECTOR_LOOP_BEGIN);
3560 // result *= next;
3561 imull(result, next);
3562 // loop fission to upfront the cost of fetching from memory, OOO execution
3563 // can then hopefully do a better job of prefetching
3564 for (int idx = 0; idx < 4; idx++) {
3565 arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
3566 }
3567 // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
3568 for (int idx = 0; idx < 4; idx++) {
3569 vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
3570 arrays_hashcode_elvcast(vtmp[idx], eltype);
3571 vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
3572 }
3573 // index += 32;
3574 addl(index, 32);
3575 // index < bound;
3576 cmpl(index, bound);
3577 jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
3578 // }
3579
3580 lea(ary1, Address(ary1, bound, Address::times(elsize)));
3581 subl(cnt1, bound);
3582 // release bound
3583
3584 // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
3585 for (int idx = 0; idx < 4; idx++) {
3586 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint))));
3587 arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT);
3588 vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
3589 }
3590 // result += vresult.reduceLanes(ADD);
3591 for (int idx = 0; idx < 4; idx++) {
3592 reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
3593 }
3594
3595 // } else if (cnt1 < 32) {
3596
3597 bind(SHORT_UNROLLED_BEGIN);
3598 // int i = 1;
3599 movl(index, 1);
3600 cmpl(index, cnt1);
3601 jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);
3602
3603 // for (; i < cnt1 ; i += 2) {
3604 bind(SHORT_UNROLLED_LOOP_BEGIN);
3605 movl(tmp3, 961);
3606 imull(result, tmp3);
3607 arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3608 movl(tmp3, tmp2);
3609 shll(tmp3, 5);
3610 subl(tmp3, tmp2);
3611 addl(result, tmp3);
3612 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype);
3613 addl(result, tmp3);
3614 addl(index, 2);
3615 cmpl(index, cnt1);
3616 jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN);
3617
3618 // }
3619 // if (i >= cnt1) {
3620 bind(SHORT_UNROLLED_LOOP_EXIT);
3621 jccb(Assembler::greater, END);
3622 movl(tmp2, result);
3623 shll(result, 5);
3624 subl(result, tmp2);
3625 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3626 addl(result, tmp3);
3627 // }
3628 bind(END);
3629
3630 BLOCK_COMMENT("} // arrays_hashcode");
3631
3632 } // arrays_hashcode
3633
3634 // helper function for string_compare
3635 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3636 Address::ScaleFactor scale, Address::ScaleFactor scale1,
3637 Address::ScaleFactor scale2, Register index, int ae) {
3638 if (ae == StrIntrinsicNode::LL) {
3639 load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3640 load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3641 } else if (ae == StrIntrinsicNode::UU) {
3642 load_unsigned_short(elem1, Address(str1, index, scale, 0));
3643 load_unsigned_short(elem2, Address(str2, index, scale, 0));
3644 } else {
3645 load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3646 load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3647 }
3648 }
3649
3650 // Compare strings, used for char[] and byte[].
3651 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3652 Register cnt1, Register cnt2, Register result,
3653 XMMRegister vec1, int ae, KRegister mask) {
3654 ShortBranchVerifier sbv(this);
3655 Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
3656 Label COMPARE_WIDE_VECTORS_LOOP_FAILED; // used only AVX3
3657 int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3658 int stride2x2 = 0x40;
3659 Address::ScaleFactor scale = Address::no_scale;
3660 Address::ScaleFactor scale1 = Address::no_scale;
3661 Address::ScaleFactor scale2 = Address::no_scale;
3662
3663 if (ae != StrIntrinsicNode::LL) {
3664 stride2x2 = 0x20;
3665 }
3666
3667 if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3668 shrl(cnt2, 1);
3669 }
3670 // Compute the minimum of the string lengths and the
3671 // difference of the string lengths (stack).
3672 // Do the conditional move stuff
3673 movl(result, cnt1);
3674 subl(cnt1, cnt2);
3675 push(cnt1);
3676 cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2)
3677
3678 // Is the minimum length zero?
3679 testl(cnt2, cnt2);
3680 jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3681 if (ae == StrIntrinsicNode::LL) {
3682 // Load first bytes
3683 load_unsigned_byte(result, Address(str1, 0)); // result = str1[0]
3684 load_unsigned_byte(cnt1, Address(str2, 0)); // cnt1 = str2[0]
3685 } else if (ae == StrIntrinsicNode::UU) {
3686 // Load first characters
3687 load_unsigned_short(result, Address(str1, 0));
3688 load_unsigned_short(cnt1, Address(str2, 0));
3689 } else {
3690 load_unsigned_byte(result, Address(str1, 0));
3691 load_unsigned_short(cnt1, Address(str2, 0));
3692 }
3693 subl(result, cnt1);
3694 jcc(Assembler::notZero, POP_LABEL);
3695
3696 if (ae == StrIntrinsicNode::UU) {
3697 // Divide length by 2 to get number of chars
3698 shrl(cnt2, 1);
3699 }
3700 cmpl(cnt2, 1);
3701 jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3702
3703 // Check if the strings start at the same location and setup scale and stride
3704 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3705 cmpptr(str1, str2);
3706 jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3707 if (ae == StrIntrinsicNode::LL) {
3708 scale = Address::times_1;
3709 stride = 16;
3710 } else {
3711 scale = Address::times_2;
3712 stride = 8;
3713 }
3714 } else {
3715 scale1 = Address::times_1;
3716 scale2 = Address::times_2;
3717 // scale not used
3718 stride = 8;
3719 }
3720
3721 if (UseAVX >= 2 && UseSSE42Intrinsics) {
3722 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3723 Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3724 Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3725 Label COMPARE_TAIL_LONG;
3726 Label COMPARE_WIDE_VECTORS_LOOP_AVX3; // used only AVX3
3727
3728 int pcmpmask = 0x19;
3729 if (ae == StrIntrinsicNode::LL) {
3730 pcmpmask &= ~0x01;
3731 }
3732
3733 // Setup to compare 16-chars (32-bytes) vectors,
3734 // start from first character again because it has aligned address.
3735 if (ae == StrIntrinsicNode::LL) {
3736 stride2 = 32;
3737 } else {
3738 stride2 = 16;
3739 }
3740 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3741 adr_stride = stride << scale;
3742 } else {
3743 adr_stride1 = 8; //stride << scale1;
3744 adr_stride2 = 16; //stride << scale2;
3745 }
3746
3747 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3748 // rax and rdx are used by pcmpestri as elements counters
3749 movl(result, cnt2);
3750 andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count
3751 jcc(Assembler::zero, COMPARE_TAIL_LONG);
3752
3753 // fast path : compare first 2 8-char vectors.
3754 bind(COMPARE_16_CHARS);
3755 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3756 movdqu(vec1, Address(str1, 0));
3757 } else {
3758 pmovzxbw(vec1, Address(str1, 0));
3759 }
3760 pcmpestri(vec1, Address(str2, 0), pcmpmask);
3761 jccb(Assembler::below, COMPARE_INDEX_CHAR);
3762
3763 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3764 movdqu(vec1, Address(str1, adr_stride));
3765 pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3766 } else {
3767 pmovzxbw(vec1, Address(str1, adr_stride1));
3768 pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3769 }
3770 jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3771 addl(cnt1, stride);
3772
3773 // Compare the characters at index in cnt1
3774 bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3775 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3776 subl(result, cnt2);
3777 jmp(POP_LABEL);
3778
3779 // Setup the registers to start vector comparison loop
3780 bind(COMPARE_WIDE_VECTORS);
3781 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3782 lea(str1, Address(str1, result, scale));
3783 lea(str2, Address(str2, result, scale));
3784 } else {
3785 lea(str1, Address(str1, result, scale1));
3786 lea(str2, Address(str2, result, scale2));
3787 }
3788 subl(result, stride2);
3789 subl(cnt2, stride2);
3790 jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3791 negptr(result);
3792
3793 // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
3794 bind(COMPARE_WIDE_VECTORS_LOOP);
3795
3796 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3797 cmpl(cnt2, stride2x2);
3798 jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3799 testl(cnt2, stride2x2-1); // cnt2 holds the vector count
3800 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // means we cannot subtract by 0x40
3801
3802 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3803 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3804 evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
3805 evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3806 } else {
3807 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
3808 evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3809 }
3810 kortestql(mask, mask);
3811 jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare
3812 addptr(result, stride2x2); // update since we already compared at this addr
3813 subl(cnt2, stride2x2); // and sub the size too
3814 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3815
3816 vpxor(vec1, vec1);
3817 jmpb(COMPARE_WIDE_TAIL);
3818 }//if (VM_Version::supports_avx512vlbw())
3819
3820 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3821 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3822 vmovdqu(vec1, Address(str1, result, scale));
3823 vpxor(vec1, Address(str2, result, scale));
3824 } else {
3825 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3826 vpxor(vec1, Address(str2, result, scale2));
3827 }
3828 vptest(vec1, vec1);
3829 jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3830 addptr(result, stride2);
3831 subl(cnt2, stride2);
3832 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3833 // clean upper bits of YMM registers
3834 vpxor(vec1, vec1);
3835
3836 // compare wide vectors tail
3837 bind(COMPARE_WIDE_TAIL);
3838 testptr(result, result);
3839 jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3840
3841 movl(result, stride2);
3842 movl(cnt2, result);
3843 negptr(result);
3844 jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3845
3846 // Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors.
3847 bind(VECTOR_NOT_EQUAL);
3848 // clean upper bits of YMM registers
3849 vpxor(vec1, vec1);
3850 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3851 lea(str1, Address(str1, result, scale));
3852 lea(str2, Address(str2, result, scale));
3853 } else {
3854 lea(str1, Address(str1, result, scale1));
3855 lea(str2, Address(str2, result, scale2));
3856 }
3857 jmp(COMPARE_16_CHARS);
3858
3859 // Compare tail chars, length between 1 to 15 chars
3860 bind(COMPARE_TAIL_LONG);
3861 movl(cnt2, result);
3862 cmpl(cnt2, stride);
3863 jcc(Assembler::less, COMPARE_SMALL_STR);
3864
3865 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3866 movdqu(vec1, Address(str1, 0));
3867 } else {
3868 pmovzxbw(vec1, Address(str1, 0));
3869 }
3870 pcmpestri(vec1, Address(str2, 0), pcmpmask);
3871 jcc(Assembler::below, COMPARE_INDEX_CHAR);
3872 subptr(cnt2, stride);
3873 jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3874 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3875 lea(str1, Address(str1, result, scale));
3876 lea(str2, Address(str2, result, scale));
3877 } else {
3878 lea(str1, Address(str1, result, scale1));
3879 lea(str2, Address(str2, result, scale2));
3880 }
3881 negptr(cnt2);
3882 jmpb(WHILE_HEAD_LABEL);
3883
3884 bind(COMPARE_SMALL_STR);
3885 } else if (UseSSE42Intrinsics) {
3886 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3887 int pcmpmask = 0x19;
3888 // Setup to compare 8-char (16-byte) vectors,
3889 // start from first character again because it has aligned address.
3890 movl(result, cnt2);
3891 andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count
3892 if (ae == StrIntrinsicNode::LL) {
3893 pcmpmask &= ~0x01;
3894 }
3895 jcc(Assembler::zero, COMPARE_TAIL);
3896 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3897 lea(str1, Address(str1, result, scale));
3898 lea(str2, Address(str2, result, scale));
3899 } else {
3900 lea(str1, Address(str1, result, scale1));
3901 lea(str2, Address(str2, result, scale2));
3902 }
3903 negptr(result);
3904
3905 // pcmpestri
3906 // inputs:
3907 // vec1- substring
3908 // rax - negative string length (elements count)
3909 // mem - scanned string
3910 // rdx - string length (elements count)
3911 // pcmpmask - cmp mode: 11000 (string compare with negated result)
3912 // + 00 (unsigned bytes) or + 01 (unsigned shorts)
3913 // outputs:
3914 // rcx - first mismatched element index
3915 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3916
3917 bind(COMPARE_WIDE_VECTORS);
3918 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3919 movdqu(vec1, Address(str1, result, scale));
3920 pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3921 } else {
3922 pmovzxbw(vec1, Address(str1, result, scale1));
3923 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3924 }
3925 // After pcmpestri cnt1(rcx) contains mismatched element index
3926
3927 jccb(Assembler::below, VECTOR_NOT_EQUAL); // CF==1
3928 addptr(result, stride);
3929 subptr(cnt2, stride);
3930 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3931
3932 // compare wide vectors tail
3933 testptr(result, result);
3934 jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3935
3936 movl(cnt2, stride);
3937 movl(result, stride);
3938 negptr(result);
3939 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3940 movdqu(vec1, Address(str1, result, scale));
3941 pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3942 } else {
3943 pmovzxbw(vec1, Address(str1, result, scale1));
3944 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3945 }
3946 jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
3947
3948 // Mismatched characters in the vectors
3949 bind(VECTOR_NOT_EQUAL);
3950 addptr(cnt1, result);
3951 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3952 subl(result, cnt2);
3953 jmpb(POP_LABEL);
3954
3955 bind(COMPARE_TAIL); // limit is zero
3956 movl(cnt2, result);
3957 // Fallthru to tail compare
3958 }
3959 // Shift str2 and str1 to the end of the arrays, negate min
3960 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3961 lea(str1, Address(str1, cnt2, scale));
3962 lea(str2, Address(str2, cnt2, scale));
3963 } else {
3964 lea(str1, Address(str1, cnt2, scale1));
3965 lea(str2, Address(str2, cnt2, scale2));
3966 }
3967 decrementl(cnt2); // first character was compared already
3968 negptr(cnt2);
3969
3970 // Compare the rest of the elements
3971 bind(WHILE_HEAD_LABEL);
3972 load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
3973 subl(result, cnt1);
3974 jccb(Assembler::notZero, POP_LABEL);
3975 increment(cnt2);
3976 jccb(Assembler::notZero, WHILE_HEAD_LABEL);
3977
3978 // Strings are equal up to min length. Return the length difference.
3979 bind(LENGTH_DIFF_LABEL);
3980 pop(result);
3981 if (ae == StrIntrinsicNode::UU) {
3982 // Divide diff by 2 to get number of chars
3983 sarl(result, 1);
3984 }
3985 jmpb(DONE_LABEL);
3986
3987 if (VM_Version::supports_avx512vlbw()) {
3988
3989 bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
3990
3991 kmovql(cnt1, mask);
3992 notq(cnt1);
3993 bsfq(cnt2, cnt1);
3994 if (ae != StrIntrinsicNode::LL) {
3995 // Divide diff by 2 to get number of chars
3996 sarl(cnt2, 1);
3997 }
3998 addq(result, cnt2);
3999 if (ae == StrIntrinsicNode::LL) {
4000 load_unsigned_byte(cnt1, Address(str2, result));
4001 load_unsigned_byte(result, Address(str1, result));
4002 } else if (ae == StrIntrinsicNode::UU) {
4003 load_unsigned_short(cnt1, Address(str2, result, scale));
4004 load_unsigned_short(result, Address(str1, result, scale));
4005 } else {
4006 load_unsigned_short(cnt1, Address(str2, result, scale2));
4007 load_unsigned_byte(result, Address(str1, result, scale1));
4008 }
4009 subl(result, cnt1);
4010 jmpb(POP_LABEL);
4011 }//if (VM_Version::supports_avx512vlbw())
4012
4013 // Discard the stored length difference
4014 bind(POP_LABEL);
4015 pop(cnt1);
4016
4017 // That's it
4018 bind(DONE_LABEL);
4019 if(ae == StrIntrinsicNode::UL) {
4020 negl(result);
4021 }
4022
4023 }
4024
4025 // Search for Non-ASCII character (Negative byte value) in a byte array,
4026 // return the index of the first such character, otherwise the length
4027 // of the array segment searched.
4028 // ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
4029 // @IntrinsicCandidate
4030 // public static int countPositives(byte[] ba, int off, int len) {
4031 // for (int i = off; i < off + len; i++) {
4032 // if (ba[i] < 0) {
4033 // return i - off;
4034 // }
4035 // }
4036 // return len;
4037 // }
4038 void C2_MacroAssembler::count_positives(Register ary1, Register len,
4039 Register result, Register tmp1,
4040 XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
4041 // rsi: byte array
4042 // rcx: len
4043 // rax: result
4044 ShortBranchVerifier sbv(this);
4045 assert_different_registers(ary1, len, result, tmp1);
4046 assert_different_registers(vec1, vec2);
4047 Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
4048
4049 movl(result, len); // copy
4050 // len == 0
4051 testl(len, len);
4052 jcc(Assembler::zero, DONE);
4053
4054 if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
4055 VM_Version::supports_avx512vlbw() &&
4056 VM_Version::supports_bmi2()) {
4057
4058 Label test_64_loop, test_tail, BREAK_LOOP;
4059 movl(tmp1, len);
4060 vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
4061
4062 andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F
4063 andl(len, 0xffffffc0); // vector count (in chars)
4064 jccb(Assembler::zero, test_tail);
4065
4066 lea(ary1, Address(ary1, len, Address::times_1));
4067 negptr(len);
4068
4069 bind(test_64_loop);
4070 // Check whether our 64 elements of size byte contain negatives
4071 evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
4072 kortestql(mask1, mask1);
4073 jcc(Assembler::notZero, BREAK_LOOP);
4074
4075 addptr(len, 64);
4076 jccb(Assembler::notZero, test_64_loop);
4077
4078 bind(test_tail);
4079 // bail out when there is nothing to be done
4080 testl(tmp1, -1);
4081 jcc(Assembler::zero, DONE);
4082
4083
4084 // check the tail for absense of negatives
4085 // ~(~0 << len) applied up to two times (for 32-bit scenario)
4086 {
4087 Register tmp3_aliased = len;
4088 mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
4089 shlxq(tmp3_aliased, tmp3_aliased, tmp1);
4090 notq(tmp3_aliased);
4091 kmovql(mask2, tmp3_aliased);
4092 }
4093
4094 evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
4095 ktestq(mask1, mask2);
4096 jcc(Assembler::zero, DONE);
4097
4098 // do a full check for negative registers in the tail
4099 movl(len, tmp1); // tmp1 holds low 6-bit from original len;
4100 // ary1 already pointing to the right place
4101 jmpb(TAIL_START);
4102
4103 bind(BREAK_LOOP);
4104 // At least one byte in the last 64 byte block was negative.
4105 // Set up to look at the last 64 bytes as if they were a tail
4106 lea(ary1, Address(ary1, len, Address::times_1));
4107 addptr(result, len);
4108 // Ignore the very last byte: if all others are positive,
4109 // it must be negative, so we can skip right to the 2+1 byte
4110 // end comparison at this point
4111 orl(result, 63);
4112 movl(len, 63);
4113 // Fallthru to tail compare
4114 } else {
4115
4116 if (UseAVX >= 2) {
4117 // With AVX2, use 32-byte vector compare
4118 Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4119
4120 // Compare 32-byte vectors
4121 testl(len, 0xffffffe0); // vector count (in bytes)
4122 jccb(Assembler::zero, TAIL_START);
4123
4124 andl(len, 0xffffffe0);
4125 lea(ary1, Address(ary1, len, Address::times_1));
4126 negptr(len);
4127
4128 movl(tmp1, 0x80808080); // create mask to test for Unicode chars in vector
4129 movdl(vec2, tmp1);
4130 vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
4131
4132 bind(COMPARE_WIDE_VECTORS);
4133 vmovdqu(vec1, Address(ary1, len, Address::times_1));
4134 vptest(vec1, vec2);
4135 jccb(Assembler::notZero, BREAK_LOOP);
4136 addptr(len, 32);
4137 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4138
4139 testl(result, 0x0000001f); // any bytes remaining?
4140 jcc(Assembler::zero, DONE);
4141
4142 // Quick test using the already prepared vector mask
4143 movl(len, result);
4144 andl(len, 0x0000001f);
4145 vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
4146 vptest(vec1, vec2);
4147 jcc(Assembler::zero, DONE);
4148 // There are zeros, jump to the tail to determine exactly where
4149 jmpb(TAIL_START);
4150
4151 bind(BREAK_LOOP);
4152 // At least one byte in the last 32-byte vector is negative.
4153 // Set up to look at the last 32 bytes as if they were a tail
4154 lea(ary1, Address(ary1, len, Address::times_1));
4155 addptr(result, len);
4156 // Ignore the very last byte: if all others are positive,
4157 // it must be negative, so we can skip right to the 2+1 byte
4158 // end comparison at this point
4159 orl(result, 31);
4160 movl(len, 31);
4161 // Fallthru to tail compare
4162 } else if (UseSSE42Intrinsics) {
4163 // With SSE4.2, use double quad vector compare
4164 Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4165
4166 // Compare 16-byte vectors
4167 testl(len, 0xfffffff0); // vector count (in bytes)
4168 jcc(Assembler::zero, TAIL_START);
4169
4170 andl(len, 0xfffffff0);
4171 lea(ary1, Address(ary1, len, Address::times_1));
4172 negptr(len);
4173
4174 movl(tmp1, 0x80808080);
4175 movdl(vec2, tmp1);
4176 pshufd(vec2, vec2, 0);
4177
4178 bind(COMPARE_WIDE_VECTORS);
4179 movdqu(vec1, Address(ary1, len, Address::times_1));
4180 ptest(vec1, vec2);
4181 jccb(Assembler::notZero, BREAK_LOOP);
4182 addptr(len, 16);
4183 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4184
4185 testl(result, 0x0000000f); // len is zero, any bytes remaining?
4186 jcc(Assembler::zero, DONE);
4187
4188 // Quick test using the already prepared vector mask
4189 movl(len, result);
4190 andl(len, 0x0000000f); // tail count (in bytes)
4191 movdqu(vec1, Address(ary1, len, Address::times_1, -16));
4192 ptest(vec1, vec2);
4193 jcc(Assembler::zero, DONE);
4194 jmpb(TAIL_START);
4195
4196 bind(BREAK_LOOP);
4197 // At least one byte in the last 16-byte vector is negative.
4198 // Set up and look at the last 16 bytes as if they were a tail
4199 lea(ary1, Address(ary1, len, Address::times_1));
4200 addptr(result, len);
4201 // Ignore the very last byte: if all others are positive,
4202 // it must be negative, so we can skip right to the 2+1 byte
4203 // end comparison at this point
4204 orl(result, 15);
4205 movl(len, 15);
4206 // Fallthru to tail compare
4207 }
4208 }
4209
4210 bind(TAIL_START);
4211 // Compare 4-byte vectors
4212 andl(len, 0xfffffffc); // vector count (in bytes)
4213 jccb(Assembler::zero, COMPARE_CHAR);
4214
4215 lea(ary1, Address(ary1, len, Address::times_1));
4216 negptr(len);
4217
4218 bind(COMPARE_VECTORS);
4219 movl(tmp1, Address(ary1, len, Address::times_1));
4220 andl(tmp1, 0x80808080);
4221 jccb(Assembler::notZero, TAIL_ADJUST);
4222 addptr(len, 4);
4223 jccb(Assembler::notZero, COMPARE_VECTORS);
4224
4225 // Compare trailing char (final 2-3 bytes), if any
4226 bind(COMPARE_CHAR);
4227
4228 testl(result, 0x2); // tail char
4229 jccb(Assembler::zero, COMPARE_BYTE);
4230 load_unsigned_short(tmp1, Address(ary1, 0));
4231 andl(tmp1, 0x00008080);
4232 jccb(Assembler::notZero, CHAR_ADJUST);
4233 lea(ary1, Address(ary1, 2));
4234
4235 bind(COMPARE_BYTE);
4236 testl(result, 0x1); // tail byte
4237 jccb(Assembler::zero, DONE);
4238 load_unsigned_byte(tmp1, Address(ary1, 0));
4239 testl(tmp1, 0x00000080);
4240 jccb(Assembler::zero, DONE);
4241 subptr(result, 1);
4242 jmpb(DONE);
4243
4244 bind(TAIL_ADJUST);
4245 // there are negative bits in the last 4 byte block.
4246 // Adjust result and check the next three bytes
4247 addptr(result, len);
4248 orl(result, 3);
4249 lea(ary1, Address(ary1, len, Address::times_1));
4250 jmpb(COMPARE_CHAR);
4251
4252 bind(CHAR_ADJUST);
4253 // We are looking at a char + optional byte tail, and found that one
4254 // of the bytes in the char is negative. Adjust the result, check the
4255 // first byte and readjust if needed.
4256 andl(result, 0xfffffffc);
4257 testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
4258 jccb(Assembler::notZero, DONE);
4259 addptr(result, 1);
4260
4261 // That's it
4262 bind(DONE);
4263 if (UseAVX >= 2) {
4264 // clean upper bits of YMM registers
4265 vpxor(vec1, vec1);
4266 vpxor(vec2, vec2);
4267 }
4268 }
4269
4270 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
4271 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
4272 Register limit, Register result, Register chr,
4273 XMMRegister vec1, XMMRegister vec2, bool is_char,
4274 KRegister mask, bool expand_ary2) {
4275 // for expand_ary2, limit is the (smaller) size of the second array.
4276 ShortBranchVerifier sbv(this);
4277 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
4278
4279 assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)),
4280 "Expansion only implemented for AVX2");
4281
4282 int length_offset = arrayOopDesc::length_offset_in_bytes();
4283 int base_offset = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
4284
4285 Address::ScaleFactor scaleFactor = expand_ary2 ? Address::times_2 : Address::times_1;
4286 int scaleIncr = expand_ary2 ? 8 : 16;
4287
4288 if (is_array_equ) {
4289 // Check the input args
4290 cmpoop(ary1, ary2);
4291 jcc(Assembler::equal, TRUE_LABEL);
4292
4293 // Need additional checks for arrays_equals.
4294 testptr(ary1, ary1);
4295 jcc(Assembler::zero, FALSE_LABEL);
4296 testptr(ary2, ary2);
4297 jcc(Assembler::zero, FALSE_LABEL);
4298
4299 // Check the lengths
4300 movl(limit, Address(ary1, length_offset));
4301 cmpl(limit, Address(ary2, length_offset));
4302 jcc(Assembler::notEqual, FALSE_LABEL);
4303 }
4304
4305 // count == 0
4306 testl(limit, limit);
4307 jcc(Assembler::zero, TRUE_LABEL);
4308
4309 if (is_array_equ) {
4310 // Load array address
4311 lea(ary1, Address(ary1, base_offset));
4312 lea(ary2, Address(ary2, base_offset));
4313 }
4314
4315 if (is_array_equ && is_char) {
4316 // arrays_equals when used for char[].
4317 shll(limit, 1); // byte count != 0
4318 }
4319 movl(result, limit); // copy
4320
4321 if (UseAVX >= 2) {
4322 // With AVX2, use 32-byte vector compare
4323 Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16;
4324
4325 // Compare 32-byte vectors
4326 if (expand_ary2) {
4327 andl(result, 0x0000000f); // tail count (in bytes)
4328 andl(limit, 0xfffffff0); // vector count (in bytes)
4329 jcc(Assembler::zero, COMPARE_TAIL);
4330 } else {
4331 andl(result, 0x0000001f); // tail count (in bytes)
4332 andl(limit, 0xffffffe0); // vector count (in bytes)
4333 jcc(Assembler::zero, COMPARE_TAIL_16);
4334 }
4335
4336 lea(ary1, Address(ary1, limit, scaleFactor));
4337 lea(ary2, Address(ary2, limit, Address::times_1));
4338 negptr(limit);
4339
4340 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
4341 Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
4342
4343 cmpl(limit, -64);
4344 jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
4345
4346 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
4347
4348 evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
4349 evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
4350 kortestql(mask, mask);
4351 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare
4352 addptr(limit, 64); // update since we already compared at this addr
4353 cmpl(limit, -64);
4354 jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
4355
4356 // At this point we may still need to compare -limit+result bytes.
4357 // We could execute the next two instruction and just continue via non-wide path:
4358 // cmpl(limit, 0);
4359 // jcc(Assembler::equal, COMPARE_TAIL); // true
4360 // But since we stopped at the points ary{1,2}+limit which are
4361 // not farther than 64 bytes from the ends of arrays ary{1,2}+result
4362 // (|limit| <= 32 and result < 32),
4363 // we may just compare the last 64 bytes.
4364 //
4365 addptr(result, -64); // it is safe, bc we just came from this area
4366 evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
4367 evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
4368 kortestql(mask, mask);
4369 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare
4370
4371 jmp(TRUE_LABEL);
4372
4373 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
4374
4375 }//if (VM_Version::supports_avx512vlbw())
4376
4377 bind(COMPARE_WIDE_VECTORS);
4378 vmovdqu(vec1, Address(ary1, limit, scaleFactor));
4379 if (expand_ary2) {
4380 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit);
4381 } else {
4382 vmovdqu(vec2, Address(ary2, limit, Address::times_1));
4383 }
4384 vpxor(vec1, vec2);
4385
4386 vptest(vec1, vec1);
4387 jcc(Assembler::notZero, FALSE_LABEL);
4388 addptr(limit, scaleIncr * 2);
4389 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4390
4391 testl(result, result);
4392 jcc(Assembler::zero, TRUE_LABEL);
4393
4394 vmovdqu(vec1, Address(ary1, result, scaleFactor, -32));
4395 if (expand_ary2) {
4396 vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit);
4397 } else {
4398 vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
4399 }
4400 vpxor(vec1, vec2);
4401
4402 vptest(vec1, vec1);
4403 jcc(Assembler::notZero, FALSE_LABEL);
4404 jmp(TRUE_LABEL);
4405
4406 bind(COMPARE_TAIL_16); // limit is zero
4407 movl(limit, result);
4408
4409 // Compare 16-byte chunks
4410 andl(result, 0x0000000f); // tail count (in bytes)
4411 andl(limit, 0xfffffff0); // vector count (in bytes)
4412 jcc(Assembler::zero, COMPARE_TAIL);
4413
4414 lea(ary1, Address(ary1, limit, scaleFactor));
4415 lea(ary2, Address(ary2, limit, Address::times_1));
4416 negptr(limit);
4417
4418 bind(COMPARE_WIDE_VECTORS_16);
4419 movdqu(vec1, Address(ary1, limit, scaleFactor));
4420 if (expand_ary2) {
4421 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit);
4422 } else {
4423 movdqu(vec2, Address(ary2, limit, Address::times_1));
4424 }
4425 pxor(vec1, vec2);
4426
4427 ptest(vec1, vec1);
4428 jcc(Assembler::notZero, FALSE_LABEL);
4429 addptr(limit, scaleIncr);
4430 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16);
4431
4432 bind(COMPARE_TAIL); // limit is zero
4433 movl(limit, result);
4434 // Fallthru to tail compare
4435 } else if (UseSSE42Intrinsics) {
4436 // With SSE4.2, use double quad vector compare
4437 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4438
4439 // Compare 16-byte vectors
4440 andl(result, 0x0000000f); // tail count (in bytes)
4441 andl(limit, 0xfffffff0); // vector count (in bytes)
4442 jcc(Assembler::zero, COMPARE_TAIL);
4443
4444 lea(ary1, Address(ary1, limit, Address::times_1));
4445 lea(ary2, Address(ary2, limit, Address::times_1));
4446 negptr(limit);
4447
4448 bind(COMPARE_WIDE_VECTORS);
4449 movdqu(vec1, Address(ary1, limit, Address::times_1));
4450 movdqu(vec2, Address(ary2, limit, Address::times_1));
4451 pxor(vec1, vec2);
4452
4453 ptest(vec1, vec1);
4454 jcc(Assembler::notZero, FALSE_LABEL);
4455 addptr(limit, 16);
4456 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4457
4458 testl(result, result);
4459 jcc(Assembler::zero, TRUE_LABEL);
4460
4461 movdqu(vec1, Address(ary1, result, Address::times_1, -16));
4462 movdqu(vec2, Address(ary2, result, Address::times_1, -16));
4463 pxor(vec1, vec2);
4464
4465 ptest(vec1, vec1);
4466 jccb(Assembler::notZero, FALSE_LABEL);
4467 jmpb(TRUE_LABEL);
4468
4469 bind(COMPARE_TAIL); // limit is zero
4470 movl(limit, result);
4471 // Fallthru to tail compare
4472 }
4473
4474 // Compare 4-byte vectors
4475 if (expand_ary2) {
4476 testl(result, result);
4477 jccb(Assembler::zero, TRUE_LABEL);
4478 } else {
4479 andl(limit, 0xfffffffc); // vector count (in bytes)
4480 jccb(Assembler::zero, COMPARE_CHAR);
4481 }
4482
4483 lea(ary1, Address(ary1, limit, scaleFactor));
4484 lea(ary2, Address(ary2, limit, Address::times_1));
4485 negptr(limit);
4486
4487 bind(COMPARE_VECTORS);
4488 if (expand_ary2) {
4489 // There are no "vector" operations for bytes to shorts
4490 movzbl(chr, Address(ary2, limit, Address::times_1));
4491 cmpw(Address(ary1, limit, Address::times_2), chr);
4492 jccb(Assembler::notEqual, FALSE_LABEL);
4493 addptr(limit, 1);
4494 jcc(Assembler::notZero, COMPARE_VECTORS);
4495 jmp(TRUE_LABEL);
4496 } else {
4497 movl(chr, Address(ary1, limit, Address::times_1));
4498 cmpl(chr, Address(ary2, limit, Address::times_1));
4499 jccb(Assembler::notEqual, FALSE_LABEL);
4500 addptr(limit, 4);
4501 jcc(Assembler::notZero, COMPARE_VECTORS);
4502 }
4503
4504 // Compare trailing char (final 2 bytes), if any
4505 bind(COMPARE_CHAR);
4506 testl(result, 0x2); // tail char
4507 jccb(Assembler::zero, COMPARE_BYTE);
4508 load_unsigned_short(chr, Address(ary1, 0));
4509 load_unsigned_short(limit, Address(ary2, 0));
4510 cmpl(chr, limit);
4511 jccb(Assembler::notEqual, FALSE_LABEL);
4512
4513 if (is_array_equ && is_char) {
4514 bind(COMPARE_BYTE);
4515 } else {
4516 lea(ary1, Address(ary1, 2));
4517 lea(ary2, Address(ary2, 2));
4518
4519 bind(COMPARE_BYTE);
4520 testl(result, 0x1); // tail byte
4521 jccb(Assembler::zero, TRUE_LABEL);
4522 load_unsigned_byte(chr, Address(ary1, 0));
4523 load_unsigned_byte(limit, Address(ary2, 0));
4524 cmpl(chr, limit);
4525 jccb(Assembler::notEqual, FALSE_LABEL);
4526 }
4527 bind(TRUE_LABEL);
4528 movl(result, 1); // return true
4529 jmpb(DONE);
4530
4531 bind(FALSE_LABEL);
4532 xorl(result, result); // return false
4533
4534 // That's it
4535 bind(DONE);
4536 if (UseAVX >= 2) {
4537 // clean upper bits of YMM registers
4538 vpxor(vec1, vec1);
4539 vpxor(vec2, vec2);
4540 }
4541 }
4542
// Out-of-line slow path emitted for C2_MacroAssembler::convertF2I.
// The stub carries three data items: the destination general purpose register,
// the source XMM register and the address of the fixup routine to call.
static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) {
#define __ masm.
  Register dst = stub.data<0>();
  XMMRegister src = stub.data<1>();
  address target = stub.data<2>();
  __ bind(stub.entry());
  // Reserve an 8-byte stack slot and spill the source value into it.
  __ subptr(rsp, 8);
  __ movdbl(Address(rsp), src);
  // NOTE(review): the fixup routine presumably reads its operand from this slot and
  // overwrites it with the corrected result -- confirm against the stub generator.
  __ call(RuntimeAddress(target));
  // APX REX2 encoding for pop(dst) increases the stub size by 1 byte.
  // The pop releases the slot reserved above and loads its contents into dst.
  __ pop(dst);
  __ jmp(stub.continuation());
#undef __
}
4557
4558 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) {
4559 assert(dst_bt == T_INT || dst_bt == T_LONG, "");
4560 assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, "");
4561
4562 address slowpath_target;
4563 if (dst_bt == T_INT) {
4564 if (src_bt == T_FLOAT) {
4565 cvttss2sil(dst, src);
4566 cmpl(dst, 0x80000000);
4567 slowpath_target = StubRoutines::x86::f2i_fixup();
4568 } else {
4569 cvttsd2sil(dst, src);
4570 cmpl(dst, 0x80000000);
4571 slowpath_target = StubRoutines::x86::d2i_fixup();
4572 }
4573 } else {
4574 if (src_bt == T_FLOAT) {
4575 cvttss2siq(dst, src);
4576 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4577 slowpath_target = StubRoutines::x86::f2l_fixup();
4578 } else {
4579 cvttsd2siq(dst, src);
4580 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4581 slowpath_target = StubRoutines::x86::d2l_fixup();
4582 }
4583 }
4584
4585 // Using the APX extended general purpose registers increases the instruction encoding size by 1 byte.
4586 int max_size = 23 + (UseAPX ? 1 : 0);
4587 auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, max_size, convertF2I_slowpath);
4588 jcc(Assembler::equal, stub->entry());
4589 bind(stub->continuation());
4590 }
4591
4592 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4593 XMMRegister src1, int imm8, bool merge, int vlen_enc) {
4594 switch(ideal_opc) {
4595 case Op_LShiftVS:
4596 Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
4597 case Op_LShiftVI:
4598 Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
4599 case Op_LShiftVL:
4600 Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
4601 case Op_RShiftVS:
4602 Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
4603 case Op_RShiftVI:
4604 Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
4605 case Op_RShiftVL:
4606 Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
4607 case Op_URShiftVS:
4608 Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
4609 case Op_URShiftVI:
4610 Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
4611 case Op_URShiftVL:
4612 Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
4613 case Op_RotateRightV:
4614 evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4615 case Op_RotateLeftV:
4616 evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4617 default:
4618 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]);
4619 break;
4620 }
4621 }
4622
4623 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4624 XMMRegister src2, bool is_unsigned, bool merge, int vlen_enc) {
4625 if (is_unsigned) {
4626 evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4627 } else {
4628 evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4629 }
4630 }
4631
4632 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4633 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4634 switch (elem_bt) {
4635 case T_BYTE:
4636 if (ideal_opc == Op_SaturatingAddV) {
4637 evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4638 } else {
4639 assert(ideal_opc == Op_SaturatingSubV, "");
4640 evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4641 }
4642 break;
4643 case T_SHORT:
4644 if (ideal_opc == Op_SaturatingAddV) {
4645 evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4646 } else {
4647 assert(ideal_opc == Op_SaturatingSubV, "");
4648 evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4649 }
4650 break;
4651 default:
4652 fatal("Unsupported type %s", type2name(elem_bt));
4653 break;
4654 }
4655 }
4656
4657 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4658 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4659 switch (elem_bt) {
4660 case T_BYTE:
4661 if (ideal_opc == Op_SaturatingAddV) {
4662 evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4663 } else {
4664 assert(ideal_opc == Op_SaturatingSubV, "");
4665 evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4666 }
4667 break;
4668 case T_SHORT:
4669 if (ideal_opc == Op_SaturatingAddV) {
4670 evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4671 } else {
4672 assert(ideal_opc == Op_SaturatingSubV, "");
4673 evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4674 }
4675 break;
4676 default:
4677 fatal("Unsupported type %s", type2name(elem_bt));
4678 break;
4679 }
4680 }
4681
4682 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4683 Address src2, bool is_unsigned, bool merge, int vlen_enc) {
4684 if (is_unsigned) {
4685 evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4686 } else {
4687 evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4688 }
4689 }
4690
4691 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4692 XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4693 switch (elem_bt) {
4694 case T_BYTE:
4695 if (ideal_opc == Op_SaturatingAddV) {
4696 evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4697 } else {
4698 assert(ideal_opc == Op_SaturatingSubV, "");
4699 evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4700 }
4701 break;
4702 case T_SHORT:
4703 if (ideal_opc == Op_SaturatingAddV) {
4704 evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4705 } else {
4706 assert(ideal_opc == Op_SaturatingSubV, "");
4707 evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4708 }
4709 break;
4710 default:
4711 fatal("Unsupported type %s", type2name(elem_bt));
4712 break;
4713 }
4714 }
4715
4716 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4717 XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4718 switch (elem_bt) {
4719 case T_BYTE:
4720 if (ideal_opc == Op_SaturatingAddV) {
4721 evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4722 } else {
4723 assert(ideal_opc == Op_SaturatingSubV, "");
4724 evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4725 }
4726 break;
4727 case T_SHORT:
4728 if (ideal_opc == Op_SaturatingAddV) {
4729 evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4730 } else {
4731 assert(ideal_opc == Op_SaturatingSubV, "");
4732 evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4733 }
4734 break;
4735 default:
4736 fatal("Unsupported type %s", type2name(elem_bt));
4737 break;
4738 }
4739 }
4740
4741 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4742 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
4743 bool is_varshift) {
4744 switch (ideal_opc) {
4745 case Op_AddVB:
4746 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4747 case Op_AddVS:
4748 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4749 case Op_AddVI:
4750 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4751 case Op_AddVL:
4752 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4753 case Op_AddVF:
4754 evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4755 case Op_AddVD:
4756 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4757 case Op_SubVB:
4758 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4759 case Op_SubVS:
4760 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4761 case Op_SubVI:
4762 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4763 case Op_SubVL:
4764 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4765 case Op_SubVF:
4766 evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4767 case Op_SubVD:
4768 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4769 case Op_MulVS:
4770 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4771 case Op_MulVI:
4772 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4773 case Op_MulVL:
4774 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4775 case Op_MulVF:
4776 evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4777 case Op_MulVD:
4778 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4779 case Op_DivVF:
4780 evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4781 case Op_DivVD:
4782 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4783 case Op_SqrtVF:
4784 evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
4785 case Op_SqrtVD:
4786 evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
4787 case Op_AbsVB:
4788 evpabsb(dst, mask, src2, merge, vlen_enc); break;
4789 case Op_AbsVS:
4790 evpabsw(dst, mask, src2, merge, vlen_enc); break;
4791 case Op_AbsVI:
4792 evpabsd(dst, mask, src2, merge, vlen_enc); break;
4793 case Op_AbsVL:
4794 evpabsq(dst, mask, src2, merge, vlen_enc); break;
4795 case Op_FmaVF:
4796 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4797 case Op_FmaVD:
4798 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4799 case Op_VectorRearrange:
4800 evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
4801 case Op_LShiftVS:
4802 evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4803 case Op_LShiftVI:
4804 evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4805 case Op_LShiftVL:
4806 evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4807 case Op_RShiftVS:
4808 evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4809 case Op_RShiftVI:
4810 evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4811 case Op_RShiftVL:
4812 evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4813 case Op_URShiftVS:
4814 evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4815 case Op_URShiftVI:
4816 evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4817 case Op_URShiftVL:
4818 evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4819 case Op_RotateLeftV:
4820 evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4821 case Op_RotateRightV:
4822 evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4823 case Op_MaxV:
4824 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4825 case Op_MinV:
4826 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4827 case Op_UMinV:
4828 evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4829 case Op_UMaxV:
4830 evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4831 case Op_XorV:
4832 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4833 case Op_OrV:
4834 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4835 case Op_AndV:
4836 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4837 default:
4838 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]);
4839 break;
4840 }
4841 }
4842
4843 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4844 XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4845 switch (ideal_opc) {
4846 case Op_AddVB:
4847 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4848 case Op_AddVS:
4849 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4850 case Op_AddVI:
4851 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4852 case Op_AddVL:
4853 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4854 case Op_AddVF:
4855 evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4856 case Op_AddVD:
4857 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4858 case Op_SubVB:
4859 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4860 case Op_SubVS:
4861 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4862 case Op_SubVI:
4863 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4864 case Op_SubVL:
4865 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4866 case Op_SubVF:
4867 evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4868 case Op_SubVD:
4869 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4870 case Op_MulVS:
4871 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4872 case Op_MulVI:
4873 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4874 case Op_MulVL:
4875 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4876 case Op_MulVF:
4877 evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4878 case Op_MulVD:
4879 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4880 case Op_DivVF:
4881 evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4882 case Op_DivVD:
4883 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4884 case Op_FmaVF:
4885 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4886 case Op_FmaVD:
4887 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4888 case Op_MaxV:
4889 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4890 case Op_MinV:
4891 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4892 case Op_UMaxV:
4893 evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4894 case Op_UMinV:
4895 evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4896 case Op_XorV:
4897 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4898 case Op_OrV:
4899 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4900 case Op_AndV:
4901 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4902 default:
4903 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]);
4904 break;
4905 }
4906 }
4907
4908 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
4909 KRegister src1, KRegister src2) {
4910 BasicType etype = T_ILLEGAL;
4911 switch(mask_len) {
4912 case 2:
4913 case 4:
4914 case 8: etype = T_BYTE; break;
4915 case 16: etype = T_SHORT; break;
4916 case 32: etype = T_INT; break;
4917 case 64: etype = T_LONG; break;
4918 default: fatal("Unsupported type"); break;
4919 }
4920 assert(etype != T_ILLEGAL, "");
4921 switch(ideal_opc) {
4922 case Op_AndVMask:
4923 kand(etype, dst, src1, src2); break;
4924 case Op_OrVMask:
4925 kor(etype, dst, src1, src2); break;
4926 case Op_XorVMask:
4927 kxor(etype, dst, src1, src2); break;
4928 default:
4929 fatal("Unsupported masked operation"); break;
4930 }
4931 }
4932
4933 /*
4934 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation.
4935 * If src is NaN, the result is 0.
4936 * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE,
4937 * the result is equal to the value of Integer.MIN_VALUE.
4938 * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE,
4939 * the result is equal to the value of Integer.MAX_VALUE.
4940 */
4941 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4942 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
4943 Register rscratch, AddressLiteral float_sign_flip,
4944 int vec_enc) {
4945 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4946 Label done;
4947 vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
4948 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
4949 vptest(xtmp2, xtmp2, vec_enc);
4950 jccb(Assembler::equal, done);
4951
4952 vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
4953 vpxor(xtmp1, xtmp1, xtmp4, vec_enc);
4954
4955 vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
4956 vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
4957 vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);
4958
4959 // Recompute the mask for remaining special value.
4960 vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
4961 // Extract SRC values corresponding to TRUE mask lanes.
4962 vpand(xtmp4, xtmp2, src, vec_enc);
4963 // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special
4964 // values are set.
4965 vpxor(xtmp3, xtmp2, xtmp4, vec_enc);
4966
4967 vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
4968 bind(done);
4969 }
4970
// EVEX (opmask based) fix-up of float -> int conversion results.
// dst holds the already converted lanes and is patched in place; src holds the
// original float lanes. xtmp1/xtmp2 and ktmp1/ktmp2 are overwritten.
// rscratch must be a real register unless float_sign_flip is always reachable.
void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                    XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                    Register rscratch, AddressLiteral float_sign_flip,
                                                                    int vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
  Label done;
  // ktmp1 = dst lanes equal to the sign-flip constant; nothing to do if none.
  evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
  Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);

  // ktmp2 = NaN source lanes (src unordered with itself); merge zero into them.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);

  // Remove the NaN lanes from the special-value mask, then keep only the lanes
  // whose source is not less than zero (xtmp2 is still zero here).
  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  // Truth table 0x11 yields NOT(B OR C); with both inputs xtmp1 that is ~xtmp1,
  // i.e. the bitwise complement of the sign-flip constant.
  vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  // Merge the complemented constant into the remaining selected lanes.
  evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}
4992
// EVEX (opmask based) fix-up of float -> long conversion results.
// dst holds the already converted quadword lanes and is patched in place; src
// holds the original float lanes. xtmp1/xtmp2 and ktmp1/ktmp2 are overwritten.
// rscratch must be a real register unless double_sign_flip is always reachable.
void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                     Register rscratch, AddressLiteral double_sign_flip,
                                                                     int vec_enc) {
  assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");

  Label done;
  // ktmp1 = quadword dst lanes equal to the sign-flip constant; nothing to do if none.
  evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
  Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);

  // ktmp2 = NaN source lanes (single precision compare); merge zero into them.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);

  // Remove the NaN lanes from the special-value mask, then keep only the lanes
  // whose source is not less than zero (xtmp2 is still zero here).
  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  // Truth table 0x11 yields NOT(B OR C); with both inputs xtmp1 that is ~xtmp1.
  vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  // Merge the complemented constant into the remaining selected lanes.
  evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}
5015
// EVEX (opmask based) fix-up of double -> int conversion results.
// dst holds the already converted doubleword lanes and is patched in place; src
// holds the original double lanes. xtmp1/xtmp2 and ktmp1/ktmp2 are overwritten.
// rscratch must be a real register unless float_sign_flip is always reachable.
void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                     Register rscratch, AddressLiteral float_sign_flip,
                                                                     int vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
  Label done;
  // ktmp1 = doubleword dst lanes equal to the sign-flip constant; nothing to do if none.
  evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
  Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);

  // ktmp2 = NaN source lanes (double precision compare); merge zero into them.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);

  // Remove the NaN lanes from the special-value mask, then keep only the lanes
  // whose source is not less than zero (xtmp2 is still zero here).
  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  // Truth table 0x11 yields NOT(B OR C); with both inputs xtmp1 that is ~xtmp1.
  vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  // Merge the complemented constant into the remaining selected lanes.
  evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}
5037
5038 /*
5039 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation.
5040 * If src is NaN, the result is 0.
5041 * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
5042 * the result is equal to the value of Long.MIN_VALUE.
5043 * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
5044 * the result is equal to the value of Long.MAX_VALUE.
5045 */
5046 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5047 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5048 Register rscratch, AddressLiteral double_sign_flip,
5049 int vec_enc) {
5050 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
5051
5052 Label done;
5053 evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
5054 evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
5055 kortestwl(ktmp1, ktmp1);
5056 jccb(Assembler::equal, done);
5057
5058 vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5059 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5060 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
5061
5062 kxorwl(ktmp1, ktmp1, ktmp2);
5063 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5064 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5065 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
5066 bind(done);
5067 }
5068
5069 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
5070 XMMRegister xtmp, int index, int vec_enc) {
5071 assert(vec_enc < Assembler::AVX_512bit, "");
5072 if (vec_enc == Assembler::AVX_256bit) {
5073 vextractf128_high(xtmp, src);
5074 vshufps(dst, src, xtmp, index, vec_enc);
5075 } else {
5076 vshufps(dst, src, zero, index, vec_enc);
5077 }
5078 }
5079
// AVX fix-up of double -> int conversion results.
// dst holds the already converted doubleword lanes (always handled as a 128-bit
// vector here) and is patched in place; src holds the original double lanes,
// whose vector length is given by src_vec_enc. xtmp1..xtmp5 are temporaries;
// note that they are only written on the slow path, i.e. when at least one dst
// lane equals the sign-flip constant. rscratch must be a real register unless
// float_sign_flip is always reachable.
void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                                                    XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
                                                                    AddressLiteral float_sign_flip, int src_vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");

  Label done;
  // Compare the destination lanes with float_sign_flip
  // value to get mask for all special values.
  movdqu(xtmp1, float_sign_flip, rscratch);
  vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
  ptest(xtmp2, xtmp2);
  // Fast path: no lane matched, leave dst (and the remaining temporaries) untouched.
  jccb(Assembler::equal, done);

  // Flip float_sign_flip to get max integer value.
  vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
  pxor(xtmp1, xtmp4);

  // Set destination lanes corresponding to unordered source lanes as zero.
  vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
  vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);

  // Shuffle mask vector and pack lower doubleword from each quadword lane.
  vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
  vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);

  // Recompute the mask for remaining special value.
  pxor(xtmp2, xtmp3);
  // Extract mask corresponding to non-negative source lanes.
  vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);

  // Shuffle mask vector and pack lower doubleword from each quadword lane.
  vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
  pand(xtmp3, xtmp2);

  // Replace destination lanes holding special value(0x80000000) with max int
  // if corresponding source lane holds a +ve value.
  vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
  bind(done);
}
5119
5120
5121 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
5122 XMMRegister xtmp, Register rscratch, int vec_enc) {
5123 switch(to_elem_bt) {
5124 case T_SHORT:
5125 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
5126 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
5127 vpackusdw(dst, dst, zero, vec_enc);
5128 if (vec_enc == Assembler::AVX_256bit) {
5129 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5130 }
5131 break;
5132 case T_BYTE:
5133 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
5134 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
5135 vpackusdw(dst, dst, zero, vec_enc);
5136 if (vec_enc == Assembler::AVX_256bit) {
5137 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5138 }
5139 vpackuswb(dst, dst, zero, vec_enc);
5140 break;
5141 default: assert(false, "Unexpected basic type for target of vector cast int to subword: %s", type2name(to_elem_bt));
5142 }
5143 }
5144
5145 /*
5146 * Algorithm for vector D2L and F2I conversions (AVX 10.2 unsupported):-
5147 * a) Perform vector D2L/F2I cast.
 * b) Choose the fast path if none of the result vector lanes contains the value 0x80000000.
 *    A lane holding 0x80000000 signifies that the source value could be any of the
 *    special floating point values (NaN, -Inf, Inf, Max, -Min).
5151 * c) Set destination to zero if source is NaN value.
5152 * d) Replace 0x80000000 with MaxInt if source lane contains a +ve value.
5153 */
5154
5155 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5156 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
5157 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5158 int to_elem_sz = type2aelembytes(to_elem_bt);
5159 assert(to_elem_sz <= 4, "");
5160 vcvttps2dq(dst, src, vec_enc);
5161 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc);
5162 if (to_elem_sz < 4) {
5163 vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
5164 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc);
5165 }
5166 }
5167
5168 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5169 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
5170 Register rscratch, int vec_enc) {
5171 int to_elem_sz = type2aelembytes(to_elem_bt);
5172 assert(to_elem_sz <= 4, "");
5173 vcvttps2dq(dst, src, vec_enc);
5174 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc);
5175 switch(to_elem_bt) {
5176 case T_INT:
5177 break;
5178 case T_SHORT:
5179 evpmovdw(dst, dst, vec_enc);
5180 break;
5181 case T_BYTE:
5182 evpmovdb(dst, dst, vec_enc);
5183 break;
5184 default: assert(false, "Unexpected basic type for target of vector castF2X EVEX: %s", type2name(to_elem_bt));
5185 }
5186 }
5187
// EVEX float -> long vector cast: truncating conversion followed by the
// special-value fix-up of the resulting quadword lanes.
// xtmp1/xtmp2 and ktmp1/ktmp2 are temporaries handed to the fix-up routine.
void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                            KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
                                            Register rscratch, int vec_enc) {
  evcvttps2qq(dst, src, vec_enc);
  vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc);
}
5194
5195 // Handling for downcasting from double to integer or sub-word types on AVX2.
5196 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5197 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5,
5198 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5199 int to_elem_sz = type2aelembytes(to_elem_bt);
5200 assert(to_elem_sz < 8, "");
5201 vcvttpd2dq(dst, src, vec_enc);
5202 vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch,
5203 float_sign_flip, vec_enc);
5204 if (to_elem_sz < 4) {
5205 // xtmp4 holds all zero lanes.
5206 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit);
5207 }
5208 }
5209
5210 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src,
5211 XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1,
5212 KRegister ktmp2, AddressLiteral sign_flip,
5213 Register rscratch, int vec_enc) {
5214 if (VM_Version::supports_avx512dq()) {
5215 evcvttpd2qq(dst, src, vec_enc);
5216 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5217 switch(to_elem_bt) {
5218 case T_LONG:
5219 break;
5220 case T_INT:
5221 evpmovsqd(dst, dst, vec_enc);
5222 break;
5223 case T_SHORT:
5224 evpmovsqd(dst, dst, vec_enc);
5225 evpmovdw(dst, dst, vec_enc);
5226 break;
5227 case T_BYTE:
5228 evpmovsqd(dst, dst, vec_enc);
5229 evpmovdb(dst, dst, vec_enc);
5230 break;
5231 default: assert(false, "Unexpected basic type for target of vector castD2X AVX512DQ EVEX: %s", type2name(to_elem_bt));
5232 }
5233 } else {
5234 assert(type2aelembytes(to_elem_bt) <= 4, "");
5235 vcvttpd2dq(dst, src, vec_enc);
5236 vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5237 switch(to_elem_bt) {
5238 case T_INT:
5239 break;
5240 case T_SHORT:
5241 evpmovdw(dst, dst, vec_enc);
5242 break;
5243 case T_BYTE:
5244 evpmovdb(dst, dst, vec_enc);
5245 break;
5246 default: assert(false, "Unexpected basic type for target of vector castD2X EVEX: %s", type2name(to_elem_bt));
5247 }
5248 }
5249 }
5250
5251 void C2_MacroAssembler::vector_castF2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5252 switch(to_elem_bt) {
5253 case T_LONG:
5254 evcvttps2qqs(dst, src, vec_enc);
5255 break;
5256 case T_INT:
5257 evcvttps2dqs(dst, src, vec_enc);
5258 break;
5259 case T_SHORT:
5260 evcvttps2dqs(dst, src, vec_enc);
5261 evpmovdw(dst, dst, vec_enc);
5262 break;
5263 case T_BYTE:
5264 evcvttps2dqs(dst, src, vec_enc);
5265 evpmovdb(dst, dst, vec_enc);
5266 break;
5267 default: assert(false, "Unexpected basic type for target of vector castF2X AVX10 (reg src): %s", type2name(to_elem_bt));
5268 }
5269 }
5270
5271 void C2_MacroAssembler::vector_castF2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, Address src, int vec_enc) {
5272 switch(to_elem_bt) {
5273 case T_LONG:
5274 evcvttps2qqs(dst, src, vec_enc);
5275 break;
5276 case T_INT:
5277 evcvttps2dqs(dst, src, vec_enc);
5278 break;
5279 case T_SHORT:
5280 evcvttps2dqs(dst, src, vec_enc);
5281 evpmovdw(dst, dst, vec_enc);
5282 break;
5283 case T_BYTE:
5284 evcvttps2dqs(dst, src, vec_enc);
5285 evpmovdb(dst, dst, vec_enc);
5286 break;
5287 default: assert(false, "Unexpected basic type for target of vector castF2X AVX10 (mem src): %s", type2name(to_elem_bt));
5288 }
5289 }
5290
5291 void C2_MacroAssembler::vector_castD2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5292 switch(to_elem_bt) {
5293 case T_LONG:
5294 evcvttpd2qqs(dst, src, vec_enc);
5295 break;
5296 case T_INT:
5297 evcvttpd2dqs(dst, src, vec_enc);
5298 break;
5299 case T_SHORT:
5300 evcvttpd2dqs(dst, src, vec_enc);
5301 evpmovdw(dst, dst, vec_enc);
5302 break;
5303 case T_BYTE:
5304 evcvttpd2dqs(dst, src, vec_enc);
5305 evpmovdb(dst, dst, vec_enc);
5306 break;
5307 default: assert(false, "Unexpected basic type for target of vector castD2X AVX10 (reg src): %s", type2name(to_elem_bt));
5308 }
5309 }
5310
5311 void C2_MacroAssembler::vector_castD2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, Address src, int vec_enc) {
5312 switch(to_elem_bt) {
5313 case T_LONG:
5314 evcvttpd2qqs(dst, src, vec_enc);
5315 break;
5316 case T_INT:
5317 evcvttpd2dqs(dst, src, vec_enc);
5318 break;
5319 case T_SHORT:
5320 evcvttpd2dqs(dst, src, vec_enc);
5321 evpmovdw(dst, dst, vec_enc);
5322 break;
5323 case T_BYTE:
5324 evcvttpd2dqs(dst, src, vec_enc);
5325 evpmovdb(dst, dst, vec_enc);
5326 break;
5327 default: assert(false, "Unexpected basic type for target of vector castD2X AVX10 (mem src): %s", type2name(to_elem_bt));
5328 }
5329 }
5330
5331 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
5332 AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5333 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
5334 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf.
5335 // and re-instantiate original MXCSR.RC mode after that.
5336 ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5337
5338 mov64(tmp, julong_cast(0.5L));
5339 evpbroadcastq(xtmp1, tmp, vec_enc);
5340 vaddpd(xtmp1, src , xtmp1, vec_enc);
5341 evcvtpd2qq(dst, xtmp1, vec_enc);
5342 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5343 double_sign_flip, vec_enc);;
5344
5345 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5346 }
5347
5348 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
5349 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5350 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
5351 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf.
5352 // and re-instantiate original MXCSR.RC mode after that.
5353 ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5354
5355 movl(tmp, jint_cast(0.5));
5356 movq(xtmp1, tmp);
5357 vbroadcastss(xtmp1, xtmp1, vec_enc);
5358 vaddps(xtmp1, src , xtmp1, vec_enc);
5359 vcvtps2dq(dst, xtmp1, vec_enc);
5360 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5361 float_sign_flip, vec_enc);
5362
5363 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5364 }
5365
5366 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
5367 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5368 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
5369 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf.
5370 // and re-instantiate original MXCSR.RC mode after that.
5371 ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5372
5373 movl(tmp, jint_cast(0.5));
5374 movq(xtmp1, tmp);
5375 vbroadcastss(xtmp1, xtmp1, vec_enc);
5376 vaddps(xtmp1, src , xtmp1, vec_enc);
5377 vcvtps2dq(dst, xtmp1, vec_enc);
5378 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc);
5379
5380 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5381 }
5382
5383 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5384 BasicType from_elem_bt, BasicType to_elem_bt) {
5385 switch (from_elem_bt) {
5386 case T_BYTE:
5387 switch (to_elem_bt) {
5388 case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
5389 case T_INT: vpmovzxbd(dst, src, vlen_enc); break;
5390 case T_LONG: vpmovzxbq(dst, src, vlen_enc); break;
5391 default: ShouldNotReachHere();
5392 }
5393 break;
5394 case T_SHORT:
5395 switch (to_elem_bt) {
5396 case T_INT: vpmovzxwd(dst, src, vlen_enc); break;
5397 case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
5398 default: ShouldNotReachHere();
5399 }
5400 break;
5401 case T_INT:
5402 assert(to_elem_bt == T_LONG, "");
5403 vpmovzxdq(dst, src, vlen_enc);
5404 break;
5405 default:
5406 ShouldNotReachHere();
5407 }
5408 }
5409
5410 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5411 BasicType from_elem_bt, BasicType to_elem_bt) {
5412 switch (from_elem_bt) {
5413 case T_BYTE:
5414 switch (to_elem_bt) {
5415 case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break;
5416 case T_INT: vpmovsxbd(dst, src, vlen_enc); break;
5417 case T_LONG: vpmovsxbq(dst, src, vlen_enc); break;
5418 default: ShouldNotReachHere();
5419 }
5420 break;
5421 case T_SHORT:
5422 switch (to_elem_bt) {
5423 case T_INT: vpmovsxwd(dst, src, vlen_enc); break;
5424 case T_LONG: vpmovsxwq(dst, src, vlen_enc); break;
5425 default: ShouldNotReachHere();
5426 }
5427 break;
5428 case T_INT:
5429 assert(to_elem_bt == T_LONG, "");
5430 vpmovsxdq(dst, src, vlen_enc);
5431 break;
5432 default:
5433 ShouldNotReachHere();
5434 }
5435 }
5436
5437 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src,
5438 BasicType dst_bt, BasicType src_bt, int vlen) {
5439 int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen);
5440 assert(vlen_enc != AVX_512bit, "");
5441
5442 int dst_bt_size = type2aelembytes(dst_bt);
5443 int src_bt_size = type2aelembytes(src_bt);
5444 if (dst_bt_size > src_bt_size) {
5445 switch (dst_bt_size / src_bt_size) {
5446 case 2: vpmovsxbw(dst, src, vlen_enc); break;
5447 case 4: vpmovsxbd(dst, src, vlen_enc); break;
5448 case 8: vpmovsxbq(dst, src, vlen_enc); break;
5449 default: ShouldNotReachHere();
5450 }
5451 } else {
5452 assert(dst_bt_size < src_bt_size, "");
5453 switch (src_bt_size / dst_bt_size) {
5454 case 2: {
5455 if (vlen_enc == AVX_128bit) {
5456 vpacksswb(dst, src, src, vlen_enc);
5457 } else {
5458 vpacksswb(dst, src, src, vlen_enc);
5459 vpermq(dst, dst, 0x08, vlen_enc);
5460 }
5461 break;
5462 }
5463 case 4: {
5464 if (vlen_enc == AVX_128bit) {
5465 vpackssdw(dst, src, src, vlen_enc);
5466 vpacksswb(dst, dst, dst, vlen_enc);
5467 } else {
5468 vpackssdw(dst, src, src, vlen_enc);
5469 vpermq(dst, dst, 0x08, vlen_enc);
5470 vpacksswb(dst, dst, dst, AVX_128bit);
5471 }
5472 break;
5473 }
5474 case 8: {
5475 if (vlen_enc == AVX_128bit) {
5476 vpshufd(dst, src, 0x08, vlen_enc);
5477 vpackssdw(dst, dst, dst, vlen_enc);
5478 vpacksswb(dst, dst, dst, vlen_enc);
5479 } else {
5480 vpshufd(dst, src, 0x08, vlen_enc);
5481 vpermq(dst, dst, 0x08, vlen_enc);
5482 vpackssdw(dst, dst, dst, AVX_128bit);
5483 vpacksswb(dst, dst, dst, AVX_128bit);
5484 }
5485 break;
5486 }
5487 default: ShouldNotReachHere();
5488 }
5489 }
5490 }
5491
5492 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
5493 bool merge, BasicType bt, int vlen_enc) {
5494 if (bt == T_INT) {
5495 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5496 } else {
5497 assert(bt == T_LONG, "");
5498 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5499 }
5500 }
5501
5502 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
5503 bool merge, BasicType bt, int vlen_enc) {
5504 if (bt == T_INT) {
5505 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5506 } else {
5507 assert(bt == T_LONG, "");
5508 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5509 }
5510 }
5511
5512 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
5513 Register rtmp2, XMMRegister xtmp, int mask_len,
5514 int vec_enc) {
5515 int index = 0;
5516 int vindex = 0;
5517 mov64(rtmp1, 0x0101010101010101L);
5518 pdepq(rtmp1, src, rtmp1);
5519 if (mask_len > 8) {
5520 movq(rtmp2, src);
5521 vpxor(xtmp, xtmp, xtmp, vec_enc);
5522 movq(xtmp, rtmp1);
5523 }
5524 movq(dst, rtmp1);
5525
5526 mask_len -= 8;
5527 while (mask_len > 0) {
5528 assert ((mask_len & 0x7) == 0, "mask must be multiple of 8");
5529 index++;
5530 if ((index % 2) == 0) {
5531 pxor(xtmp, xtmp);
5532 }
5533 mov64(rtmp1, 0x0101010101010101L);
5534 shrq(rtmp2, 8);
5535 pdepq(rtmp1, rtmp2, rtmp1);
5536 pinsrq(xtmp, rtmp1, index % 2);
5537 vindex = index / 2;
5538 if (vindex) {
5539 // Write entire 16 byte vector when both 64 bit
5540 // lanes are update to save redundant instructions.
5541 if (index % 2) {
5542 vinsertf128(dst, dst, xtmp, vindex);
5543 }
5544 } else {
5545 vmovdqu(dst, xtmp);
5546 }
5547 mask_len -= 8;
5548 }
5549 }
5550
5551 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
5552 switch(opc) {
5553 case Op_VectorMaskTrueCount:
5554 popcntq(dst, tmp);
5555 break;
5556 case Op_VectorMaskLastTrue:
5557 if (VM_Version::supports_lzcnt()) {
5558 lzcntq(tmp, tmp);
5559 movl(dst, 63);
5560 subl(dst, tmp);
5561 } else {
5562 movl(dst, -1);
5563 bsrq(tmp, tmp);
5564 cmov32(Assembler::notZero, dst, tmp);
5565 }
5566 break;
5567 case Op_VectorMaskFirstTrue:
5568 if (UseCountTrailingZerosInstruction) {
5569 if (masklen < 32) {
5570 orl(tmp, 1 << masklen);
5571 tzcntl(dst, tmp);
5572 } else if (masklen == 32) {
5573 tzcntl(dst, tmp);
5574 } else {
5575 assert(masklen == 64, "");
5576 tzcntq(dst, tmp);
5577 }
5578 } else {
5579 if (masklen < 32) {
5580 orl(tmp, 1 << masklen);
5581 bsfl(dst, tmp);
5582 } else {
5583 assert(masklen == 32 || masklen == 64, "");
5584 movl(dst, masklen);
5585 if (masklen == 32) {
5586 bsfl(tmp, tmp);
5587 } else {
5588 bsfq(tmp, tmp);
5589 }
5590 cmov32(Assembler::notZero, dst, tmp);
5591 }
5592 }
5593 break;
5594 case Op_VectorMaskToLong:
5595 assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
5596 break;
5597 default: assert(false, "Unhandled mask operation");
5598 }
5599 }
5600
5601 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
5602 int masklen, int masksize, int vec_enc) {
5603 assert(VM_Version::supports_popcnt(), "");
5604
5605 if(VM_Version::supports_avx512bw()) {
5606 kmovql(tmp, mask);
5607 } else {
5608 assert(masklen <= 16, "");
5609 kmovwl(tmp, mask);
5610 }
5611
5612 // Mask generated out of partial vector comparisons/replicate/mask manipulation
5613 // operations needs to be clipped.
5614 if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
5615 andq(tmp, (1 << masklen) - 1);
5616 }
5617
5618 vector_mask_operation_helper(opc, dst, tmp, masklen);
5619 }
5620
5621 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
5622 Register tmp, int masklen, BasicType bt, int vec_enc) {
5623 assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) ||
5624 (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), "");
5625 assert(VM_Version::supports_popcnt(), "");
5626
5627 bool need_clip = false;
5628 switch(bt) {
5629 case T_BOOLEAN:
5630 // While masks of other types contain 0, -1; boolean masks contain lane values of 0, 1
5631 vpxor(xtmp, xtmp, xtmp, vec_enc);
5632 vpsubb(xtmp, xtmp, mask, vec_enc);
5633 vpmovmskb(tmp, xtmp, vec_enc);
5634 need_clip = masklen < 16;
5635 break;
5636 case T_BYTE:
5637 vpmovmskb(tmp, mask, vec_enc);
5638 need_clip = masklen < 16;
5639 break;
5640 case T_SHORT:
5641 vpacksswb(xtmp, mask, mask, vec_enc);
5642 if (masklen >= 16) {
5643 vpermpd(xtmp, xtmp, 8, vec_enc);
5644 }
5645 vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
5646 need_clip = masklen < 16;
5647 break;
5648 case T_INT:
5649 case T_FLOAT:
5650 vmovmskps(tmp, mask, vec_enc);
5651 need_clip = masklen < 4;
5652 break;
5653 case T_LONG:
5654 case T_DOUBLE:
5655 vmovmskpd(tmp, mask, vec_enc);
5656 need_clip = masklen < 2;
5657 break;
5658 default: assert(false, "Unhandled type, %s", type2name(bt));
5659 }
5660
5661 // Mask generated out of partial vector comparisons/replicate/mask manipulation
5662 // operations needs to be clipped.
5663 if (need_clip && opc != Op_VectorMaskFirstTrue) {
5664 // need_clip implies masklen < 32
5665 andq(tmp, (1 << masklen) - 1);
5666 }
5667
5668 vector_mask_operation_helper(opc, dst, tmp, masklen);
5669 }
5670
5671 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
5672 Register rtmp2, int mask_len) {
5673 kmov(rtmp1, src);
5674 andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
5675 mov64(rtmp2, -1L);
5676 pextq(rtmp2, rtmp2, rtmp1);
5677 kmov(dst, rtmp2);
5678 }
5679
5680 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src,
5681 XMMRegister mask, Register rtmp, Register rscratch,
5682 XMMRegister permv, XMMRegister xtmp, BasicType bt,
5683 int vec_enc) {
5684 assert(type2aelembytes(bt) >= 4, "");
5685 assert(opcode == Op_CompressV || opcode == Op_ExpandV, "");
5686 address compress_perm_table = nullptr;
5687 address expand_perm_table = nullptr;
5688 if (type2aelembytes(bt) == 8) {
5689 compress_perm_table = StubRoutines::x86::compress_perm_table64();
5690 expand_perm_table = StubRoutines::x86::expand_perm_table64();
5691 vmovmskpd(rtmp, mask, vec_enc);
5692 } else {
5693 compress_perm_table = StubRoutines::x86::compress_perm_table32();
5694 expand_perm_table = StubRoutines::x86::expand_perm_table32();
5695 vmovmskps(rtmp, mask, vec_enc);
5696 }
5697 shlq(rtmp, 5); // for 32 byte permute row.
5698 if (opcode == Op_CompressV) {
5699 lea(rscratch, ExternalAddress(compress_perm_table));
5700 } else {
5701 lea(rscratch, ExternalAddress(expand_perm_table));
5702 }
5703 addptr(rtmp, rscratch);
5704 vmovdqu(permv, Address(rtmp));
5705 vpermps(dst, permv, src, Assembler::AVX_256bit);
5706 vpxor(xtmp, xtmp, xtmp, vec_enc);
5707 // Blend the result with zero vector using permute mask, each column entry
5708 // in a permute table row contains either a valid permute index or a -1 (default)
5709 // value, this can potentially be used as a blending mask after
5710 // compressing/expanding the source vector lanes.
5711 vblendvps(dst, dst, xtmp, permv, vec_enc, true, permv);
5712 }
5713
5714 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
5715 bool merge, BasicType bt, int vec_enc) {
5716 if (opcode == Op_CompressV) {
5717 switch(bt) {
5718 case T_BYTE:
5719 evpcompressb(dst, mask, src, merge, vec_enc);
5720 break;
5721 case T_CHAR:
5722 case T_SHORT:
5723 evpcompressw(dst, mask, src, merge, vec_enc);
5724 break;
5725 case T_INT:
5726 evpcompressd(dst, mask, src, merge, vec_enc);
5727 break;
5728 case T_FLOAT:
5729 evcompressps(dst, mask, src, merge, vec_enc);
5730 break;
5731 case T_LONG:
5732 evpcompressq(dst, mask, src, merge, vec_enc);
5733 break;
5734 case T_DOUBLE:
5735 evcompresspd(dst, mask, src, merge, vec_enc);
5736 break;
5737 default:
5738 fatal("Unsupported type %s", type2name(bt));
5739 break;
5740 }
5741 } else {
5742 assert(opcode == Op_ExpandV, "");
5743 switch(bt) {
5744 case T_BYTE:
5745 evpexpandb(dst, mask, src, merge, vec_enc);
5746 break;
5747 case T_CHAR:
5748 case T_SHORT:
5749 evpexpandw(dst, mask, src, merge, vec_enc);
5750 break;
5751 case T_INT:
5752 evpexpandd(dst, mask, src, merge, vec_enc);
5753 break;
5754 case T_FLOAT:
5755 evexpandps(dst, mask, src, merge, vec_enc);
5756 break;
5757 case T_LONG:
5758 evpexpandq(dst, mask, src, merge, vec_enc);
5759 break;
5760 case T_DOUBLE:
5761 evexpandpd(dst, mask, src, merge, vec_enc);
5762 break;
5763 default:
5764 fatal("Unsupported type %s", type2name(bt));
5765 break;
5766 }
5767 }
5768 }
5769
5770 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5771 KRegister ktmp1, int vec_enc) {
5772 if (opcode == Op_SignumVD) {
5773 vsubpd(dst, zero, one, vec_enc);
5774 // if src < 0 ? -1 : 1
5775 evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5776 evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
5777 // if src == NaN, -0.0 or 0.0 return src.
5778 evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5779 evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
5780 } else {
5781 assert(opcode == Op_SignumVF, "");
5782 vsubps(dst, zero, one, vec_enc);
5783 // if src < 0 ? -1 : 1
5784 evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5785 evblendmps(dst, ktmp1, one, dst, true, vec_enc);
5786 // if src == NaN, -0.0 or 0.0 return src.
5787 evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5788 evblendmps(dst, ktmp1, dst, src, true, vec_enc);
5789 }
5790 }
5791
5792 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5793 XMMRegister xtmp1, int vec_enc) {
5794 if (opcode == Op_SignumVD) {
5795 vsubpd(dst, zero, one, vec_enc);
5796 // if src < 0 ? -1 : 1
5797 vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1);
5798 // if src == NaN, -0.0 or 0.0 return src.
5799 vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5800 vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5801 } else {
5802 assert(opcode == Op_SignumVF, "");
5803 vsubps(dst, zero, one, vec_enc);
5804 // if src < 0 ? -1 : 1
5805 vblendvps(dst, one, dst, src, vec_enc, true, xtmp1);
5806 // if src == NaN, -0.0 or 0.0 return src.
5807 vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5808 vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5809 }
5810 }
5811
5812 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
5813 if (VM_Version::supports_avx512bw()) {
5814 if (mask_len > 32) {
5815 kmovql(dst, src);
5816 } else {
5817 kmovdl(dst, src);
5818 if (mask_len != 32) {
5819 kshiftrdl(dst, dst, 32 - mask_len);
5820 }
5821 }
5822 } else {
5823 assert(mask_len <= 16, "");
5824 kmovwl(dst, src);
5825 if (mask_len != 16) {
5826 kshiftrwl(dst, dst, 16 - mask_len);
5827 }
5828 }
5829 }
5830
5831 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5832 int lane_size = type2aelembytes(bt);
5833 if ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5834 (is_subword_type(bt) && VM_Version::supports_avx512vlbw())) {
5835 movptr(rtmp, imm32);
5836 switch(lane_size) {
5837 case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5838 case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5839 case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5840 case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
5841 fatal("Unsupported lane size %d", lane_size);
5842 break;
5843 }
5844 } else {
5845 movptr(rtmp, imm32);
5846 movq(dst, rtmp);
5847 switch(lane_size) {
5848 case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5849 case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5850 case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5851 case 8 : vpbroadcastq(dst, dst, vec_enc); break;
5852 fatal("Unsupported lane size %d", lane_size);
5853 break;
5854 }
5855 }
5856 }
5857
5858 //
5859 // Following is lookup table based popcount computation algorithm:-
5860 // Index Bit set count
5861 // [ 0000 -> 0,
5862 // 0001 -> 1,
5863 // 0010 -> 1,
5864 // 0011 -> 2,
5865 // 0100 -> 1,
5866 // 0101 -> 2,
5867 // 0110 -> 2,
5868 // 0111 -> 3,
// 1000 -> 1,
// 1001 -> 2,
// 1010 -> 2,
// 1011 -> 3,
// 1100 -> 2,
// 1101 -> 3,
// 1110 -> 3,
// 1111 -> 4 ]
5876 // a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
5877 // shuffle indices for lookup table access.
5878 // b. Right shift each byte of vector lane by 4 positions.
5879 // c. Count the number of 1s in 4 MSB bits each byte. These bits are used as
5880 // shuffle indices for lookup table access.
5881 // d. Add the bitset count of upper and lower 4 bits of each byte.
5882 // e. Unpack double words to quad words and compute sum of absolute difference of bitset
5883 // count of all the bytes of a quadword.
5884 // f. Perform step e. for upper 128bit vector lane.
5885 // g. Pack the bitset count of quadwords back to double word.
5886 // h. Unpacking and packing operations are not needed for 64bit vector lane.
5887
5888 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5889 XMMRegister xtmp2, Register rtmp, int vec_enc) {
5890 assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
5891 vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
5892 vpsrlw(dst, src, 4, vec_enc);
5893 vpand(dst, dst, xtmp1, vec_enc);
5894 vpand(xtmp1, src, xtmp1, vec_enc);
5895 vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
5896 vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);
5897 vpshufb(dst, xtmp2, dst, vec_enc);
5898 vpaddb(dst, dst, xtmp1, vec_enc);
5899 }
5900
5901 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5902 XMMRegister xtmp2, Register rtmp, int vec_enc) {
5903 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5904 // Following code is as per steps e,f,g and h of above algorithm.
5905 vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5906 vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
5907 vpsadbw(dst, dst, xtmp2, vec_enc);
5908 vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
5909 vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
5910 vpackuswb(dst, xtmp1, dst, vec_enc);
5911 }
5912
5913 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5914 XMMRegister xtmp2, Register rtmp, int vec_enc) {
5915 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5916 // Add the popcount of upper and lower bytes of word.
5917 vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
5918 vpsrlw(dst, xtmp1, 8, vec_enc);
5919 vpand(xtmp1, xtmp1, xtmp2, vec_enc);
5920 vpaddw(dst, dst, xtmp1, vec_enc);
5921 }
5922
5923 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5924 XMMRegister xtmp2, Register rtmp, int vec_enc) {
5925 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5926 vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5927 vpsadbw(dst, xtmp1, xtmp2, vec_enc);
5928 }
5929
5930 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5931 XMMRegister xtmp2, Register rtmp, int vec_enc) {
5932 switch(bt) {
5933 case T_LONG:
5934 vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5935 break;
5936 case T_INT:
5937 vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5938 break;
5939 case T_CHAR:
5940 case T_SHORT:
5941 vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5942 break;
5943 case T_BYTE:
5944 case T_BOOLEAN:
5945 vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5946 break;
5947 default:
5948 fatal("Unsupported type %s", type2name(bt));
5949 break;
5950 }
5951 }
5952
5953 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5954 KRegister mask, bool merge, int vec_enc) {
5955 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5956 switch(bt) {
5957 case T_LONG:
5958 assert(VM_Version::supports_avx512_vpopcntdq(), "");
5959 evpopcntq(dst, mask, src, merge, vec_enc);
5960 break;
5961 case T_INT:
5962 assert(VM_Version::supports_avx512_vpopcntdq(), "");
5963 evpopcntd(dst, mask, src, merge, vec_enc);
5964 break;
5965 case T_CHAR:
5966 case T_SHORT:
5967 assert(VM_Version::supports_avx512_bitalg(), "");
5968 evpopcntw(dst, mask, src, merge, vec_enc);
5969 break;
5970 case T_BYTE:
5971 case T_BOOLEAN:
5972 assert(VM_Version::supports_avx512_bitalg(), "");
5973 evpopcntb(dst, mask, src, merge, vec_enc);
5974 break;
5975 default:
5976 fatal("Unsupported type %s", type2name(bt));
5977 break;
5978 }
5979 }
5980
5981 // Bit reversal algorithm first reverses the bits of each byte followed by
5982 // a byte level reversal for multi-byte primitive types (short/int/long).
5983 // Algorithm performs a lookup table access to get reverse bit sequence
5984 // corresponding to a 4 bit value. Thus a reverse bit sequence for a byte
5985 // is obtained by swapping the reverse bit sequences of upper and lower
5986 // nibble of a byte.
5987 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5988 XMMRegister xtmp2, Register rtmp, int vec_enc) {
5989 if (VM_Version::supports_avx512vlbw()) {
5990
5991 // Get the reverse bit sequence of lower nibble of each byte.
5992 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
5993 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5994 evpandq(dst, xtmp2, src, vec_enc);
5995 vpshufb(dst, xtmp1, dst, vec_enc);
5996 vpsllq(dst, dst, 4, vec_enc);
5997
5998 // Get the reverse bit sequence of upper nibble of each byte.
5999 vpandn(xtmp2, xtmp2, src, vec_enc);
6000 vpsrlq(xtmp2, xtmp2, 4, vec_enc);
6001 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
6002
6003 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
6004 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
6005 evporq(xtmp2, dst, xtmp2, vec_enc);
6006 vector_reverse_byte(bt, dst, xtmp2, vec_enc);
6007
6008 } else if(vec_enc == Assembler::AVX_512bit) {
6009 // Shift based bit reversal.
6010 assert(bt == T_LONG || bt == T_INT, "");
6011
6012 // Swap lower and upper nibble of each byte.
6013 vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);
6014
6015 // Swap two least and most significant bits of each nibble.
6016 vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);
6017
6018 // Swap adjacent pair of bits.
6019 evmovdqul(xtmp1, k0, dst, true, vec_enc);
6020 vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);
6021
6022 evmovdqul(xtmp1, k0, dst, true, vec_enc);
6023 vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
6024 } else {
6025 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
6026 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
6027
6028 // Get the reverse bit sequence of lower nibble of each byte.
6029 vpand(dst, xtmp2, src, vec_enc);
6030 vpshufb(dst, xtmp1, dst, vec_enc);
6031 vpsllq(dst, dst, 4, vec_enc);
6032
6033 // Get the reverse bit sequence of upper nibble of each byte.
6034 vpandn(xtmp2, xtmp2, src, vec_enc);
6035 vpsrlq(xtmp2, xtmp2, 4, vec_enc);
6036 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
6037
6038 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
6039 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
6040 vpor(xtmp2, dst, xtmp2, vec_enc);
6041 vector_reverse_byte(bt, dst, xtmp2, vec_enc);
6042 }
6043 }
6044
6045 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
6046 XMMRegister xtmp, Register rscratch) {
6047 assert(VM_Version::supports_gfni(), "");
6048 assert(rscratch != noreg || always_reachable(mask), "missing");
6049
6050 // Galois field instruction based bit reversal based on following algorithm.
6051 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6052 vpbroadcastq(xtmp, mask, vec_enc, rscratch);
6053 vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
6054 vector_reverse_byte(bt, dst, xtmp, vec_enc);
6055 }
6056
6057 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
6058 XMMRegister xtmp1, Register rtmp, int vec_enc) {
6059 vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
6060 evpandq(dst, xtmp1, src, vec_enc);
6061 vpsllq(dst, dst, nbits, vec_enc);
6062 vpandn(xtmp1, xtmp1, src, vec_enc);
6063 vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
6064 evporq(dst, dst, xtmp1, vec_enc);
6065 }
6066
6067 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6068 XMMRegister xtmp2, Register rtmp, int vec_enc) {
6069 // Shift based bit reversal.
6070 assert(VM_Version::supports_evex(), "");
6071 switch(bt) {
6072 case T_LONG:
6073 // Swap upper and lower double word of each quad word.
6074 evprorq(xtmp1, k0, src, 32, true, vec_enc);
6075 evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
6076 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
6077 break;
6078 case T_INT:
6079 // Swap upper and lower word of each double word.
6080 evprord(xtmp1, k0, src, 16, true, vec_enc);
6081 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
6082 break;
6083 case T_CHAR:
6084 case T_SHORT:
6085 // Swap upper and lower byte of each word.
6086 vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
6087 break;
6088 case T_BYTE:
6089 evmovdquq(dst, k0, src, true, vec_enc);
6090 break;
6091 default:
6092 fatal("Unsupported type %s", type2name(bt));
6093 break;
6094 }
6095 }
6096
6097 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
6098 if (bt == T_BYTE) {
6099 if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
6100 evmovdquq(dst, k0, src, true, vec_enc);
6101 } else {
6102 vmovdqu(dst, src);
6103 }
6104 return;
6105 }
6106 // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
6107 // pre-computed shuffle indices.
6108 switch(bt) {
6109 case T_LONG:
6110 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
6111 break;
6112 case T_INT:
6113 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
6114 break;
6115 case T_CHAR:
6116 case T_SHORT:
6117 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
6118 break;
6119 default:
6120 fatal("Unsupported type %s", type2name(bt));
6121 break;
6122 }
6123 vpshufb(dst, src, dst, vec_enc);
6124 }
6125
6126 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6127 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6128 KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
6129 assert(is_integral_type(bt), "");
6130 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
6131 assert(VM_Version::supports_avx512cd(), "");
6132 switch(bt) {
6133 case T_LONG:
6134 evplzcntq(dst, ktmp, src, merge, vec_enc);
6135 break;
6136 case T_INT:
6137 evplzcntd(dst, ktmp, src, merge, vec_enc);
6138 break;
6139 case T_SHORT:
6140 vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
6141 vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
6142 evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
6143 vpunpckhwd(dst, xtmp1, src, vec_enc);
6144 evplzcntd(dst, ktmp, dst, merge, vec_enc);
6145 vpackusdw(dst, xtmp2, dst, vec_enc);
6146 break;
6147 case T_BYTE:
6148 // T1 = Compute leading zero counts of 4 LSB bits of each byte by
6149 // accessing the lookup table.
6150 // T2 = Compute leading zero counts of 4 MSB bits of each byte by
6151 // accessing the lookup table.
6152 // Add T1 to T2 if 4 MSB bits of byte are all zeros.
6153 assert(VM_Version::supports_avx512bw(), "");
6154 evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
6155 vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
6156 vpand(xtmp2, dst, src, vec_enc);
6157 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
6158 vpsrlw(xtmp3, src, 4, vec_enc);
6159 vpand(xtmp3, dst, xtmp3, vec_enc);
6160 vpshufb(dst, xtmp1, xtmp3, vec_enc);
6161 vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6162 evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
6163 evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
6164 break;
6165 default:
6166 fatal("Unsupported type %s", type2name(bt));
6167 break;
6168 }
6169 }
6170
6171 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6172 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6173 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
6174 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
6175 // T1 = Compute leading zero counts of 4 LSB bits of each byte by
6176 // accessing the lookup table.
6177 vpand(dst, xtmp2, src, vec_enc);
6178 vpshufb(dst, xtmp1, dst, vec_enc);
6179 // T2 = Compute leading zero counts of 4 MSB bits of each byte by
6180 // accessing the lookup table.
6181 vpsrlw(xtmp3, src, 4, vec_enc);
6182 vpand(xtmp3, xtmp2, xtmp3, vec_enc);
6183 vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
6184 // Add T1 to T2 if 4 MSB bits of byte are all zeros.
6185 vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6186 vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
6187 vpaddb(dst, dst, xtmp2, vec_enc);
6188 vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
6189 }
6190
6191 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6192 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6193 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6194 // Add zero counts of lower byte and upper byte of a word if
6195 // upper byte holds a zero value.
6196 vpsrlw(xtmp3, src, 8, vec_enc);
6197 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
6198 vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
6199 vpsllw(xtmp2, dst, 8, vec_enc);
6200 vpaddw(xtmp2, xtmp2, dst, vec_enc);
6201 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6202 vpsrlw(dst, dst, 8, vec_enc);
6203 }
6204
6205 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6206 XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
6207 // By converting the integer to a float, we can obtain the number of leading zeros based on the exponent of the float.
6208 // As the float exponent contains a bias of 127 for nonzero values, the bias must be removed before interpreting the
6209 // exponent as the leading zero count.
6210
6211 // Remove the bit to the right of the highest set bit ensuring that the conversion to float cannot round up to a higher
6212 // power of 2, which has a higher exponent than the input. This transformation is valid as only the highest set bit
6213 // contributes to the leading number of zeros.
6214 vpsrld(dst, src, 1, vec_enc);
6215 vpandn(dst, dst, src, vec_enc);
6216
6217 vcvtdq2ps(dst, dst, vec_enc);
6218
6219 // By comparing the register to itself, all the bits in the destination are set.
6220 vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);
6221
6222 // Move the biased exponent to the low end of the lane and mask with 0xFF to discard the sign bit.
6223 vpsrld(xtmp2, xtmp1, 24, vec_enc);
6224 vpsrld(dst, dst, 23, vec_enc);
6225 vpand(dst, xtmp2, dst, vec_enc);
6226
6227 // Subtract 127 from the exponent, which removes the bias from the exponent.
6228 vpsrld(xtmp2, xtmp1, 25, vec_enc);
6229 vpsubd(dst, dst, xtmp2, vec_enc);
6230
6231 vpsrld(xtmp2, xtmp1, 27, vec_enc);
6232
6233 // If the original value is 0 the exponent would not have bias, so the subtraction creates a negative number. If this
6234 // is found in any of the lanes, replace the lane with -1 from xtmp1.
6235 vblendvps(dst, dst, xtmp1, dst, vec_enc, true, xtmp3);
6236
6237 // If the original value is negative, replace the lane with 31.
6238 vblendvps(dst, dst, xtmp2, src, vec_enc, true, xtmp3);
6239
6240 // Subtract the exponent from 31, giving the final result. For 0, the result is 32 as the exponent was replaced with -1,
6241 // and for negative numbers the result is 0 as the exponent was replaced with 31.
6242 vpsubd(dst, xtmp2, dst, vec_enc);
6243 }
6244
6245 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6246 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6247 // Find the leading zeros of the top and bottom halves of the long individually.
6248 vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
6249
6250 // Move the top half result to the bottom half of xtmp1, setting the top half to 0.
6251 vpsrlq(xtmp1, dst, 32, vec_enc);
6252 // By moving the top half result to the right by 6 bits, if the top half was empty (i.e. 32 is returned) the result bit will
6253 // be in the most significant position of the bottom half.
6254 vpsrlq(xtmp2, dst, 6, vec_enc);
6255
6256 // In the bottom half, add the top half and bottom half results.
6257 vpaddq(dst, xtmp1, dst, vec_enc);
6258
6259 // For the bottom half, choose between the values using the most significant bit of xtmp2.
6260 // If the MSB is set, then bottom+top in dst is the resulting value. If the top half is less than 32 xtmp1 is chosen,
6261 // which contains only the top half result.
6262 // In the top half the MSB is always zero, so the value in xtmp1 is always chosen. This value is always 0, which clears
6263 // the lane as required.
6264 vblendvps(dst, xtmp1, dst, xtmp2, vec_enc, true, xtmp3);
6265 }
6266
6267 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
6268 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6269 Register rtmp, int vec_enc) {
6270 assert(is_integral_type(bt), "unexpected type");
6271 assert(vec_enc < Assembler::AVX_512bit, "");
6272 switch(bt) {
6273 case T_LONG:
6274 vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6275 break;
6276 case T_INT:
6277 vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
6278 break;
6279 case T_SHORT:
6280 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6281 break;
6282 case T_BYTE:
6283 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6284 break;
6285 default:
6286 fatal("Unsupported type %s", type2name(bt));
6287 break;
6288 }
6289 }
6290
6291 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
6292 switch(bt) {
6293 case T_BYTE:
6294 vpsubb(dst, src1, src2, vec_enc);
6295 break;
6296 case T_SHORT:
6297 vpsubw(dst, src1, src2, vec_enc);
6298 break;
6299 case T_INT:
6300 vpsubd(dst, src1, src2, vec_enc);
6301 break;
6302 case T_LONG:
6303 vpsubq(dst, src1, src2, vec_enc);
6304 break;
6305 default:
6306 fatal("Unsupported type %s", type2name(bt));
6307 break;
6308 }
6309 }
6310
6311 // Trailing zero count computation is based on leading zero count operation as per
6312 // following equation. All AVX3 targets support AVX512CD feature which offers
6313 // direct vector instruction to compute leading zero count.
6314 // CTZ = PRIM_TYPE_WIDHT - CLZ((x - 1) & ~x)
6315 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6316 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6317 XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
6318 assert(is_integral_type(bt), "");
6319 // xtmp = -1
6320 vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
6321 // xtmp = xtmp + src
6322 vpadd(bt, xtmp4, xtmp4, src, vec_enc);
6323 // xtmp = xtmp & ~src
6324 vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
6325 vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
6326 vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
6327 vpsub(bt, dst, xtmp4, dst, vec_enc);
6328 }
6329
6330 // Trailing zero count computation for AVX2 targets is based on popcount operation as per following equation
6331 // CTZ = PRIM_TYPE_WIDHT - POPC(x | -x)
6332 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6333 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6334 assert(is_integral_type(bt), "");
6335 // xtmp = 0
6336 vpxor(xtmp3 , xtmp3, xtmp3, vec_enc);
6337 // xtmp = 0 - src
6338 vpsub(bt, xtmp3, xtmp3, src, vec_enc);
6339 // xtmp = xtmp | src
6340 vpor(xtmp3, xtmp3, src, vec_enc);
6341 vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
6342 vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
6343 vpsub(bt, dst, xtmp1, dst, vec_enc);
6344 }
6345
6346 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
6347 Label done;
6348 Label neg_divisor_fastpath;
6349 cmpl(divisor, 0);
6350 jccb(Assembler::less, neg_divisor_fastpath);
6351 xorl(rdx, rdx);
6352 divl(divisor);
6353 jmpb(done);
6354 bind(neg_divisor_fastpath);
6355 // Fastpath for divisor < 0:
6356 // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6357 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
6358 movl(rdx, rax);
6359 subl(rdx, divisor);
6360 if (VM_Version::supports_bmi1() && VM_Version::supports_avx()) {
6361 andnl(rax, rdx, rax);
6362 } else {
6363 notl(rdx);
6364 andl(rax, rdx);
6365 }
6366 shrl(rax, 31);
6367 bind(done);
6368 }
6369
6370 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
6371 Label done;
6372 Label neg_divisor_fastpath;
6373 cmpl(divisor, 0);
6374 jccb(Assembler::less, neg_divisor_fastpath);
6375 xorl(rdx, rdx);
6376 divl(divisor);
6377 jmpb(done);
6378 bind(neg_divisor_fastpath);
6379 // Fastpath when divisor < 0:
6380 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6381 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6382 movl(rdx, rax);
6383 subl(rax, divisor);
6384 if (VM_Version::supports_bmi1() && VM_Version::supports_avx()) {
6385 andnl(rax, rax, rdx);
6386 } else {
6387 notl(rax);
6388 andl(rax, rdx);
6389 }
6390 sarl(rax, 31);
6391 andl(rax, divisor);
6392 subl(rdx, rax);
6393 bind(done);
6394 }
6395
6396 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
6397 Label done;
6398 Label neg_divisor_fastpath;
6399
6400 cmpl(divisor, 0);
6401 jccb(Assembler::less, neg_divisor_fastpath);
6402 xorl(rdx, rdx);
6403 divl(divisor);
6404 jmpb(done);
6405 bind(neg_divisor_fastpath);
6406 // Fastpath for divisor < 0:
6407 // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6408 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6409 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6410 // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6411 movl(rdx, rax);
6412 subl(rax, divisor);
6413 if (VM_Version::supports_bmi1() && VM_Version::supports_avx()) {
6414 andnl(rax, rax, rdx);
6415 } else {
6416 notl(rax);
6417 andl(rax, rdx);
6418 }
6419 movl(tmp, rax);
6420 shrl(rax, 31); // quotient
6421 sarl(tmp, 31);
6422 andl(tmp, divisor);
6423 subl(rdx, tmp); // remainder
6424 bind(done);
6425 }
6426
6427 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
6428 XMMRegister xtmp2, Register rtmp) {
6429 if(VM_Version::supports_gfni()) {
6430 // Galois field instruction based bit reversal based on following algorithm.
6431 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6432 mov64(rtmp, 0x8040201008040201L);
6433 movq(xtmp1, src);
6434 movq(xtmp2, rtmp);
6435 gf2p8affineqb(xtmp1, xtmp2, 0);
6436 movq(dst, xtmp1);
6437 } else {
6438 // Swap even and odd numbered bits.
6439 movl(rtmp, src);
6440 andl(rtmp, 0x55555555);
6441 shll(rtmp, 1);
6442 movl(dst, src);
6443 andl(dst, 0xAAAAAAAA);
6444 shrl(dst, 1);
6445 orl(dst, rtmp);
6446
6447 // Swap LSB and MSB 2 bits of each nibble.
6448 movl(rtmp, dst);
6449 andl(rtmp, 0x33333333);
6450 shll(rtmp, 2);
6451 andl(dst, 0xCCCCCCCC);
6452 shrl(dst, 2);
6453 orl(dst, rtmp);
6454
6455 // Swap LSB and MSB 4 bits of each byte.
6456 movl(rtmp, dst);
6457 andl(rtmp, 0x0F0F0F0F);
6458 shll(rtmp, 4);
6459 andl(dst, 0xF0F0F0F0);
6460 shrl(dst, 4);
6461 orl(dst, rtmp);
6462 }
6463 bswapl(dst);
6464 }
6465
6466 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
6467 XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
6468 if(VM_Version::supports_gfni()) {
6469 // Galois field instruction based bit reversal based on following algorithm.
6470 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6471 mov64(rtmp1, 0x8040201008040201L);
6472 movq(xtmp1, src);
6473 movq(xtmp2, rtmp1);
6474 gf2p8affineqb(xtmp1, xtmp2, 0);
6475 movq(dst, xtmp1);
6476 } else {
6477 // Swap even and odd numbered bits.
6478 movq(rtmp1, src);
6479 mov64(rtmp2, 0x5555555555555555L);
6480 andq(rtmp1, rtmp2);
6481 shlq(rtmp1, 1);
6482 movq(dst, src);
6483 notq(rtmp2);
6484 andq(dst, rtmp2);
6485 shrq(dst, 1);
6486 orq(dst, rtmp1);
6487
6488 // Swap LSB and MSB 2 bits of each nibble.
6489 movq(rtmp1, dst);
6490 mov64(rtmp2, 0x3333333333333333L);
6491 andq(rtmp1, rtmp2);
6492 shlq(rtmp1, 2);
6493 notq(rtmp2);
6494 andq(dst, rtmp2);
6495 shrq(dst, 2);
6496 orq(dst, rtmp1);
6497
6498 // Swap LSB and MSB 4 bits of each byte.
6499 movq(rtmp1, dst);
6500 mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
6501 andq(rtmp1, rtmp2);
6502 shlq(rtmp1, 4);
6503 notq(rtmp2);
6504 andq(dst, rtmp2);
6505 shrq(dst, 4);
6506 orq(dst, rtmp1);
6507 }
6508 bswapq(dst);
6509 }
6510
6511 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
6512 Label done;
6513 Label neg_divisor_fastpath;
6514 cmpq(divisor, 0);
6515 jccb(Assembler::less, neg_divisor_fastpath);
6516 xorl(rdx, rdx);
6517 divq(divisor);
6518 jmpb(done);
6519 bind(neg_divisor_fastpath);
6520 // Fastpath for divisor < 0:
6521 // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6522 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
6523 movq(rdx, rax);
6524 subq(rdx, divisor);
6525 if (VM_Version::supports_bmi1() && VM_Version::supports_avx()) {
6526 andnq(rax, rdx, rax);
6527 } else {
6528 notq(rdx);
6529 andq(rax, rdx);
6530 }
6531 shrq(rax, 63);
6532 bind(done);
6533 }
6534
6535 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
6536 Label done;
6537 Label neg_divisor_fastpath;
6538 cmpq(divisor, 0);
6539 jccb(Assembler::less, neg_divisor_fastpath);
6540 xorq(rdx, rdx);
6541 divq(divisor);
6542 jmp(done);
6543 bind(neg_divisor_fastpath);
6544 // Fastpath when divisor < 0:
6545 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6546 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6547 movq(rdx, rax);
6548 subq(rax, divisor);
6549 if (VM_Version::supports_bmi1() && VM_Version::supports_avx()) {
6550 andnq(rax, rax, rdx);
6551 } else {
6552 notq(rax);
6553 andq(rax, rdx);
6554 }
6555 sarq(rax, 63);
6556 andq(rax, divisor);
6557 subq(rdx, rax);
6558 bind(done);
6559 }
6560
6561 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
6562 Label done;
6563 Label neg_divisor_fastpath;
6564 cmpq(divisor, 0);
6565 jccb(Assembler::less, neg_divisor_fastpath);
6566 xorq(rdx, rdx);
6567 divq(divisor);
6568 jmp(done);
6569 bind(neg_divisor_fastpath);
6570 // Fastpath for divisor < 0:
6571 // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6572 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6573 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6574 // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6575 movq(rdx, rax);
6576 subq(rax, divisor);
6577 if (VM_Version::supports_bmi1() && VM_Version::supports_avx()) {
6578 andnq(rax, rax, rdx);
6579 } else {
6580 notq(rax);
6581 andq(rax, rdx);
6582 }
6583 movq(tmp, rax);
6584 shrq(rax, 63); // quotient
6585 sarq(tmp, 63);
6586 andq(tmp, divisor);
6587 subq(rdx, tmp); // remainder
6588 bind(done);
6589 }
6590
6591 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
6592 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
6593 int vlen_enc) {
6594 assert(VM_Version::supports_avx512bw(), "");
6595 // Byte shuffles are inlane operations and indices are determined using
6596 // lower 4 bit of each shuffle lane, thus all shuffle indices are
6597 // normalized to index range 0-15. This makes sure that all the multiples
6598 // of an index value are placed at same relative position in 128 bit
6599 // lane i.e. elements corresponding to shuffle indices 16, 32 and 64
6600 // will be 16th element in their respective 128 bit lanes.
6601 movl(rtmp, 16);
6602 evpbroadcastb(xtmp1, rtmp, vlen_enc);
6603
6604 // Compute a mask for shuffle vector by comparing indices with expression INDEX < 16,
6605 // Broadcast first 128 bit lane across entire vector, shuffle the vector lanes using
6606 // original shuffle indices and move the shuffled lanes corresponding to true
6607 // mask to destination vector.
6608 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
6609 evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
6610 evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
6611
6612 // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
6613 // and broadcasting second 128 bit lane.
6614 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
6615 vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
6616 evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
6617 evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
6618 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6619
6620 // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
6621 // and broadcasting third 128 bit lane.
6622 evpcmpb(ktmp, k0, shuffle, xtmp2, Assembler::nlt, true, vlen_enc);
6623 vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
6624 evpcmpb(ktmp, ktmp, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
6625 evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
6626 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6627
6628 // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
6629 // and broadcasting third 128 bit lane.
6630 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
6631 vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
6632 evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
6633 evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
6634 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6635 }
6636
6637 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
6638 XMMRegister shuffle, XMMRegister src, int vlen_enc) {
6639 if (vlen_enc == AVX_128bit) {
6640 vpermilps(dst, src, shuffle, vlen_enc);
6641 } else if (bt == T_INT) {
6642 vpermd(dst, shuffle, src, vlen_enc);
6643 } else {
6644 assert(bt == T_FLOAT, "");
6645 vpermps(dst, shuffle, src, vlen_enc);
6646 }
6647 }
6648
6649 void C2_MacroAssembler::efp16sh(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
6650 switch(opcode) {
6651 case Op_AddHF: vaddsh(dst, src1, src2); break;
6652 case Op_SubHF: vsubsh(dst, src1, src2); break;
6653 case Op_MulHF: vmulsh(dst, src1, src2); break;
6654 case Op_DivHF: vdivsh(dst, src1, src2); break;
6655 default: assert(false, "%s", NodeClassNames[opcode]); break;
6656 }
6657 }
6658
6659 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6660 switch(elem_bt) {
6661 case T_BYTE:
6662 if (ideal_opc == Op_SaturatingAddV) {
6663 vpaddsb(dst, src1, src2, vlen_enc);
6664 } else {
6665 assert(ideal_opc == Op_SaturatingSubV, "");
6666 vpsubsb(dst, src1, src2, vlen_enc);
6667 }
6668 break;
6669 case T_SHORT:
6670 if (ideal_opc == Op_SaturatingAddV) {
6671 vpaddsw(dst, src1, src2, vlen_enc);
6672 } else {
6673 assert(ideal_opc == Op_SaturatingSubV, "");
6674 vpsubsw(dst, src1, src2, vlen_enc);
6675 }
6676 break;
6677 default:
6678 fatal("Unsupported type %s", type2name(elem_bt));
6679 break;
6680 }
6681 }
6682
6683 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6684 switch(elem_bt) {
6685 case T_BYTE:
6686 if (ideal_opc == Op_SaturatingAddV) {
6687 vpaddusb(dst, src1, src2, vlen_enc);
6688 } else {
6689 assert(ideal_opc == Op_SaturatingSubV, "");
6690 vpsubusb(dst, src1, src2, vlen_enc);
6691 }
6692 break;
6693 case T_SHORT:
6694 if (ideal_opc == Op_SaturatingAddV) {
6695 vpaddusw(dst, src1, src2, vlen_enc);
6696 } else {
6697 assert(ideal_opc == Op_SaturatingSubV, "");
6698 vpsubusw(dst, src1, src2, vlen_enc);
6699 }
6700 break;
6701 default:
6702 fatal("Unsupported type %s", type2name(elem_bt));
6703 break;
6704 }
6705 }
6706
6707 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6708 XMMRegister src2, KRegister ktmp, int vlen_enc) {
6709 // For unsigned subtraction, overflow happens when magnitude of second input is greater than first input.
6710 // overflow_mask = Inp1 <u Inp2
6711 evpcmpu(elem_bt, ktmp, src2, src1, Assembler::lt, vlen_enc);
6712 // Res = overflow_mask ? Zero : INP1 - INP2 (non-commutative and non-associative)
6713 evmasked_op(elem_bt == T_INT ? Op_SubVI : Op_SubVL, elem_bt, ktmp, dst, src1, src2, false, vlen_enc, false);
6714 }
6715
6716 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6717 XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
6718 // Emulate unsigned comparison using signed comparison
6719 // Mask = Inp1 <u Inp2 => Inp1 + MIN_VALUE < Inp2 + MIN_VALUE
6720 vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc, true);
6721 vpadd(elem_bt, xtmp2, src1, xtmp1, vlen_enc);
6722 vpadd(elem_bt, xtmp1, src2, xtmp1, vlen_enc);
6723
6724 vpcmpgt(elem_bt, xtmp2, xtmp1, xtmp2, vlen_enc);
6725
6726 // Res = INP1 - INP2 (non-commutative and non-associative)
6727 vpsub(elem_bt, dst, src1, src2, vlen_enc);
6728 // Res = Mask ? Zero : Res
6729 vpxor(xtmp1, xtmp1, xtmp1, vlen_enc);
6730 vpblendvb(dst, dst, xtmp1, xtmp2, vlen_enc);
6731 }
6732
6733 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6734 XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp, int vlen_enc) {
6735 // Unsigned values ranges comprise of only +ve numbers, thus there exist only an upper bound saturation.
6736 // overflow_mask = (SRC1 + SRC2) <u (SRC1 | SRC2)
6737 // Res = Signed Add INP1, INP2
6738 vpadd(elem_bt, dst, src1, src2, vlen_enc);
6739 // T1 = SRC1 | SRC2
6740 vpor(xtmp1, src1, src2, vlen_enc);
6741 // Max_Unsigned = -1
6742 vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6743 // Unsigned compare: Mask = Res <u T1
6744 evpcmpu(elem_bt, ktmp, dst, xtmp1, Assembler::lt, vlen_enc);
6745 // res = Mask ? Max_Unsigned : Res
6746 evpblend(elem_bt, dst, ktmp, dst, xtmp2, true, vlen_enc);
6747 }
6748
6749 //
6750 // Section 2-13 Hacker's Delight list following overflow detection check for saturating
6751 // unsigned addition operation.
6752 // overflow_mask = ((a & b) | ((a | b) & ~( a + b))) >>> 31 == 1
6753 //
6754 // We empirically determined its semantic equivalence to following reduced expression
6755 // overflow_mask = (a + b) <u (a | b)
6756 //
6757 // and also verified it though Alive2 solver.
6758 // (https://alive2.llvm.org/ce/z/XDQ7dY)
6759 //
6760
6761 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6762 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, int vlen_enc) {
6763 // Res = Signed Add INP1, INP2
6764 vpadd(elem_bt, dst, src1, src2, vlen_enc);
6765 // Compute T1 = INP1 | INP2
6766 vpor(xtmp3, src1, src2, vlen_enc);
6767 // T1 = Minimum signed value.
6768 vpgenmin_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
6769 // Convert T1 to signed value, T1 = T1 + MIN_VALUE
6770 vpadd(elem_bt, xtmp3, xtmp3, xtmp2, vlen_enc);
6771 // Convert Res to signed value, Res<s> = Res + MIN_VALUE
6772 vpadd(elem_bt, xtmp2, xtmp2, dst, vlen_enc);
6773 // Compute overflow detection mask = Res<1> <s T1
6774 if (elem_bt == T_INT) {
6775 vpcmpgtd(xtmp3, xtmp3, xtmp2, vlen_enc);
6776 } else {
6777 assert(elem_bt == T_LONG, "");
6778 vpcmpgtq(xtmp3, xtmp3, xtmp2, vlen_enc);
6779 }
6780 vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
6781 }
6782
6783 void C2_MacroAssembler::evpmovq2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
6784 int vlen_enc, bool xtmp2_hold_M1) {
6785 if (VM_Version::supports_avx512dq()) {
6786 evpmovq2m(ktmp, src, vlen_enc);
6787 } else {
6788 assert(VM_Version::supports_evex(), "");
6789 if (!xtmp2_hold_M1) {
6790 vpternlogq(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6791 }
6792 evpsraq(xtmp1, src, 63, vlen_enc);
6793 evpcmpeqq(ktmp, k0, xtmp1, xtmp2, vlen_enc);
6794 }
6795 }
6796
6797 void C2_MacroAssembler::evpmovd2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
6798 int vlen_enc, bool xtmp2_hold_M1) {
6799 if (VM_Version::supports_avx512dq()) {
6800 evpmovd2m(ktmp, src, vlen_enc);
6801 } else {
6802 assert(VM_Version::supports_evex(), "");
6803 if (!xtmp2_hold_M1) {
6804 vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6805 }
6806 vpsrad(xtmp1, src, 31, vlen_enc);
6807 Assembler::evpcmpeqd(ktmp, k0, xtmp1, xtmp2, vlen_enc);
6808 }
6809 }
6810
6811
6812 void C2_MacroAssembler::vpsign_extend_dq(BasicType elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
6813 if (elem_bt == T_LONG) {
6814 if (VM_Version::supports_evex()) {
6815 evpsraq(dst, src, 63, vlen_enc);
6816 } else {
6817 vpsrad(dst, src, 31, vlen_enc);
6818 vpshufd(dst, dst, 0xF5, vlen_enc);
6819 }
6820 } else {
6821 assert(elem_bt == T_INT, "");
6822 vpsrad(dst, src, 31, vlen_enc);
6823 }
6824 }
6825
6826 void C2_MacroAssembler::vpgenmax_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
6827 if (compute_allones) {
6828 if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
6829 vpternlogd(allones, 0xff, allones, allones, vlen_enc);
6830 } else {
6831 vpcmpeqq(allones, allones, allones, vlen_enc);
6832 }
6833 }
6834 if (elem_bt == T_LONG) {
6835 vpsrlq(dst, allones, 1, vlen_enc);
6836 } else {
6837 assert(elem_bt == T_INT, "");
6838 vpsrld(dst, allones, 1, vlen_enc);
6839 }
6840 }
6841
6842 void C2_MacroAssembler::vpgenmin_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
6843 if (compute_allones) {
6844 if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
6845 vpternlogd(allones, 0xff, allones, allones, vlen_enc);
6846 } else {
6847 vpcmpeqq(allones, allones, allones, vlen_enc);
6848 }
6849 }
6850 if (elem_bt == T_LONG) {
6851 vpsllq(dst, allones, 63, vlen_enc);
6852 } else {
6853 assert(elem_bt == T_INT, "");
6854 vpslld(dst, allones, 31, vlen_enc);
6855 }
6856 }
6857
6858 void C2_MacroAssembler::evpcmpu(BasicType elem_bt, KRegister kmask, XMMRegister src1, XMMRegister src2,
6859 Assembler::ComparisonPredicate cond, int vlen_enc) {
6860 switch(elem_bt) {
6861 case T_LONG: evpcmpuq(kmask, src1, src2, cond, vlen_enc); break;
6862 case T_INT: evpcmpud(kmask, src1, src2, cond, vlen_enc); break;
6863 case T_SHORT: evpcmpuw(kmask, src1, src2, cond, vlen_enc); break;
6864 case T_BYTE: evpcmpub(kmask, src1, src2, cond, vlen_enc); break;
6865 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6866 }
6867 }
6868
6869 void C2_MacroAssembler::vpcmpgt(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6870 switch(elem_bt) {
6871 case T_LONG: vpcmpgtq(dst, src1, src2, vlen_enc); break;
6872 case T_INT: vpcmpgtd(dst, src1, src2, vlen_enc); break;
6873 case T_SHORT: vpcmpgtw(dst, src1, src2, vlen_enc); break;
6874 case T_BYTE: vpcmpgtb(dst, src1, src2, vlen_enc); break;
6875 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6876 }
6877 }
6878
6879 void C2_MacroAssembler::evpmov_vec_to_mask(BasicType elem_bt, KRegister ktmp, XMMRegister src, XMMRegister xtmp1,
6880 XMMRegister xtmp2, int vlen_enc, bool xtmp2_hold_M1) {
6881 if (elem_bt == T_LONG) {
6882 evpmovq2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6883 } else {
6884 assert(elem_bt == T_INT, "");
6885 evpmovd2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6886 }
6887 }
6888
6889 void C2_MacroAssembler::vector_addsub_dq_saturating_evex(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6890 XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
6891 KRegister ktmp1, KRegister ktmp2, int vlen_enc) {
6892 assert(elem_bt == T_INT || elem_bt == T_LONG, "");
6893 // Addition/Subtraction happens over two's compliment representation of numbers and is agnostic to signed'ness.
6894 // Overflow detection based on Hacker's delight section 2-13.
6895 if (ideal_opc == Op_SaturatingAddV) {
6896 // res = src1 + src2
6897 vpadd(elem_bt, dst, src1, src2, vlen_enc);
6898 // Overflow occurs if result polarity does not comply with equivalent polarity inputs.
6899 // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
6900 vpxor(xtmp1, dst, src1, vlen_enc);
6901 vpxor(xtmp2, dst, src2, vlen_enc);
6902 vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6903 } else {
6904 assert(ideal_opc == Op_SaturatingSubV, "");
6905 // res = src1 - src2
6906 vpsub(elem_bt, dst, src1, src2, vlen_enc);
6907 // Overflow occurs when both inputs have opposite polarity and
6908 // result polarity does not comply with first input polarity.
6909 // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
6910 vpxor(xtmp1, src1, src2, vlen_enc);
6911 vpxor(xtmp2, dst, src1, vlen_enc);
6912 vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6913 }
6914
6915 // Compute overflow detection mask.
6916 evpmov_vec_to_mask(elem_bt, ktmp1, xtmp2, xtmp2, xtmp1, vlen_enc);
6917 // Note: xtmp1 hold -1 in all its lanes after above call.
6918
6919 // Compute mask based on first input polarity.
6920 evpmov_vec_to_mask(elem_bt, ktmp2, src1, xtmp2, xtmp1, vlen_enc, true);
6921
6922 vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
6923 vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);
6924
6925 // Compose a vector of saturating (MAX/MIN) values, where lanes corresponding to
6926 // set bits in first input polarity mask holds a min value.
6927 evpblend(elem_bt, xtmp2, ktmp2, xtmp2, xtmp1, true, vlen_enc);
6928 // Blend destination lanes with saturated values using overflow detection mask.
6929 evpblend(elem_bt, dst, ktmp1, dst, xtmp2, true, vlen_enc);
6930 }
6931
6932
6933 void C2_MacroAssembler::vector_addsub_dq_saturating_avx(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6934 XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
6935 XMMRegister xtmp3, XMMRegister xtmp4, int vlen_enc) {
6936 assert(elem_bt == T_INT || elem_bt == T_LONG, "");
6937 // Addition/Subtraction happens over two's compliment representation of numbers and is agnostic to signed'ness.
6938 // Overflow detection based on Hacker's delight section 2-13.
6939 if (ideal_opc == Op_SaturatingAddV) {
6940 // res = src1 + src2
6941 vpadd(elem_bt, dst, src1, src2, vlen_enc);
6942 // Overflow occurs if result polarity does not comply with equivalent polarity inputs.
6943 // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
6944 vpxor(xtmp1, dst, src1, vlen_enc);
6945 vpxor(xtmp2, dst, src2, vlen_enc);
6946 vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6947 } else {
6948 assert(ideal_opc == Op_SaturatingSubV, "");
6949 // res = src1 - src2
6950 vpsub(elem_bt, dst, src1, src2, vlen_enc);
6951 // Overflow occurs when both inputs have opposite polarity and
6952 // result polarity does not comply with first input polarity.
6953 // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
6954 vpxor(xtmp1, src1, src2, vlen_enc);
6955 vpxor(xtmp2, dst, src1, vlen_enc);
6956 vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6957 }
6958
6959 // Sign-extend to compute overflow detection mask.
6960 vpsign_extend_dq(elem_bt, xtmp3, xtmp2, vlen_enc);
6961
6962 vpcmpeqd(xtmp1, xtmp1, xtmp1, vlen_enc);
6963 vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc);
6964 vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);
6965
6966 // Compose saturating min/max vector using first input polarity mask.
6967 vpsign_extend_dq(elem_bt, xtmp4, src1, vlen_enc);
6968 vpblendvb(xtmp1, xtmp2, xtmp1, xtmp4, vlen_enc);
6969
6970 // Blend result with saturating vector using overflow detection mask.
6971 vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
6972 }
6973
6974 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
6975 switch(elem_bt) {
6976 case T_BYTE:
6977 if (ideal_opc == Op_SaturatingAddV) {
6978 vpaddsb(dst, src1, src2, vlen_enc);
6979 } else {
6980 assert(ideal_opc == Op_SaturatingSubV, "");
6981 vpsubsb(dst, src1, src2, vlen_enc);
6982 }
6983 break;
6984 case T_SHORT:
6985 if (ideal_opc == Op_SaturatingAddV) {
6986 vpaddsw(dst, src1, src2, vlen_enc);
6987 } else {
6988 assert(ideal_opc == Op_SaturatingSubV, "");
6989 vpsubsw(dst, src1, src2, vlen_enc);
6990 }
6991 break;
6992 default:
6993 fatal("Unsupported type %s", type2name(elem_bt));
6994 break;
6995 }
6996 }
6997
6998 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
6999 switch(elem_bt) {
7000 case T_BYTE:
7001 if (ideal_opc == Op_SaturatingAddV) {
7002 vpaddusb(dst, src1, src2, vlen_enc);
7003 } else {
7004 assert(ideal_opc == Op_SaturatingSubV, "");
7005 vpsubusb(dst, src1, src2, vlen_enc);
7006 }
7007 break;
7008 case T_SHORT:
7009 if (ideal_opc == Op_SaturatingAddV) {
7010 vpaddusw(dst, src1, src2, vlen_enc);
7011 } else {
7012 assert(ideal_opc == Op_SaturatingSubV, "");
7013 vpsubusw(dst, src1, src2, vlen_enc);
7014 }
7015 break;
7016 default:
7017 fatal("Unsupported type %s", type2name(elem_bt));
7018 break;
7019 }
7020 }
7021
7022 void C2_MacroAssembler::select_from_two_vectors_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
7023 XMMRegister src2, int vlen_enc) {
7024 switch(elem_bt) {
7025 case T_BYTE:
7026 evpermi2b(dst, src1, src2, vlen_enc);
7027 break;
7028 case T_SHORT:
7029 evpermi2w(dst, src1, src2, vlen_enc);
7030 break;
7031 case T_INT:
7032 evpermi2d(dst, src1, src2, vlen_enc);
7033 break;
7034 case T_LONG:
7035 evpermi2q(dst, src1, src2, vlen_enc);
7036 break;
7037 case T_FLOAT:
7038 evpermi2ps(dst, src1, src2, vlen_enc);
7039 break;
7040 case T_DOUBLE:
7041 evpermi2pd(dst, src1, src2, vlen_enc);
7042 break;
7043 default:
7044 fatal("Unsupported type %s", type2name(elem_bt));
7045 break;
7046 }
7047 }
7048
7049 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, bool is_unsigned, int vlen_enc) {
7050 if (is_unsigned) {
7051 vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7052 } else {
7053 vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7054 }
7055 }
7056
7057 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, bool is_unsigned, int vlen_enc) {
7058 if (is_unsigned) {
7059 vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7060 } else {
7061 vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7062 }
7063 }
7064
7065 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
7066 switch(opcode) {
7067 case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
7068 case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
7069 case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
7070 case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
7071 default: assert(false, "%s", NodeClassNames[opcode]); break;
7072 }
7073 }
7074
7075 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
7076 switch(opcode) {
7077 case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
7078 case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
7079 case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
7080 case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
7081 default: assert(false, "%s", NodeClassNames[opcode]); break;
7082 }
7083 }
7084
7085 void C2_MacroAssembler::sminmax_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
7086 KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2) {
7087 vminmax_fp16(opcode, dst, src1, src2, ktmp, xtmp1, xtmp2, Assembler::AVX_128bit);
7088 }
7089
7090 void C2_MacroAssembler::sminmax_fp16_avx10_2(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
7091 KRegister ktmp) {
7092 if (opcode == Op_MaxHF) {
7093 // dst = max(src1, src2)
7094 evminmaxsh(dst, ktmp, src1, src2, true, AVX10_2_MINMAX_MAX_COMPARE_SIGN);
7095 } else {
7096 assert(opcode == Op_MinHF, "");
7097 // dst = min(src1, src2)
7098 evminmaxsh(dst, ktmp, src1, src2, true, AVX10_2_MINMAX_MIN_COMPARE_SIGN);
7099 }
7100 }
7101
7102 void C2_MacroAssembler::vminmax_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
7103 KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
7104 if (opcode == Op_MaxVHF || opcode == Op_MaxHF) {
7105 // Move sign bits of src2 to mask register.
7106 evpmovw2m(ktmp, src2, vlen_enc);
7107 // xtmp1 = src2 < 0 ? src2 : src1
7108 evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
7109 // xtmp2 = src2 < 0 ? ? src1 : src2
7110 evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
7111 // Idea behind above swapping is to make seconds source operand a +ve value.
7112 // As per instruction semantic, if the values being compared are both 0.0s (of either sign), the value in
7113 // the second source operand is returned. If only one value is a NaN (SNaN or QNaN) for this instruction,
7114 // the second source operand, either a NaN or a valid floating-point value, is returned
7115 // dst = max(xtmp1, xtmp2)
7116 evmaxph(dst, xtmp1, xtmp2, vlen_enc);
7117 // isNaN = is_unordered_quiet(xtmp1)
7118 evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
7119 // Final result is same as first source if its a NaN value,
7120 // in case second operand holds a NaN value then as per above semantics
7121 // result is same as second operand.
7122 Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
7123 } else {
7124 assert(opcode == Op_MinVHF || opcode == Op_MinHF, "");
7125 // Move sign bits of src1 to mask register.
7126 evpmovw2m(ktmp, src1, vlen_enc);
7127 // xtmp1 = src1 < 0 ? src2 : src1
7128 evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
7129 // xtmp2 = src1 < 0 ? src1 : src2
7130 evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
7131 // Idea behind above swapping is to make seconds source operand a -ve value.
7132 // As per instruction semantics, if the values being compared are both 0.0s (of either sign), the value in
7133 // the second source operand is returned.
7134 // If only one value is a NaN (SNaN or QNaN) for this instruction, the second source operand, either a NaN
7135 // or a valid floating-point value, is written to the result.
7136 // dst = min(xtmp1, xtmp2)
7137 evminph(dst, xtmp1, xtmp2, vlen_enc);
7138 // isNaN = is_unordered_quiet(xtmp1)
7139 evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
7140 // Final result is same as first source if its a NaN value,
7141 // in case second operand holds a NaN value then as per above semantics
7142 // result is same as second operand.
7143 Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
7144 }
7145 }
7146
7147 void C2_MacroAssembler::vminmax_fp16_avx10_2(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
7148 KRegister ktmp, int vlen_enc) {
7149 if (opcode == Op_MaxVHF) {
7150 // dst = max(src1, src2)
7151 evminmaxph(dst, ktmp, src1, src2, true, AVX10_2_MINMAX_MAX_COMPARE_SIGN, vlen_enc);
7152 } else {
7153 assert(opcode == Op_MinVHF, "");
7154 // dst = min(src1, src2)
7155 evminmaxph(dst, ktmp, src1, src2, true, AVX10_2_MINMAX_MIN_COMPARE_SIGN, vlen_enc);
7156 }
7157 }
7158
7159 void C2_MacroAssembler::vminmax_fp16_avx10_2(int opcode, XMMRegister dst, XMMRegister src1, Address src2,
7160 KRegister ktmp, int vlen_enc) {
7161 if (opcode == Op_MaxVHF) {
7162 // dst = max(src1, src2)
7163 evminmaxph(dst, ktmp, src1, src2, true, AVX10_2_MINMAX_MAX_COMPARE_SIGN, vlen_enc);
7164 } else {
7165 assert(opcode == Op_MinVHF, "");
7166 // dst = min(src1, src2)
7167 evminmaxph(dst, ktmp, src1, src2, true, AVX10_2_MINMAX_MIN_COMPARE_SIGN, vlen_enc);
7168 }
7169 }
7170
7171 int C2_MacroAssembler::vector_iota_entry_index(BasicType bt) {
7172 // The vector iota entries array is ordered by type B/S/I/L/F/D, and
7173 // the offset between two types is 16.
7174 switch(bt) {
7175 case T_BYTE:
7176 return 0;
7177 case T_SHORT:
7178 return 1;
7179 case T_INT:
7180 return 2;
7181 case T_LONG:
7182 return 3;
7183 case T_FLOAT:
7184 return 4;
7185 case T_DOUBLE:
7186 return 5;
7187 default:
7188 ShouldNotReachHere();
7189 }
7190 }