1 /*
2 * Copyright (c) 2020, 2026, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25 #include "../../share/runtime/globals.hpp"
26 #include "asm/assembler.hpp"
27 #include "asm/assembler.inline.hpp"
28 #include "gc/shared/barrierSet.hpp"
29 #include "gc/shared/barrierSetAssembler.hpp"
30 #include "oops/methodData.hpp"
31 #include "opto/c2_MacroAssembler.hpp"
32 #include "opto/intrinsicnode.hpp"
33 #include "opto/output.hpp"
34 #include "opto/opcodes.hpp"
35 #include "opto/subnode.hpp"
36 #include "runtime/globals.hpp"
37 #include "runtime/objectMonitor.hpp"
38 #include "runtime/objectMonitorTable.hpp"
39 #include "runtime/stubRoutines.hpp"
40 #include "runtime/synchronizer.hpp"
41 #include "utilities/checkedCast.hpp"
42 #include "utilities/globalDefinitions.hpp"
43 #include "utilities/powerOfTwo.hpp"
44 #include "utilities/sizes.hpp"
45
// Disassembly/debugging helpers: in PRODUCT builds BLOCK_COMMENT emits
// nothing and STOP just stops; in non-product builds both record the text
// as a block comment in the generated-code listing.
#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif
53
// C2 compiled method's prolog code.
//
// Lays out the fixed frame for a C2-compiled method: performs the stack
// bang (when requested), saves the caller's rbp, allocates the frame,
// optionally plants a stack-depth cookie, and finally emits the nmethod
// entry barrier (unless this is a stub).
//
// framesize       - total frame size in bytes including the return-address
//                   slot; asserted StackAlignmentInBytes-aligned.
// stack_bang_size - bytes to bang to provoke any stack overflow eagerly;
//                   <= 0 means no bang is required here.
// fp_mode_24b     - not referenced by this implementation.
// is_stub         - when true, the nmethod entry barrier is skipped.
void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {
  assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");

  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  // Remove word for return addr
  framesize -= wordSize;
  stack_bang_size -= wordSize;

  // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers must bang for them. But be careful, because
  // some VM calls (such as call site linkage) can use several kilobytes of
  // stack. But the stack safety zone should account for that.
  // See bugs 4446381, 4468289, 4497237.
  if (stack_bang_size > 0) {
    generate_stack_overflow_check(stack_bang_size);

    // We always push rbp, so that on return to interpreter rbp, will be
    // restored correctly and we can correct the stack.
    push(rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      mov(rbp, rsp);
    }
    // Remove word for ebp
    framesize -= wordSize;

    // Create frame
    if (framesize) {
      subptr(rsp, framesize);
    }
  } else {
    // No stack bang: allocate the whole frame at once, then store the
    // caller's rbp into its slot instead of push + sub.
    subptr(rsp, framesize);

    // Save RBP register now.
    framesize -= wordSize;
    movptr(Address(rsp, framesize), rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      movptr(rbp, rsp);
      if (framesize > 0) {
        // rbp now points at the slot where the caller's rbp was just saved.
        addptr(rbp, framesize);
      }
    }
  }

  if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
    framesize -= wordSize;
    movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
  }

#ifdef ASSERT
  if (VerifyStackAtCalls) {
    // Verify rsp sits exactly one word below a StackAlignmentInBytes
    // boundary (the compare expects alignment minus the pushed word).
    Label L;
    push(rax);
    mov(rax, rsp);
    andptr(rax, StackAlignmentInBytes-1);
    cmpptr(rax, StackAlignmentInBytes-wordSize);
    pop(rax);
    jcc(Assembler::equal, L);
    STOP("Stack is not properly aligned!");
    bind(L);
  }
#endif

  if (!is_stub) {
    BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
    // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
    Label dummy_slow_path;
    Label dummy_continuation;
    Label* slow_path = &dummy_slow_path;
    Label* continuation = &dummy_continuation;
    if (!Compile::current()->output()->in_scratch_emit_size()) {
      // Use real labels from actual stub when not emitting code for the purpose of measuring its size
      C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
      Compile::current()->output()->add_stub(stub);
      slow_path = &stub->entry();
      continuation = &stub->continuation();
    }
    bs->nmethod_entry_barrier(this, slow_path, continuation);
  }
}
136
137 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
138 switch (vlen_in_bytes) {
139 case 4: // fall-through
140 case 8: // fall-through
141 case 16: return Assembler::AVX_128bit;
142 case 32: return Assembler::AVX_256bit;
143 case 64: return Assembler::AVX_512bit;
144
145 default: {
146 ShouldNotReachHere();
147 return Assembler::AVX_NoVec;
148 }
149 }
150 }
151
152 // fast_lock and fast_unlock used by C2
153
154 // Because the transitions from emitted code to the runtime
155 // monitorenter/exit helper stubs are so slow it's critical that
156 // we inline both the stack-locking fast path and the inflated fast path.
157 //
158 // See also: cmpFastLock and cmpFastUnlock.
159 //
160 // What follows is a specialized inline transliteration of the code
161 // in enter() and exit(). If we're concerned about I$ bloat another
162 // option would be to emit TrySlowEnter and TrySlowExit methods
163 // at startup-time. These methods would accept arguments as
164 // (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
165 // indications in the icc.ZFlag. fast_lock and fast_unlock would simply
166 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
167 // In practice, however, the # of lock sites is bounded and is usually small.
168 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
172 //
173 // An even better approach would be write "MonitorEnter()" and "MonitorExit()"
174 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
175 // to those specialized methods. That'd give us a mostly platform-independent
176 // implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
178 // to park() or unpark() threads. We'd also need a few more unsafe operators
179 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
180 // (b) explicit barriers or fence operations.
181 //
182 // TODO:
183 //
184 // * Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
185 // This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
186 // Given TLAB allocation, Self is usually manifested in a register, so passing it into
187 // the lock operators would typically be faster than reifying Self.
188 //
189 // * Ideally I'd define the primitives as:
190 // fast_lock (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
191 // fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
192 // Unfortunately ADLC bugs prevent us from expressing the ideal form.
193 // Instead, we're stuck with a rather awkward and brittle register assignments below.
194 // Furthermore the register assignments are overconstrained, possibly resulting in
195 // sub-optimal code near the synchronization site.
196 //
197 // * Eliminate the sp-proximity tests and just use "== Self" tests instead.
198 // Alternately, use a better sp-proximity test.
199 //
200 // * Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
201 // Either one is sufficient to uniquely identify a thread.
202 // TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
203 //
204 // * Intrinsify notify() and notifyAll() for the common cases where the
205 // object is locked by the calling thread but the waitlist is empty.
206 // avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
207 //
208 // * use jccb and jmpb instead of jcc and jmp to improve code density.
209 // But beware of excessive branch density on AMD Opterons.
210 //
211 // * Both fast_lock and fast_unlock set the ICC.ZF to indicate success
212 // or failure of the fast path. If the fast path fails then we pass
213 // control to the slow path, typically in C. In fast_lock and
214 // fast_unlock we often branch to DONE_LABEL, just to find that C2
215 // will emit a conditional branch immediately after the node.
216 // So we have branches to branches and lots of ICC.ZF games.
217 // Instead, it might be better to have C2 pass a "FailureLabel"
218 // into fast_lock and fast_unlock. In the case of success, control
219 // will drop through the node. ICC.ZF is undefined at exit.
220 // In the case of failure, the node will branch directly to the
221 // FailureLabel
222
// Emit the inline fast path for monitorenter (lightweight locking).
//
// obj:     object to lock
// box:     on-stack box address -- KILLED
// rax_reg: must be rax (implicit cmpxchg register) -- KILLED
// t  :     tmp -- KILLED
// thread:  current JavaThread
//
// Falls through with ZF == 1 when the lock was acquired inline; reaches
// slow_path with ZF == 0 when the runtime must complete the lock. C2
// branches on ZF immediately after this node.
void C2_MacroAssembler::fast_lock(Register obj, Register box, Register rax_reg,
                                  Register t, Register thread) {
  assert(rax_reg == rax, "Used for CAS");
  assert_different_registers(obj, box, rax_reg, t, thread);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. ZF value is irrelevant.
  Label locked;
  // Finish fast lock unsuccessfully. MUST jump with ZF == 0
  Label slow_path;

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds or we need to take the slow-path.
    movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0);
  }

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    // Divert synchronization on value-based classes to the slow path.
    load_klass(rax_reg, obj, t);
    testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
    jcc(Assembler::notZero, slow_path);
  }

  const Register mark = t;

  { // Fast Lock

    Label push;

    const Register top = UseObjectMonitorTable ? rax_reg : box;

    // Load the mark.
    movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Prefetch top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    // Check for monitor (0b10).
    testptr(mark, markWord::monitor_value);
    jcc(Assembler::notZero, inflated);

    // Check if lock-stack is full.
    cmpl(top, LockStack::end_offset() - 1);
    jcc(Assembler::greater, slow_path);

    // Check if recursive.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    jccb(Assembler::equal, push);

    // Try to lock. Transition lock bits 0b01 => 0b00
    movptr(rax_reg, mark);
    orptr(rax_reg, markWord::unlocked_value);
    andptr(mark, ~(int32_t)markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    jcc(Assembler::notEqual, slow_path);

    if (UseObjectMonitorTable) {
      // Need to reload top, clobbered by CAS.
      movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
    }
    bind(push);
    // After successful lock, push object on lock-stack.
    movptr(Address(thread, top), obj);
    addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
    jmp(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    const Register monitor = t;

    if (!UseObjectMonitorTable) {
      assert(mark == monitor, "should be the same here");
    } else {
      const Register hash = t;
      Label monitor_found;

      // Look for the monitor in the om_cache.

      ByteSize cache_offset = JavaThread::om_cache_oops_offset();
      ByteSize monitor_offset = OMCache::oop_to_monitor_difference();
      const int num_unrolled = OMCache::CAPACITY;
      // Unrolled scan: the candidate monitor is loaded before the oop
      // compare so it is already in place when monitor_found is taken.
      for (int i = 0; i < num_unrolled; i++) {
        movptr(monitor, Address(thread, cache_offset + monitor_offset));
        cmpptr(obj, Address(thread, cache_offset));
        jccb(Assembler::equal, monitor_found);
        cache_offset = cache_offset + OMCache::oop_to_oop_difference();
      }

      if (UseCompactObjectHeaders) {
        // TODO: The fast-path table lookup currently doesn't work with Lilliput's
        // compact identity-hashcode implementation.
        // See: https://bugs.openjdk.org/browse/JDK-8380981
        jmp(slow_path);
      } else {
        // Look for the monitor in the table.

        // Get the hash code from the mark word.
        movptr(hash, Address(obj, oopDesc::mark_offset_in_bytes()));
        shrq(hash, markWord::hash_shift);
        andq(hash, markWord::hash_mask);

        // Get the table and calculate the bucket's address.
        lea(rax_reg, ExternalAddress(ObjectMonitorTable::current_table_address()));
        movptr(rax_reg, Address(rax_reg));
        andq(hash, Address(rax_reg, ObjectMonitorTable::table_capacity_mask_offset()));
        movptr(rax_reg, Address(rax_reg, ObjectMonitorTable::table_buckets_offset()));

        // Read the monitor from the bucket.
        movptr(monitor, Address(rax_reg, hash, Address::times_ptr));

        // Check if the monitor in the bucket is special (empty, tombstone or removed)
        cmpptr(monitor, ObjectMonitorTable::SpecialPointerValues::below_is_special);
        jcc(Assembler::below, slow_path);

        // Check if object matches.
        movptr(rax_reg, Address(monitor, ObjectMonitor::object_offset()));
        BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
        bs_asm->try_resolve_weak_handle_in_c2(this, rax_reg, slow_path);
        cmpptr(rax_reg, obj);
        jcc(Assembler::notEqual, slow_path);
      }
      bind(monitor_found);
    }
    // Without the table, 'monitor' is the mark word and still carries the
    // 0b10 tag, so field offsets are biased by -monitor_tag.
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag);
    const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag);

    Label monitor_locked;
    // Lock the monitor.

    if (UseObjectMonitorTable) {
      // Cache the monitor for unlock before trashing box. On failure to acquire
      // the lock, the slow path will reset the entry accordingly (see CacheSetter).
      movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor);
    }

    // Try to CAS owner (no owner => current thread's _monitor_owner_id).
    xorptr(rax_reg, rax_reg);
    movptr(box, Address(thread, JavaThread::monitor_owner_id_offset()));
    lock(); cmpxchgptr(box, owner_address);
    jccb(Assembler::equal, monitor_locked);

    // Check if recursive: on CAS failure rax holds the observed owner.
    cmpptr(box, rax_reg);
    jccb(Assembler::notEqual, slow_path);

    // Recursive.
    increment(recursions_address);

    bind(monitor_locked);
  }

  bind(locked);
  // Set ZF = 1
  xorl(rax_reg, rax_reg);

#ifdef ASSERT
  // Check that locked label is reached with ZF set.
  Label zf_correct;
  Label zf_bad_zero;
  jcc(Assembler::zero, zf_correct);
  jmp(zf_bad_zero);
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with ZF not set.
  jcc(Assembler::notZero, zf_correct);
  stop("Fast Lock ZF != 0");
  bind(zf_bad_zero);
  stop("Fast Lock ZF != 1");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}
404
405 // obj: object to lock
406 // rax: tmp -- KILLED
407 // t : tmp - cannot be obj nor rax -- KILLED
408 //
409 // Some commentary on balanced locking:
410 //
411 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
412 // Methods that don't have provably balanced locking are forced to run in the
413 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
414 // The interpreter provides two properties:
415 // I1: At return-time the interpreter automatically and quietly unlocks any
416 // objects acquired in the current activation (frame). Recall that the
417 // interpreter maintains an on-stack list of locks currently held by
418 // a frame.
419 // I2: If a method attempts to unlock an object that is not held by the
420 // frame the interpreter throws IMSX.
421 //
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
423 // B() doesn't have provably balanced locking so it runs in the interpreter.
424 // Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
425 // is still locked by A().
426 //
427 // The only other source of unbalanced locking would be JNI. The "Java Native Interface
428 // Specification" states that an object locked by JNI's MonitorEnter should not be
429 // unlocked by "normal" java-level locking and vice-versa. The specification doesn't
430 // specify what will occur if a program engages in such mixed-mode locking, however.
431 // Arguably given that the spec legislates the JNI case as undefined our implementation
432 // could reasonably *avoid* checking owner in fast_unlock().
433 // In the interest of performance we elide m->Owner==Self check in unlock.
434 // A perfectly viable alternative is to elide the owner check except when
435 // Xcheck:jni is enabled.
436
// Emit the inline fast path for monitorexit (lightweight locking).
//
// obj:     object to unlock
// reg_rax: must be rax; tmp -- KILLED. With UseObjectMonitorTable it also
//          serves as the on-stack box address (aliased as 'box' below).
// t:       tmp -- KILLED
// thread:  current JavaThread
//
// Falls through with ZF == 1 on success; reaches the slow-path
// continuation with ZF == 0 when the runtime must finish the unlock.
void C2_MacroAssembler::fast_unlock(Register obj, Register reg_rax, Register t, Register thread) {
  assert(reg_rax == rax, "Used for CAS");
  assert_different_registers(obj, reg_rax, t);

  // Handle inflated monitor.
  Label inflated, inflated_check_lock_stack;
  // Finish fast unlock successfully. MUST jump with ZF == 1
  Label unlocked, slow_path;

  // Register aliases: mark/monitor share t; top/box are assigned so the
  // live ones do not collide for the configuration in use.
  const Register mark = t;
  const Register monitor = t;
  const Register top = UseObjectMonitorTable ? t : reg_rax;
  const Register box = reg_rax;

  Label dummy;
  C2FastUnlockStub* stub = nullptr;

  if (!Compile::current()->output()->in_scratch_emit_size()) {
    // Only create the real out-of-line stub when actually emitting code,
    // not when measuring its size.
    stub = new (Compile::current()->comp_arena()) C2FastUnlockStub(obj, mark, reg_rax, thread);
    Compile::current()->output()->add_stub(stub);
  }

  Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();

  { // Fast Unlock

    // Load top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    if (!UseObjectMonitorTable) {
      // Prefetch mark.
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }

    // Check if obj is top of lock-stack.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    // Top of lock stack was not obj. Must be monitor.
    jcc(Assembler::notEqual, inflated_check_lock_stack);

    // Pop lock-stack.
    DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
    subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);

    // Check if recursive.
    cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
    jcc(Assembler::equal, unlocked);

    // We elide the monitor check, let the CAS fail instead.

    if (UseObjectMonitorTable) {
      // Load mark.
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }

    // Try to unlock. Transition lock bits 0b00 => 0b01
    movptr(reg_rax, mark);
    andptr(reg_rax, ~(int32_t)markWord::lock_mask);
    orptr(mark, markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    // CAS failed: the mark no longer matches; take the stub's
    // push_and_slow_path entry (see C2FastUnlockStub).
    jcc(Assembler::notEqual, push_and_slow_path);
    jmp(unlocked);
  }


  { // Handle inflated monitor.
    bind(inflated_check_lock_stack);
#ifdef ASSERT
    // Debug-only scan of the remaining lock-stack: obj must not appear
    // anywhere on it when unlocking an inflated monitor.
    Label check_done;
    subl(top, oopSize);
    cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
    jcc(Assembler::below, check_done);
    cmpptr(obj, Address(thread, top));
    jcc(Assembler::notEqual, inflated_check_lock_stack);
    stop("Fast Unlock lock on stack");
    bind(check_done);
    if (UseObjectMonitorTable) {
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }
    testptr(mark, markWord::monitor_value);
    jcc(Assembler::notZero, inflated);
    stop("Fast Unlock not monitor");
#endif

    bind(inflated);

    if (!UseObjectMonitorTable) {
      assert(mark == monitor, "should be the same here");
    } else {
      // Uses ObjectMonitorTable. Look for the monitor in our BasicLock on the stack.
      movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
      // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*)
      cmpptr(monitor, alignof(ObjectMonitor*));
      jcc(Assembler::below, slow_path);
    }
    // Without the table the monitor pointer is still tagged with 0b10, so
    // field offsets are biased by -monitor_tag.
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag};
    const Address succ_address{monitor, ObjectMonitor::succ_offset() - monitor_tag};
    const Address entry_list_address{monitor, ObjectMonitor::entry_list_offset() - monitor_tag};
    const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag};

    Label recursive;

    // Check if recursive.
    cmpptr(recursions_address, 0);
    jcc(Assembler::notZero, recursive);

    // Set owner to null.
    // Release to satisfy the JMM
    movptr(owner_address, NULL_WORD);
    // We need a full fence after clearing owner to avoid stranding.
    // StoreLoad achieves this.
    membar(StoreLoad);

    // Check if the entry_list is empty.
    cmpptr(entry_list_address, NULL_WORD);
    jcc(Assembler::zero, unlocked); // If so we are done.

    // Check if there is a successor.
    cmpptr(succ_address, NULL_WORD);
    jcc(Assembler::notZero, unlocked); // If so we are done.

    // Save the monitor pointer in the current thread, so we can try to
    // reacquire the lock in SharedRuntime::monitor_exit_helper().
    if (!UseObjectMonitorTable) {
      // Strip the 0b10 tag before publishing the pointer.
      andptr(monitor, ~(int32_t)markWord::monitor_value);
    }
    movptr(Address(thread, JavaThread::unlocked_inflated_monitor_offset()), monitor);

    orl(t, 1); // Fast Unlock ZF = 0
    jmpb(slow_path);

    // Recursive unlock.
    bind(recursive);
    decrement(recursions_address);
  }

  bind(unlocked);
  xorl(t, t); // Fast Unlock ZF = 1

#ifdef ASSERT
  // Check that unlocked label is reached with ZF set.
  Label zf_correct;
  Label zf_bad_zero;
  jcc(Assembler::zero, zf_correct);
  jmp(zf_bad_zero);
#endif

  bind(slow_path);
  if (stub != nullptr) {
    bind(stub->slow_path_continuation());
  }
#ifdef ASSERT
  // Check that stub->continuation() label is reached with ZF not set.
  jcc(Assembler::notZero, zf_correct);
  stop("Fast Unlock ZF != 0");
  bind(zf_bad_zero);
  stop("Fast Unlock ZF != 1");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}
598
// Failure handler for verify_int_in_range(): called when a CastII node
// observes a value outside its type range. Never returns (fatal()).
static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) {
  fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi);
}
602
603 static void reconstruct_frame_pointer_helper(MacroAssembler* masm, Register dst) {
604 const int framesize = Compile::current()->output()->frame_size_in_bytes();
605 masm->movptr(dst, rsp);
606 if (framesize > 2 * wordSize) {
607 masm->addptr(dst, framesize - 2 * wordSize);
608 }
609 }
610
// Ensure rbp holds the frame pointer of the current C2 frame. With
// PreserveFramePointer the prolog already maintains rbp, so ASSERT builds
// merely recompute it into rtmp and verify; otherwise the value is
// recomputed from rsp directly into rbp. rtmp is written only in ASSERT
// builds.
void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) {
  if (PreserveFramePointer) {
    // frame pointer is valid
#ifdef ASSERT
    // Verify frame pointer value in rbp.
    reconstruct_frame_pointer_helper(this, rtmp);
    Label L_success;
    cmpq(rbp, rtmp);
    jccb(Assembler::equal, L_success);
    STOP("frame pointer mismatch");
    bind(L_success);
#endif // ASSERT
  } else {
    reconstruct_frame_pointer_helper(this, rbp);
  }
}
627
// Emit a check that val lies within the CastII type's range [t->_lo, t->_hi].
// On violation the arguments are marshalled into the C calling convention
// and abort_verify_int_in_range() is called, which terminates the VM.
// idx identifies the CastII node in the failure message.
void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register val) {
  jint lo = t->_lo;
  jint hi = t->_hi;
  assert(lo < hi, "type should not be empty or constant, idx: %u, lo: %d, hi: %d", idx, lo, hi);
  // The full int range needs no check.
  if (t == TypeInt::INT) {
    return;
  }

  BLOCK_COMMENT("CastII {");
  Label fail;
  Label succeed;

  // Only test the bounds that actually constrain the value.
  if (lo != min_jint) {
    cmpl(val, lo);
    jccb(Assembler::less, fail);
  }
  if (hi != max_jint) {
    cmpl(val, hi);
    jccb(Assembler::greater, fail);
  }
  jmpb(succeed);

  bind(fail);
  movl(c_rarg0, idx);
  movl(c_rarg1, val);
  movl(c_rarg2, lo);
  movl(c_rarg3, hi);
  // Make sure rbp holds a valid frame pointer before calling out.
  reconstruct_frame_pointer(rscratch1);
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_int_in_range)));
  hlt();
  bind(succeed);
  BLOCK_COMMENT("} // CastII");
}
661
// Failure handler for verify_long_in_range(): called when a CastLL node
// observes a value outside its type range. Never returns (fatal()).
static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) {
  fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi);
}
665
// Emit a check that val lies within the CastLL type's range [t->_lo, t->_hi].
// On violation abort_verify_long_in_range() is called, which terminates
// the VM. tmp is clobbered only for bounds that do not fit in a
// sign-extended 32-bit immediate.
void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register val, Register tmp) {
  jlong lo = t->_lo;
  jlong hi = t->_hi;
  assert(lo < hi, "type should not be empty or constant, idx: %u, lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, lo, hi);
  // The full long range needs no check.
  if (t == TypeLong::LONG) {
    return;
  }

  BLOCK_COMMENT("CastLL {");
  Label fail;
  Label succeed;

  // Compare val against a 64-bit bound, going through tmp when the bound
  // is not encodable as a 32-bit immediate.
  auto cmp_val = [&](jlong bound) {
    if (is_simm32(bound)) {
      cmpq(val, checked_cast<int>(bound));
    } else {
      mov64(tmp, bound);
      cmpq(val, tmp);
    }
  };

  // Only test the bounds that actually constrain the value.
  if (lo != min_jlong) {
    cmp_val(lo);
    jccb(Assembler::less, fail);
  }
  if (hi != max_jlong) {
    cmp_val(hi);
    jccb(Assembler::greater, fail);
  }
  jmpb(succeed);

  bind(fail);
  movl(c_rarg0, idx);
  movq(c_rarg1, val);
  mov64(c_rarg2, lo);
  mov64(c_rarg3, hi);
  // Make sure rbp holds a valid frame pointer before calling out.
  reconstruct_frame_pointer(rscratch1);
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_long_in_range)));
  hlt();
  bind(succeed);
  BLOCK_COMMENT("} // CastLL");
}
708
709 //-------------------------------------------------------------------------------------------
710 // Generic instructions support for use in .ad files C2 code generation
711
712 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
713 if (dst != src) {
714 movdqu(dst, src);
715 }
716 if (opcode == Op_AbsVD) {
717 andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
718 } else {
719 assert((opcode == Op_NegVD),"opcode should be Op_NegD");
720 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
721 }
722 }
723
724 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
725 if (opcode == Op_AbsVD) {
726 vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
727 } else {
728 assert((opcode == Op_NegVD),"opcode should be Op_NegD");
729 vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
730 }
731 }
732
733 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
734 if (dst != src) {
735 movdqu(dst, src);
736 }
737 if (opcode == Op_AbsVF) {
738 andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
739 } else {
740 assert((opcode == Op_NegVF),"opcode should be Op_NegF");
741 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
742 }
743 }
744
745 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
746 if (opcode == Op_AbsVF) {
747 vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
748 } else {
749 assert((opcode == Op_NegVF),"opcode should be Op_NegF");
750 vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
751 }
752 }
753
754 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
755 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
756 assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
757
758 if (opcode == Op_MinV) {
759 if (elem_bt == T_BYTE) {
760 pminsb(dst, src);
761 } else if (elem_bt == T_SHORT) {
762 pminsw(dst, src);
763 } else if (elem_bt == T_INT) {
764 pminsd(dst, src);
765 } else {
766 assert(elem_bt == T_LONG, "required");
767 assert(tmp == xmm0, "required");
768 assert_different_registers(dst, src, tmp);
769 movdqu(xmm0, dst);
770 pcmpgtq(xmm0, src);
771 blendvpd(dst, src); // xmm0 as mask
772 }
773 } else { // opcode == Op_MaxV
774 if (elem_bt == T_BYTE) {
775 pmaxsb(dst, src);
776 } else if (elem_bt == T_SHORT) {
777 pmaxsw(dst, src);
778 } else if (elem_bt == T_INT) {
779 pmaxsd(dst, src);
780 } else {
781 assert(elem_bt == T_LONG, "required");
782 assert(tmp == xmm0, "required");
783 assert_different_registers(dst, src, tmp);
784 movdqu(xmm0, src);
785 pcmpgtq(xmm0, dst);
786 blendvpd(dst, src); // xmm0 as mask
787 }
788 }
789 }
790
791 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
792 XMMRegister src1, Address src2, int vlen_enc) {
793 assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
794 if (opcode == Op_UMinV) {
795 switch(elem_bt) {
796 case T_BYTE: vpminub(dst, src1, src2, vlen_enc); break;
797 case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
798 case T_INT: vpminud(dst, src1, src2, vlen_enc); break;
799 case T_LONG: evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
800 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
801 }
802 } else {
803 assert(opcode == Op_UMaxV, "required");
804 switch(elem_bt) {
805 case T_BYTE: vpmaxub(dst, src1, src2, vlen_enc); break;
806 case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
807 case T_INT: vpmaxud(dst, src1, src2, vlen_enc); break;
808 case T_LONG: evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
809 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
810 }
811 }
812 }
813
// Unsigned min/max over 64-bit lanes (Op_UMinV/Op_UMaxV).
void C2_MacroAssembler::vpuminmaxq(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  // For optimality, leverage a full vector width of 512 bits
  // for operations over smaller vector sizes on AVX512 targets.
  if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
    if (opcode == Op_UMaxV) {
      evpmaxuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
    } else {
      assert(opcode == Op_UMinV, "required");
      evpminuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
    }
  } else {
    // No unsigned 64-bit compare available here: bias both operands by
    // 1 << 63 so a signed compare yields the unsigned ordering.
    // T1 = -1
    vpcmpeqq(xtmp1, xtmp1, xtmp1, vlen_enc);
    // T1 = -1 << 63
    vpsllq(xtmp1, xtmp1, 63, vlen_enc);
    // Convert SRC2 to signed value i.e. T2 = T1 + SRC2
    vpaddq(xtmp2, xtmp1, src2, vlen_enc);
    // Convert SRC1 to signed value i.e. T1 = T1 + SRC1
    vpaddq(xtmp1, xtmp1, src1, vlen_enc);
    // Mask = T2 > T1, i.e. lanes where src2 is (unsigned-)greater than src1
    vpcmpgtq(xtmp1, xtmp2, xtmp1, vlen_enc);
    if (opcode == Op_UMaxV) {
      // Res = Mask ? Src2 : Src1
      vpblendvb(dst, src1, src2, xtmp1, vlen_enc);
    } else {
      // Res = Mask ? Src1 : Src2
      vpblendvb(dst, src2, src1, xtmp1, vlen_enc);
    }
  }
}
844
845 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
846 XMMRegister src1, XMMRegister src2, int vlen_enc) {
847 assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
848 if (opcode == Op_UMinV) {
849 switch(elem_bt) {
850 case T_BYTE: vpminub(dst, src1, src2, vlen_enc); break;
851 case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
852 case T_INT: vpminud(dst, src1, src2, vlen_enc); break;
853 case T_LONG: evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
854 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
855 }
856 } else {
857 assert(opcode == Op_UMaxV, "required");
858 switch(elem_bt) {
859 case T_BYTE: vpmaxub(dst, src1, src2, vlen_enc); break;
860 case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
861 case T_INT: vpmaxud(dst, src1, src2, vlen_enc); break;
862 case T_LONG: evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
863 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
864 }
865 }
866 }
867
868 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
869 XMMRegister dst, XMMRegister src1, XMMRegister src2,
870 int vlen_enc) {
871 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
872
873 if (opcode == Op_MinV) {
874 if (elem_bt == T_BYTE) {
875 vpminsb(dst, src1, src2, vlen_enc);
876 } else if (elem_bt == T_SHORT) {
877 vpminsw(dst, src1, src2, vlen_enc);
878 } else if (elem_bt == T_INT) {
879 vpminsd(dst, src1, src2, vlen_enc);
880 } else {
881 assert(elem_bt == T_LONG, "required");
882 if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
883 vpminsq(dst, src1, src2, vlen_enc);
884 } else {
885 assert_different_registers(dst, src1, src2);
886 vpcmpgtq(dst, src1, src2, vlen_enc);
887 vblendvpd(dst, src1, src2, dst, vlen_enc);
888 }
889 }
890 } else { // opcode == Op_MaxV
891 if (elem_bt == T_BYTE) {
892 vpmaxsb(dst, src1, src2, vlen_enc);
893 } else if (elem_bt == T_SHORT) {
894 vpmaxsw(dst, src1, src2, vlen_enc);
895 } else if (elem_bt == T_INT) {
896 vpmaxsd(dst, src1, src2, vlen_enc);
897 } else {
898 assert(elem_bt == T_LONG, "required");
899 if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
900 vpmaxsq(dst, src1, src2, vlen_enc);
901 } else {
902 assert_different_registers(dst, src1, src2);
903 vpcmpgtq(dst, src1, src2, vlen_enc);
904 vblendvpd(dst, src2, src1, dst, vlen_enc);
905 }
906 }
907 }
908 }
909
910 // Float/Double min max
911
// Java-semantics float/double vector min/max (also used for the per-step
// combine of min/max reductions) on AVX1+ targets.
void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
                                   XMMRegister dst, XMMRegister a, XMMRegister b,
                                   XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                   int vlen_enc) {
  assert(UseAVX > 0, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(a, tmp, atmp, btmp);
  assert_different_registers(b, tmp, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);

  /* Note on 'non-obvious' assembly sequence:
   *
   * While there are vminps/vmaxps instructions, there are two important differences between hardware
   * and Java on how they handle floats:
   *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
   *  b. NaN is not necesarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
   *
   * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
   *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
   *     (only useful when signs differ, noop otherwise)
   *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])

   * Following pseudo code describes the algorithm for max[FD] (Min algorithm is on similar lines):
   *   btmp = (b < +0.0) ? a : b
   *   atmp = (b < +0.0) ? b : a
   *   Tmp  = Max_Float(atmp, btmp)
   *   Res  = (atmp == NaN) ? atmp : Tmp
   */

  // Dispatch once on (element width, min/max): pick the blend, the min/max
  // and the compare emitters, plus which input's sign drives the pre-blend.
  void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
  void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
  void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
  XMMRegister mask;

  if (!is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vminps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (!is_double_word && !is_min) {
    mask = b;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vmaxps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vminpd;
    vcmp = &MacroAssembler::vcmppd;
  } else {
    assert(is_double_word && !is_min, "sanity");
    mask = b;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vmaxpd;
    vcmp = &MacroAssembler::vcmppd;
  }

  // Make sure EnableX86ECoreOpts isn't disabled on register overlaps
  XMMRegister maxmin, scratch;
  if (dst == btmp) {
    maxmin = btmp;
    scratch = tmp;
  } else {
    maxmin = tmp;
    scratch = btmp;
  }

  bool precompute_mask = EnableX86ECoreOpts && UseAVX>1;
  if (precompute_mask && !is_double_word) {
    // Replicate each dword lane's sign bit across the lane (arithmetic right
    // shift) so the blends below can consume the mask directly.
    vpsrad(tmp, mask, 32, vlen_enc);
    mask = tmp;
  } else if (precompute_mask && is_double_word) {
    // (0 > mask) per 64-bit lane yields all-ones for negative elements.
    vpxor(tmp, tmp, tmp, vlen_enc);
    vpcmpgtq(tmp, tmp, mask, vlen_enc);
    mask = tmp;
  }

  (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
  (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
  (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
  // NaN fix-up: lanes where atmp is unordered with itself (i.e. NaN) take
  // atmp instead of the vmin/vmax result.
  (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
  (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
}
999
// AVX-512 variant of the Java-semantics float/double vector min/max above,
// using an opmask register for the blends. Each of the four cases follows
// the same shape:
//   1. build a mask from the per-lane MSBs (sign bits) of one input,
//   2. blend so that vmin/vmax's "return second operand" behavior on ties
//      produces the Java answer for -0.0 vs +0.0,
//   3. vmin/vmax,
//   4. NaN fix-up: lanes where atmp is unordered with itself take atmp.
void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
                                    XMMRegister dst, XMMRegister a, XMMRegister b,
                                    KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
                                    int vlen_enc) {
  assert(UseAVX > 2, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(dst, a, atmp, btmp);
  assert_different_registers(dst, b, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);
  bool merge = true;

  if (!is_double_word && is_min) {
    evpmovd2m(ktmp, a, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vminps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (!is_double_word && !is_min) {
    evpmovd2m(ktmp, b, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (is_double_word && is_min) {
    evpmovq2m(ktmp, a, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vminpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    evpmovq2m(ktmp, b, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  }
}
1046
1047 void C2_MacroAssembler::vminmax_fp(int opc, BasicType elem_bt, XMMRegister dst, KRegister mask,
1048 XMMRegister src1, XMMRegister src2, int vlen_enc) {
1049 assert(opc == Op_MinV || opc == Op_MinReductionV ||
1050 opc == Op_MaxV || opc == Op_MaxReductionV, "sanity");
1051
1052 int imm8 = (opc == Op_MinV || opc == Op_MinReductionV) ? AVX10_2_MINMAX_MIN_COMPARE_SIGN
1053 : AVX10_2_MINMAX_MAX_COMPARE_SIGN;
1054 if (elem_bt == T_FLOAT) {
1055 evminmaxps(dst, mask, src1, src2, true, imm8, vlen_enc);
1056 } else {
1057 assert(elem_bt == T_DOUBLE, "");
1058 evminmaxpd(dst, mask, src1, src2, true, imm8, vlen_enc);
1059 }
1060 }
1061
1062 // Float/Double signum
// Scalar float/double signum: if dst is +0.0/-0.0 or NaN it is returned
// unchanged; otherwise dst becomes +1.0 for positive input and -1.0 for
// negative input. `zero` holds 0.0 and `one` holds 1.0 in matching width.
void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
  assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");

  Label DONE_LABEL;

  // Handle special cases +0.0/-0.0 and NaN, if argument is +0.0/-0.0 or NaN, return argument
  // If AVX10.2 (or newer) floating point comparison instructions used, SF=1 for equal and unordered cases
  // If other floating point comparison instructions used, ZF=1 for equal and unordered cases
  if (opcode == Op_SignumF) {
    if (VM_Version::supports_avx10_2()) {
      vucomxss(dst, zero);
      jcc(Assembler::negative, DONE_LABEL);
    } else {
      ucomiss(dst, zero);
      jcc(Assembler::equal, DONE_LABEL);
    }
    // Input is strictly positive or strictly negative here. Load +1.0: the
    // SSE move does not modify EFLAGS, so the branch below still observes
    // the result of the compare against zero above.
    movflt(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    // Negative input: flip the sign bit of +1.0, producing -1.0.
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
  } else if (opcode == Op_SignumD) {
    if (VM_Version::supports_avx10_2()) {
      vucomxsd(dst, zero);
      jcc(Assembler::negative, DONE_LABEL);
    } else {
      ucomisd(dst, zero);
      jcc(Assembler::equal, DONE_LABEL);
    }
    // See the float path above: flags from the compare survive the move.
    movdbl(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
  }

  bind(DONE_LABEL);
}
1097
1098 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1099 if (sign) {
1100 pmovsxbw(dst, src);
1101 } else {
1102 pmovzxbw(dst, src);
1103 }
1104 }
1105
1106 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1107 if (sign) {
1108 vpmovsxbw(dst, src, vector_len);
1109 } else {
1110 vpmovzxbw(dst, src, vector_len);
1111 }
1112 }
1113
1114 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1115 if (sign) {
1116 vpmovsxbd(dst, src, vector_len);
1117 } else {
1118 vpmovzxbd(dst, src, vector_len);
1119 }
1120 }
1121
1122 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1123 if (sign) {
1124 vpmovsxwd(dst, src, vector_len);
1125 } else {
1126 vpmovzxwd(dst, src, vector_len);
1127 }
1128 }
1129
1130 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1131 int shift, int vector_len) {
1132 if (opcode == Op_RotateLeftV) {
1133 if (etype == T_INT) {
1134 evprold(dst, src, shift, vector_len);
1135 } else {
1136 assert(etype == T_LONG, "expected type T_LONG");
1137 evprolq(dst, src, shift, vector_len);
1138 }
1139 } else {
1140 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1141 if (etype == T_INT) {
1142 evprord(dst, src, shift, vector_len);
1143 } else {
1144 assert(etype == T_LONG, "expected type T_LONG");
1145 evprorq(dst, src, shift, vector_len);
1146 }
1147 }
1148 }
1149
1150 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1151 XMMRegister shift, int vector_len) {
1152 if (opcode == Op_RotateLeftV) {
1153 if (etype == T_INT) {
1154 evprolvd(dst, src, shift, vector_len);
1155 } else {
1156 assert(etype == T_LONG, "expected type T_LONG");
1157 evprolvq(dst, src, shift, vector_len);
1158 }
1159 } else {
1160 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1161 if (etype == T_INT) {
1162 evprorvd(dst, src, shift, vector_len);
1163 } else {
1164 assert(etype == T_LONG, "expected type T_LONG");
1165 evprorvq(dst, src, shift, vector_len);
1166 }
1167 }
1168 }
1169
1170 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1171 if (opcode == Op_RShiftVI) {
1172 psrad(dst, shift);
1173 } else if (opcode == Op_LShiftVI) {
1174 pslld(dst, shift);
1175 } else {
1176 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1177 psrld(dst, shift);
1178 }
1179 }
1180
1181 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1182 switch (opcode) {
1183 case Op_RShiftVI: psrad(dst, shift); break;
1184 case Op_LShiftVI: pslld(dst, shift); break;
1185 case Op_URShiftVI: psrld(dst, shift); break;
1186
1187 default: assert(false, "%s", NodeClassNames[opcode]);
1188 }
1189 }
1190
1191 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1192 if (opcode == Op_RShiftVI) {
1193 vpsrad(dst, nds, shift, vector_len);
1194 } else if (opcode == Op_LShiftVI) {
1195 vpslld(dst, nds, shift, vector_len);
1196 } else {
1197 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1198 vpsrld(dst, nds, shift, vector_len);
1199 }
1200 }
1201
1202 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1203 switch (opcode) {
1204 case Op_RShiftVI: vpsrad(dst, src, shift, vlen_enc); break;
1205 case Op_LShiftVI: vpslld(dst, src, shift, vlen_enc); break;
1206 case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1207
1208 default: assert(false, "%s", NodeClassNames[opcode]);
1209 }
1210 }
1211
1212 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1213 switch (opcode) {
1214 case Op_RShiftVB: // fall-through
1215 case Op_RShiftVS: psraw(dst, shift); break;
1216
1217 case Op_LShiftVB: // fall-through
1218 case Op_LShiftVS: psllw(dst, shift); break;
1219
1220 case Op_URShiftVS: // fall-through
1221 case Op_URShiftVB: psrlw(dst, shift); break;
1222
1223 default: assert(false, "%s", NodeClassNames[opcode]);
1224 }
1225 }
1226
1227 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1228 switch (opcode) {
1229 case Op_RShiftVB: // fall-through
1230 case Op_RShiftVS: vpsraw(dst, src, shift, vlen_enc); break;
1231
1232 case Op_LShiftVB: // fall-through
1233 case Op_LShiftVS: vpsllw(dst, src, shift, vlen_enc); break;
1234
1235 case Op_URShiftVS: // fall-through
1236 case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1237
1238 default: assert(false, "%s", NodeClassNames[opcode]);
1239 }
1240 }
1241
1242 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1243 switch (opcode) {
1244 case Op_RShiftVL: psrlq(dst, shift); break; // using srl to implement sra on pre-avs512 systems
1245 case Op_LShiftVL: psllq(dst, shift); break;
1246 case Op_URShiftVL: psrlq(dst, shift); break;
1247
1248 default: assert(false, "%s", NodeClassNames[opcode]);
1249 }
1250 }
1251
1252 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1253 if (opcode == Op_RShiftVL) {
1254 psrlq(dst, shift); // using srl to implement sra on pre-avs512 systems
1255 } else if (opcode == Op_LShiftVL) {
1256 psllq(dst, shift);
1257 } else {
1258 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1259 psrlq(dst, shift);
1260 }
1261 }
1262
1263 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1264 switch (opcode) {
1265 case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1266 case Op_LShiftVL: vpsllq(dst, src, shift, vlen_enc); break;
1267 case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1268
1269 default: assert(false, "%s", NodeClassNames[opcode]);
1270 }
1271 }
1272
1273 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1274 if (opcode == Op_RShiftVL) {
1275 evpsraq(dst, nds, shift, vector_len);
1276 } else if (opcode == Op_LShiftVL) {
1277 vpsllq(dst, nds, shift, vector_len);
1278 } else {
1279 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1280 vpsrlq(dst, nds, shift, vector_len);
1281 }
1282 }
1283
1284 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1285 switch (opcode) {
1286 case Op_RShiftVB: // fall-through
1287 case Op_RShiftVS: // fall-through
1288 case Op_RShiftVI: vpsravd(dst, src, shift, vlen_enc); break;
1289
1290 case Op_LShiftVB: // fall-through
1291 case Op_LShiftVS: // fall-through
1292 case Op_LShiftVI: vpsllvd(dst, src, shift, vlen_enc); break;
1293
1294 case Op_URShiftVB: // fall-through
1295 case Op_URShiftVS: // fall-through
1296 case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1297
1298 default: assert(false, "%s", NodeClassNames[opcode]);
1299 }
1300 }
1301
1302 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1303 switch (opcode) {
1304 case Op_RShiftVB: // fall-through
1305 case Op_RShiftVS: evpsravw(dst, src, shift, vlen_enc); break;
1306
1307 case Op_LShiftVB: // fall-through
1308 case Op_LShiftVS: evpsllvw(dst, src, shift, vlen_enc); break;
1309
1310 case Op_URShiftVB: // fall-through
1311 case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1312
1313 default: assert(false, "%s", NodeClassNames[opcode]);
1314 }
1315 }
1316
// Per-lane variable 64-bit vector shift. The arithmetic right shift only
// exists as a hardware instruction (evpsravq) on AVX-512; on AVX2 it is
// synthesized from logical shifts using `tmp` as a scratch register
// (callers pass tmp == xnoreg in all other configurations).
void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
  assert(UseAVX >= 2, "required");
  switch (opcode) {
    case Op_RShiftVL: {
      if (UseAVX > 2) {
        assert(tmp == xnoreg, "not used");
        // Without AVX512VL, evpsravq has no 128/256-bit encoding: widen to
        // the full 512-bit vector (extra lanes carry no observable state).
        if (!VM_Version::supports_avx512vl()) {
          vlen_enc = Assembler::AVX_512bit;
        }
        evpsravq(dst, src, shift, vlen_enc);
      } else {
        // AVX2 fallback, using the sign-extension identity
        //   sra(x, s) == (srl(x, s) ^ srl(m, s)) - srl(m, s)
        // where m is the per-lane long sign-bit mask from the stub area.
        vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
        vpsrlvq(dst, src, shift, vlen_enc);
        vpsrlvq(tmp, tmp, shift, vlen_enc);
        vpxor(dst, dst, tmp, vlen_enc);
        vpsubq(dst, dst, tmp, vlen_enc);
      }
      break;
    }
    case Op_LShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsllvq(dst, src, shift, vlen_enc);
      break;
    }
    case Op_URShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsrlvq(dst, src, shift, vlen_enc);
      break;
    }
    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}
1349
1350 // Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst
1351 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1352 assert(opcode == Op_LShiftVB ||
1353 opcode == Op_RShiftVB ||
1354 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1355 bool sign = (opcode != Op_URShiftVB);
1356 assert(vector_len == 0, "required");
1357 vextendbd(sign, dst, src, 1);
1358 vpmovzxbd(vtmp, shift, 1);
1359 varshiftd(opcode, dst, dst, vtmp, 1);
1360 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
1361 vextracti128_high(vtmp, dst);
1362 vpackusdw(dst, dst, vtmp, 0);
1363 }
1364
1365 // Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst
1366 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1367 assert(opcode == Op_LShiftVB ||
1368 opcode == Op_RShiftVB ||
1369 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1370 bool sign = (opcode != Op_URShiftVB);
1371 int ext_vector_len = vector_len + 1;
1372 vextendbw(sign, dst, src, ext_vector_len);
1373 vpmovzxbw(vtmp, shift, ext_vector_len);
1374 varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1375 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
1376 if (vector_len == 0) {
1377 vextracti128_high(vtmp, dst);
1378 vpackuswb(dst, dst, vtmp, vector_len);
1379 } else {
1380 vextracti64x4_high(vtmp, dst);
1381 vpackuswb(dst, dst, vtmp, vector_len);
1382 vpermq(dst, dst, 0xD8, vector_len);
1383 }
1384 }
1385
1386 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1387 switch(typ) {
1388 case T_BYTE:
1389 pinsrb(dst, val, idx);
1390 break;
1391 case T_SHORT:
1392 pinsrw(dst, val, idx);
1393 break;
1394 case T_INT:
1395 pinsrd(dst, val, idx);
1396 break;
1397 case T_LONG:
1398 pinsrq(dst, val, idx);
1399 break;
1400 default:
1401 assert(false,"Should not reach here.");
1402 break;
1403 }
1404 }
1405
1406 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1407 switch(typ) {
1408 case T_BYTE:
1409 vpinsrb(dst, src, val, idx);
1410 break;
1411 case T_SHORT:
1412 vpinsrw(dst, src, val, idx);
1413 break;
1414 case T_INT:
1415 vpinsrd(dst, src, val, idx);
1416 break;
1417 case T_LONG:
1418 vpinsrq(dst, src, val, idx);
1419 break;
1420 default:
1421 assert(false,"Should not reach here.");
1422 break;
1423 }
1424 }
1425
// Gather one 64-bit slice (8 byte elements or 4 short elements) into dst.
// Lane i is loaded from base[idx_base[i]] only when bit `mask_idx` of the
// GP-register `mask` is set; masked-off lanes remain zero. `mask_idx` is
// advanced once per lane, so consecutive calls continue through the mask.
void C2_MacroAssembler::vgather8b_masked(BasicType elem_bt, XMMRegister dst,
                                         Register base, Register idx_base,
                                         Register mask, Register mask_idx,
                                         Register rtmp, int vlen_enc) {
  // Start from all-zero so skipped lanes read as 0.
  vpxor(dst, dst, dst, vlen_enc);
  if (elem_bt == T_SHORT) {
    for (int i = 0; i < 4; i++) {
      // dst[i] = mask[i] ? src[idx_base[i]] : 0
      Label skip_load;
      // bt sets CF to the tested mask bit.
      btq(mask, mask_idx);
      jccb(Assembler::carryClear, skip_load);
      // Indices are stored as 32-bit ints, 4 bytes apart.
      movl(rtmp, Address(idx_base, i * 4));
      pinsrw(dst, Address(base, rtmp, Address::times_2), i);
      bind(skip_load);
      incq(mask_idx);
    }
  } else {
    assert(elem_bt == T_BYTE, "");
    for (int i = 0; i < 8; i++) {
      // dst[i] = mask[i] ? src[idx_base[i]] : 0
      Label skip_load;
      btq(mask, mask_idx);
      jccb(Assembler::carryClear, skip_load);
      movl(rtmp, Address(idx_base, i * 4));
      pinsrb(dst, Address(base, rtmp), i);
      bind(skip_load);
      incq(mask_idx);
    }
  }
}
1456
// Gather one 64-bit slice (8 byte elements or 4 short elements) into dst:
// lane i is loaded from base[idx_base[i]]. Unmasked companion to
// vgather8b_masked above.
void C2_MacroAssembler::vgather8b(BasicType elem_bt, XMMRegister dst,
                                  Register base, Register idx_base,
                                  Register rtmp, int vlen_enc) {
  // Clear dst so the lanes above the gathered slice are zero.
  vpxor(dst, dst, dst, vlen_enc);
  if (elem_bt == T_SHORT) {
    for (int i = 0; i < 4; i++) {
      // dst[i] = src[idx_base[i]]; indices are 32-bit ints, 4 bytes apart.
      movl(rtmp, Address(idx_base, i * 4));
      pinsrw(dst, Address(base, rtmp, Address::times_2), i);
    }
  } else {
    assert(elem_bt == T_BYTE, "");
    for (int i = 0; i < 8; i++) {
      // dst[i] = src[idx_base[i]]
      movl(rtmp, Address(idx_base, i * 4));
      pinsrb(dst, Address(base, rtmp), i);
    }
  }
}
1476
1477 /*
1478 * Gather using hybrid algorithm, first partially unroll scalar loop
1479 * to accumulate values from gather indices into a quad-word(64bit) slice.
1480 * A slice may hold 8 bytes or 4 short values. This is followed by a vector
1481 * permutation to place the slice into appropriate vector lane
1482 * locations in destination vector. Following pseudo code describes the
1483 * algorithm in detail:
1484 *
1485 * DST_VEC = ZERO_VEC
1486 * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..}
1487 * TWO_VEC = {2, 2, 2, 2, 2, 2, 2, 2, 2..}
1488 * FOREACH_ITER:
1489 * TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES
1490 * TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX
1491 * DST_VEC = DST_VEC OR TEMP_PERM_VEC
1492 * PERM_INDEX = PERM_INDEX - TWO_VEC
1493 *
1494 * With each iteration, doubleword permute indices (0,1) corresponding
1495 * to gathered quadword gets right shifted by two lane positions.
1496 *
1497 */
// Subword (byte/short) gather; see the algorithm description in the block
// comment above. Each loop iteration gathers a 64-bit slice with
// vgather8b[_masked], rotates it into place via vpermd, and ORs it into dst.
void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst,
                                        Register base, Register idx_base,
                                        Register mask, XMMRegister xtmp1,
                                        XMMRegister xtmp2, XMMRegister temp_dst,
                                        Register rtmp, Register mask_idx,
                                        Register length, int vector_len, int vlen_enc) {
  Label GATHER8_LOOP;
  assert(is_subword_type(elem_ty), "");
  // `length` counts remaining elements; decremented per 64-bit slice below.
  movl(length, vector_len);
  vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...}
  vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...}
  // Build the all-twos decrement vector: 0 - (-1) = 1, then << 1 = 2.
  vallones(xtmp2, vlen_enc);
  vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc);
  vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...}
  load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...}

  bind(GATHER8_LOOP);
    // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES
    if (mask == noreg) {
      vgather8b(elem_ty, temp_dst, base, idx_base, rtmp, vlen_enc);
    } else {
      vgather8b_masked(elem_ty, temp_dst, base, idx_base, mask, mask_idx, rtmp, vlen_enc);
    }
    // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1)
    vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? vlen_enc : Assembler::AVX_256bit);
    // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2)
    vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc);
    // DST_VEC = DST_VEC OR TEMP_PERM_VEC
    vpor(dst, dst, temp_dst, vlen_enc);
    // Advance past the consumed 32-bit indices: 8 for bytes, 4 for shorts.
    addptr(idx_base,  32 >> (type2aelembytes(elem_ty) - 1));
    // 8 byte elements or 4 short elements handled per iteration.
    subl(length, 8 >> (type2aelembytes(elem_ty) - 1));
    jcc(Assembler::notEqual, GATHER8_LOOP);
}
1531
1532 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1533 switch(typ) {
1534 case T_INT:
1535 vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1536 break;
1537 case T_FLOAT:
1538 vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1539 break;
1540 case T_LONG:
1541 vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1542 break;
1543 case T_DOUBLE:
1544 vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1545 break;
1546 default:
1547 assert(false,"Should not reach here.");
1548 break;
1549 }
1550 }
1551
1552 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1553 switch(typ) {
1554 case T_INT:
1555 evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1556 break;
1557 case T_FLOAT:
1558 evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1559 break;
1560 case T_LONG:
1561 evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1562 break;
1563 case T_DOUBLE:
1564 evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1565 break;
1566 default:
1567 assert(false,"Should not reach here.");
1568 break;
1569 }
1570 }
1571
1572 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1573 switch(typ) {
1574 case T_INT:
1575 evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1576 break;
1577 case T_FLOAT:
1578 evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1579 break;
1580 case T_LONG:
1581 evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1582 break;
1583 case T_DOUBLE:
1584 evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1585 break;
1586 default:
1587 assert(false,"Should not reach here.");
1588 break;
1589 }
1590 }
1591
// Expand a byte-per-lane boolean vector in `src` (presumably 0/1 values —
// see callers) into a full lane-width mask in `dst`: 0 - src turns 1 into
// 0xFF per byte, which is then sign-extended to the width of `elem_bt`.
void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
  if (vlen_in_bytes <= 16) {
    // 128-bit path: destructive SSE forms.
    pxor (dst, dst);
    psubb(dst, src);
    switch (elem_bt) {
      case T_BYTE:   /* nothing to do */ break;
      case T_SHORT:  pmovsxbw(dst, dst); break;
      case T_INT:    pmovsxbd(dst, dst); break;
      case T_FLOAT:  pmovsxbd(dst, dst); break;
      case T_LONG:   pmovsxbq(dst, dst); break;
      case T_DOUBLE: pmovsxbq(dst, dst); break;

      default: assert(false, "%s", type2name(elem_bt));
    }
  } else {
    // With is_legacy a subword source must fit in 256 bits (asserted), so
    // the legacy-encoded 256-bit subtract below is sufficient.
    assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
    int vlen_enc = vector_length_encoding(vlen_in_bytes);

    vpxor (dst, dst, dst, vlen_enc);
    vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);

    switch (elem_bt) {
      case T_BYTE:   /* nothing to do */ break;
      case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
      case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
      case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
      case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
      case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;

      default: assert(false, "%s", type2name(elem_bt));
    }
  }
}
1625
// Produce an opmask register from a byte-per-lane boolean vector. When
// `novlbwdq` is set (target presumably lacks the AVX-512 VL/BW/DQ forms
// needed here — confirm against callers) the bytes are widened to dwords
// and compared against a stub bit pattern instead of using evpmovb2m.
void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
  if (novlbwdq) {
    // Sign-extend the boolean bytes to dwords, then set mask bits for lanes
    // equal to the stub's comparison bit pattern.
    vpmovsxbd(xtmp, src, vlen_enc);
    evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
            Assembler::eq, true, vlen_enc, noreg);
  } else {
    // 0 - src turns boolean 1 into 0xFF; evpmovb2m then collects the byte
    // MSBs into the opmask.
    vpxor(xtmp, xtmp, xtmp, vlen_enc);
    vpsubb(xtmp, xtmp, src, vlen_enc);
    evpmovb2m(dst, xtmp, vlen_enc);
  }
}
1637
1638 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, Address src, int vlen_in_bytes) {
1639 if (is_integral_type(bt)) {
1640 switch (vlen_in_bytes) {
1641 case 4: movdl(dst, src); break;
1642 case 8: movq(dst, src); break;
1643 case 16: movdqu(dst, src); break;
1644 case 32: vmovdqu(dst, src); break;
1645 case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
1646 default: ShouldNotReachHere();
1647 }
1648 } else {
1649 switch (vlen_in_bytes) {
1650 case 4: movflt(dst, src); break;
1651 case 8: movdbl(dst, src); break;
1652 case 16: movups(dst, src); break;
1653 case 32: vmovups(dst, src, Assembler::AVX_256bit); break;
1654 case 64: vmovups(dst, src, Assembler::AVX_512bit); break;
1655 default: ShouldNotReachHere();
1656 }
1657 }
1658 }
1659
1660 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
1661 assert(rscratch != noreg || always_reachable(src), "missing");
1662
1663 if (reachable(src)) {
1664 load_vector(bt, dst, as_Address(src), vlen_in_bytes);
1665 } else {
1666 lea(rscratch, src);
1667 load_vector(bt, dst, Address(rscratch, 0), vlen_in_bytes);
1668 }
1669 }
1670
// Load a constant vector by broadcasting the element at `src`, choosing the
// cheapest broadcast encoding available for the target's AVX level.
void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
  int vlen_enc = vector_length_encoding(vlen);
  if (VM_Version::supports_avx()) {
    if (bt == T_LONG) {
      // 64-bit integer broadcast: vpbroadcastq needs AVX2; vmovddup (AVX)
      // duplicates the low quadword as a fallback.
      if (VM_Version::supports_avx2()) {
        vpbroadcastq(dst, src, vlen_enc);
      } else {
        vmovddup(dst, src, vlen_enc);
      }
    } else if (bt == T_DOUBLE) {
      // vbroadcastsd has no 128-bit form; use vmovddup at XMM width.
      if (vlen_enc != Assembler::AVX_128bit) {
        vbroadcastsd(dst, src, vlen_enc, noreg);
      } else {
        vmovddup(dst, src, vlen_enc);
      }
    } else {
      // Remaining types: broadcast a 32-bit granule — integer form on AVX2,
      // FP form otherwise (the bit pattern moved is the same either way).
      if (VM_Version::supports_avx2() && is_integral_type(bt)) {
        vpbroadcastd(dst, src, vlen_enc);
      } else {
        vbroadcastss(dst, src, vlen_enc);
      }
    }
  } else if (VM_Version::supports_sse3()) {
    movddup(dst, src);
  } else {
    // NOTE(review): this fallback loads `vlen` bytes from `src` rather than
    // broadcasting a single element, so the constant area is presumably
    // already replicated in memory for pre-SSE3 targets — confirm against
    // the constant-table emission in callers.
    load_vector(bt, dst, src, vlen);
  }
}
1699
1700 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
1701 // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64.
1702 int offset = exact_log2(type2aelembytes(bt)) << 6;
1703 if (is_floating_point_type(bt)) {
1704 offset += 128;
1705 }
1706 ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset);
1707 load_vector(T_BYTE, dst, addr, vlen_in_bytes);
1708 }
1709
1710 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
1711
// One 128-bit combining step of an ordered reduction: dst = dst <op> src.
// Integral opcodes combine all lanes element-wise; the FP add/mul opcodes use
// the scalar forms (addss/addsd/mulss/mulsd), i.e. only lane 0 — that is how
// the strictly-ordered FP reductions are built up by the callers.
// 64-bit min/max and 64-bit multiply require AVX-512 (see asserts below).
void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
  int vector_len = Assembler::AVX_128bit;

  switch (opcode) {
    case Op_AndReductionV: pand(dst, src); break;
    case Op_OrReductionV: por (dst, src); break;
    case Op_XorReductionV: pxor(dst, src); break;
    case Op_MinReductionV:
      switch (typ) {
        case T_BYTE: pminsb(dst, src); break;
        case T_SHORT: pminsw(dst, src); break;
        case T_INT: pminsd(dst, src); break;
        case T_LONG: assert(UseAVX > 2, "required");
                     vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
        default: assert(false, "wrong type");
      }
      break;
    case Op_MaxReductionV:
      switch (typ) {
        case T_BYTE: pmaxsb(dst, src); break;
        case T_SHORT: pmaxsw(dst, src); break;
        case T_INT: pmaxsd(dst, src); break;
        case T_LONG: assert(UseAVX > 2, "required");
                     vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
        default: assert(false, "wrong type");
      }
      break;
    // Unsigned min/max use the AVX-encoded forms; the 64-bit variants only
    // exist as EVEX (AVX-512) instructions.
    case Op_UMinReductionV:
      switch (typ) {
        case T_BYTE: vpminub(dst, dst, src, Assembler::AVX_128bit); break;
        case T_SHORT: vpminuw(dst, dst, src, Assembler::AVX_128bit); break;
        case T_INT: vpminud(dst, dst, src, Assembler::AVX_128bit); break;
        case T_LONG: evpminuq(dst, k0, dst, src, true, Assembler::AVX_128bit); break;
        default: assert(false, "wrong type");
      }
      break;
    case Op_UMaxReductionV:
      switch (typ) {
        case T_BYTE: vpmaxub(dst, dst, src, Assembler::AVX_128bit); break;
        case T_SHORT: vpmaxuw(dst, dst, src, Assembler::AVX_128bit); break;
        case T_INT: vpmaxud(dst, dst, src, Assembler::AVX_128bit); break;
        case T_LONG: evpmaxuq(dst, k0, dst, src, true, Assembler::AVX_128bit); break;
        default: assert(false, "wrong type");
      }
      break;
    // FP add/mul: scalar forms — only lane 0 participates (ordered semantics).
    case Op_AddReductionVF: addss(dst, src); break;
    case Op_AddReductionVD: addsd(dst, src); break;
    case Op_AddReductionVI:
      switch (typ) {
        case T_BYTE: paddb(dst, src); break;
        case T_SHORT: paddw(dst, src); break;
        case T_INT: paddd(dst, src); break;
        default: assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVL: paddq(dst, src); break;
    case Op_MulReductionVF: mulss(dst, src); break;
    case Op_MulReductionVD: mulsd(dst, src); break;
    case Op_MulReductionVI:
      switch (typ) {
        case T_SHORT: pmullw(dst, src); break;
        case T_INT: pmulld(dst, src); break;
        default: assert(false, "wrong type");
      }
      break;
    case Op_MulReductionVL: assert(UseAVX > 2, "required");
                            evpmullq(dst, dst, src, vector_len); break;
    default: assert(false, "wrong opcode");
  }
}
1782
1783 void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1784 switch (opcode) {
1785 case Op_AddReductionVF: addps(dst, src); break;
1786 case Op_AddReductionVD: addpd(dst, src); break;
1787 case Op_MulReductionVF: mulps(dst, src); break;
1788 case Op_MulReductionVD: mulpd(dst, src); break;
1789 default: assert(false, "%s", NodeClassNames[opcode]);
1790 }
1791 }
1792
// One 256-bit combining step of a reduction: dst = src1 <op> src2 on all
// lanes. Only integral opcodes appear here; 64-bit min/max and the 64-bit
// multiply require AVX-512 (asserts / EVEX-encoded instructions below).
void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
  int vector_len = Assembler::AVX_256bit;

  switch (opcode) {
    case Op_AndReductionV: vpand(dst, src1, src2, vector_len); break;
    case Op_OrReductionV: vpor (dst, src1, src2, vector_len); break;
    case Op_XorReductionV: vpxor(dst, src1, src2, vector_len); break;
    case Op_MinReductionV:
      switch (typ) {
        case T_BYTE: vpminsb(dst, src1, src2, vector_len); break;
        case T_SHORT: vpminsw(dst, src1, src2, vector_len); break;
        case T_INT: vpminsd(dst, src1, src2, vector_len); break;
        case T_LONG: assert(UseAVX > 2, "required");
                     vpminsq(dst, src1, src2, vector_len); break;
        default: assert(false, "wrong type");
      }
      break;
    case Op_MaxReductionV:
      switch (typ) {
        case T_BYTE: vpmaxsb(dst, src1, src2, vector_len); break;
        case T_SHORT: vpmaxsw(dst, src1, src2, vector_len); break;
        case T_INT: vpmaxsd(dst, src1, src2, vector_len); break;
        case T_LONG: assert(UseAVX > 2, "required");
                     vpmaxsq(dst, src1, src2, vector_len); break;
        default: assert(false, "wrong type");
      }
      break;
    // Unsigned 64-bit min/max only exist as EVEX (AVX-512) instructions.
    case Op_UMinReductionV:
      switch (typ) {
        case T_BYTE: vpminub(dst, src1, src2, vector_len); break;
        case T_SHORT: vpminuw(dst, src1, src2, vector_len); break;
        case T_INT: vpminud(dst, src1, src2, vector_len); break;
        case T_LONG: evpminuq(dst, k0, src1, src2, true, vector_len); break;
        default: assert(false, "wrong type");
      }
      break;
    case Op_UMaxReductionV:
      switch (typ) {
        case T_BYTE: vpmaxub(dst, src1, src2, vector_len); break;
        case T_SHORT: vpmaxuw(dst, src1, src2, vector_len); break;
        case T_INT: vpmaxud(dst, src1, src2, vector_len); break;
        case T_LONG: evpmaxuq(dst, k0, src1, src2, true, vector_len); break;
        default: assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVI:
      switch (typ) {
        case T_BYTE: vpaddb(dst, src1, src2, vector_len); break;
        case T_SHORT: vpaddw(dst, src1, src2, vector_len); break;
        case T_INT: vpaddd(dst, src1, src2, vector_len); break;
        default: assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
    case Op_MulReductionVI:
      switch (typ) {
        case T_SHORT: vpmullw(dst, src1, src2, vector_len); break;
        case T_INT: vpmulld(dst, src1, src2, vector_len); break;
        default: assert(false, "wrong type");
      }
      break;
    case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break;
    default: assert(false, "wrong opcode");
  }
}
1858
1859 void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
1860 int vector_len = Assembler::AVX_256bit;
1861
1862 switch (opcode) {
1863 case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break;
1864 case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break;
1865 case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break;
1866 case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break;
1867 default: assert(false, "%s", NodeClassNames[opcode]);
1868 }
1869 }
1870
1871 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1872 XMMRegister dst, XMMRegister src,
1873 XMMRegister vtmp1, XMMRegister vtmp2) {
1874 switch (opcode) {
1875 case Op_AddReductionVF:
1876 case Op_MulReductionVF:
1877 reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1878 break;
1879
1880 case Op_AddReductionVD:
1881 case Op_MulReductionVD:
1882 reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1883 break;
1884
1885 default: assert(false, "wrong opcode");
1886 }
1887 }
1888
1889 void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen,
1890 XMMRegister dst, XMMRegister src,
1891 XMMRegister vtmp1, XMMRegister vtmp2) {
1892 switch (opcode) {
1893 case Op_AddReductionVF:
1894 case Op_MulReductionVF:
1895 unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1896 break;
1897
1898 case Op_AddReductionVD:
1899 case Op_MulReductionVD:
1900 unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1901 break;
1902
1903 default: assert(false, "%s", NodeClassNames[opcode]);
1904 }
1905 }
1906
1907 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1908 Register dst, Register src1, XMMRegister src2,
1909 XMMRegister vtmp1, XMMRegister vtmp2) {
1910 switch (vlen) {
1911 case 8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1912 case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1913 case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1914 case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1915
1916 default: assert(false, "wrong vector length");
1917 }
1918 }
1919
1920 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
1921 Register dst, Register src1, XMMRegister src2,
1922 XMMRegister vtmp1, XMMRegister vtmp2) {
1923 switch (vlen) {
1924 case 8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1925 case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1926 case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1927 case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1928
1929 default: assert(false, "wrong vector length");
1930 }
1931 }
1932
1933 void C2_MacroAssembler::reduceS(int opcode, int vlen,
1934 Register dst, Register src1, XMMRegister src2,
1935 XMMRegister vtmp1, XMMRegister vtmp2) {
1936 switch (vlen) {
1937 case 4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1938 case 8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1939 case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1940 case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1941
1942 default: assert(false, "wrong vector length");
1943 }
1944 }
1945
1946 void C2_MacroAssembler::reduceI(int opcode, int vlen,
1947 Register dst, Register src1, XMMRegister src2,
1948 XMMRegister vtmp1, XMMRegister vtmp2) {
1949 switch (vlen) {
1950 case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1951 case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1952 case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1953 case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1954
1955 default: assert(false, "wrong vector length");
1956 }
1957 }
1958
1959 void C2_MacroAssembler::reduceL(int opcode, int vlen,
1960 Register dst, Register src1, XMMRegister src2,
1961 XMMRegister vtmp1, XMMRegister vtmp2) {
1962 switch (vlen) {
1963 case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1964 case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1965 case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1966
1967 default: assert(false, "wrong vector length");
1968 }
1969 }
1970
1971 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1972 switch (vlen) {
1973 case 2:
1974 assert(vtmp2 == xnoreg, "");
1975 reduce2F(opcode, dst, src, vtmp1);
1976 break;
1977 case 4:
1978 assert(vtmp2 == xnoreg, "");
1979 reduce4F(opcode, dst, src, vtmp1);
1980 break;
1981 case 8:
1982 reduce8F(opcode, dst, src, vtmp1, vtmp2);
1983 break;
1984 case 16:
1985 reduce16F(opcode, dst, src, vtmp1, vtmp2);
1986 break;
1987 default: assert(false, "wrong vector length");
1988 }
1989 }
1990
1991 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1992 switch (vlen) {
1993 case 2:
1994 assert(vtmp2 == xnoreg, "");
1995 reduce2D(opcode, dst, src, vtmp1);
1996 break;
1997 case 4:
1998 reduce4D(opcode, dst, src, vtmp1, vtmp2);
1999 break;
2000 case 8:
2001 reduce8D(opcode, dst, src, vtmp1, vtmp2);
2002 break;
2003 default: assert(false, "wrong vector length");
2004 }
2005 }
2006
2007 void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2008 switch (vlen) {
2009 case 2:
2010 assert(vtmp1 == xnoreg, "");
2011 assert(vtmp2 == xnoreg, "");
2012 unorderedReduce2F(opcode, dst, src);
2013 break;
2014 case 4:
2015 assert(vtmp2 == xnoreg, "");
2016 unorderedReduce4F(opcode, dst, src, vtmp1);
2017 break;
2018 case 8:
2019 unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2);
2020 break;
2021 case 16:
2022 unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2);
2023 break;
2024 default: assert(false, "wrong vector length");
2025 }
2026 }
2027
2028 void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2029 switch (vlen) {
2030 case 2:
2031 assert(vtmp1 == xnoreg, "");
2032 assert(vtmp2 == xnoreg, "");
2033 unorderedReduce2D(opcode, dst, src);
2034 break;
2035 case 4:
2036 assert(vtmp2 == xnoreg, "");
2037 unorderedReduce4D(opcode, dst, src, vtmp1);
2038 break;
2039 case 8:
2040 unorderedReduce8D(opcode, dst, src, vtmp1, vtmp2);
2041 break;
2042 default: assert(false, "wrong vector length");
2043 }
2044 }
2045
// Reduce a 2-int vector in src2 combined with the scalar in src1:
// dst = reduce(src1, src2[0], src2[1]).
void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    // Horizontal add folds elements 0 and 1 into element 0.
    phaddd(vtmp1, vtmp1);
  } else {
    // Bring element 1 down to lane 0 and combine with element 0.
    pshufd(vtmp1, src2, 0x1);
    reduce_operation_128(T_INT, opcode, vtmp1, src2);
  }
  // Fold in the scalar input and move the result to the GP register.
  movdl(vtmp2, src1);
  reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
  movdl(dst, vtmp1);
}
2060
// Reduce a 4-int vector in src2 combined with the scalar in src1 into dst.
void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    // phaddd folds the four elements into two partial sums; finish as 2 ints.
    phaddd(vtmp1, src2);
    reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    // Fold the high pair (imm 0xE moves elements 2,3 down) onto the low pair.
    pshufd(vtmp2, src2, 0xE);
    reduce_operation_128(T_INT, opcode, vtmp2, src2);
    reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
  }
}
2074
// Reduce an 8-int (256-bit) vector in src2 combined with the scalar src1.
void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    // vphaddd works within each 128-bit lane; add the two lanes afterwards.
    vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
    vextracti128_high(vtmp2, vtmp1);
    vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
    reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    // Fold the high 128-bit lane into the low one, then reduce 4 ints.
    vextracti128_high(vtmp1, src2);
    reduce_operation_128(T_INT, opcode, vtmp1, src2);
    reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  }
}
2087
// Reduce a 16-int (512-bit) vector: fold the upper 256 bits onto the lower
// half, then continue as an 8-int reduction.
void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti64x4_high(vtmp2, src2);
  reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
  reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2093
// Reduce an 8-byte vector in src2 combined with the scalar src1 into dst.
// Halves the live data three times (4, 2, 1 byte(s)), widens the surviving
// byte to int to fold in the scalar, and sign-extends the final byte result.
void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  // Fold bytes 4..7 onto bytes 0..3 (imm 0x1 moves dword 1 down).
  pshufd(vtmp2, src2, 0x1);
  reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
  // Fold the remaining 4 bytes down to 2, then 2 down to 1, via byte shifts.
  movdqu(vtmp1, vtmp2);
  psrldq(vtmp1, 2);
  reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
  movdqu(vtmp2, vtmp1);
  psrldq(vtmp2, 1);
  reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
  movdl(vtmp2, src1);
  // Widen to int before folding in the scalar: unsigned min/max must
  // zero-extend, all other ops sign-extend.
  if (opcode == Op_UMinReductionV || opcode == Op_UMaxReductionV) {
    pmovzxbd(vtmp1, vtmp1);
  } else {
    pmovsxbd(vtmp1, vtmp1);
  }
  reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
  // The result is a byte value; sign-extend it into the 32-bit register.
  pextrb(dst, vtmp1, 0x0);
  movsbl(dst, dst);
}
2113
// Reduce a 16-byte vector: fold the high qword (imm 0xE) onto the low qword,
// then finish as an 8-byte reduction.
void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  pshufd(vtmp1, src2, 0xE);
  reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
  reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}
2119
// Reduce a 32-byte (256-bit) vector: fold the high 128-bit lane onto the low
// one, then finish as a 16-byte reduction.
void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti128_high(vtmp2, src2);
  reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
  reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2125
// Reduce a 64-byte (512-bit) vector: fold the upper 256 bits onto the lower
// half, then finish as a 32-byte reduction.
void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti64x4_high(vtmp1, src2);
  reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
  reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}
2131
// Multiply-reduce 8 bytes: there is no packed byte multiply, so sign-extend
// the bytes to shorts and reduce them as 8 shorts.
void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  pmovsxbw(vtmp2, src2);
  reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2136
2137 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2138 if (UseAVX > 1) {
2139 int vector_len = Assembler::AVX_256bit;
2140 vpmovsxbw(vtmp1, src2, vector_len);
2141 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2142 } else {
2143 pmovsxbw(vtmp2, src2);
2144 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2145 pshufd(vtmp2, src2, 0x1);
2146 pmovsxbw(vtmp2, src2);
2147 reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2148 }
2149 }
2150
// Multiply-reduce 32 bytes. With AVX512BW all 32 bytes widen to shorts in one
// instruction; otherwise the vector is split into two 16-byte halves, the
// second reduced on top of the first result (dst is fed back as the scalar).
void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
    int vector_len = Assembler::AVX_512bit;
    vpmovsxbw(vtmp1, src2, vector_len);
    reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    assert(UseAVX >= 2,"Should not reach here.");
    mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
    vextracti128_high(vtmp2, src2);
    mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
  }
}
2163
// Multiply-reduce 64 bytes: reduce the low 256 bits, then the high 256 bits
// on top of that partial result (dst is fed back as the scalar input).
void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
  vextracti64x4_high(vtmp2, src2);
  mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
}
2169
// Reduce a 4-short vector in src2 combined with the scalar src1 into dst.
void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    // Two horizontal adds fold 4 shorts -> 2 -> 1.
    phaddw(vtmp1, vtmp1);
    phaddw(vtmp1, vtmp1);
  } else {
    // Fold the high pair of shorts onto the low pair, then the remaining two.
    pshufd(vtmp2, src2, 0x1);
    reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
    movdqu(vtmp1, vtmp2);
    psrldq(vtmp1, 2);
    reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
  }
  movdl(vtmp2, src1);
  // Widen to int before folding in the scalar: unsigned min/max must
  // zero-extend, all other ops sign-extend.
  if (opcode == Op_UMinReductionV || opcode == Op_UMaxReductionV) {
    pmovzxwd(vtmp1, vtmp1);
  } else {
    pmovsxwd(vtmp1, vtmp1);
  }
  reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
  // The result is a short value; sign-extend it into the 32-bit register.
  pextrw(dst, vtmp1, 0x0);
  movswl(dst, dst);
}
2194
// Reduce an 8-short vector: one halving step (horizontal add, or fold of the
// high qword via imm 0xE) down to 4 shorts, then continue in reduce4S.
void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    phaddw(vtmp1, src2);
  } else {
    pshufd(vtmp1, src2, 0xE);
    reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
  }
  reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}
2207
// Reduce a 16-short (256-bit) vector down to 8 shorts, then continue.
void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    int vector_len = Assembler::AVX_256bit;
    // vphaddw adds within each 128-bit lane; the vpermq with 0xD8 gathers the
    // two lanes' partial sums into the low 128 bits.
    vphaddw(vtmp2, src2, src2, vector_len);
    vpermq(vtmp2, vtmp2, 0xD8, vector_len);
  } else {
    // Fold the high 128-bit lane onto the low one.
    vextracti128_high(vtmp2, src2);
    reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
  }
  reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2219
2220 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2221 int vector_len = Assembler::AVX_256bit;
2222 vextracti64x4_high(vtmp1, src2);
2223 reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2224 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2225 }
2226
// Reduce a 2-long vector in src2 combined with the scalar src1 into dst.
void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  // Bring the high qword down (imm 0xE) and combine with the low qword.
  pshufd(vtmp2, src2, 0xE);
  reduce_operation_128(T_LONG, opcode, vtmp2, src2);
  // Fold in the scalar input and move the result to the GP register.
  movdq(vtmp1, src1);
  reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
  movdq(dst, vtmp1);
}
2234
// Reduce a 4-long (256-bit) vector: fold the high 128-bit lane onto the low
// one, then finish as a 2-long reduction.
void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti128_high(vtmp1, src2);
  reduce_operation_128(T_LONG, opcode, vtmp1, src2);
  reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}
2240
// Reduce an 8-long (512-bit) vector: fold the upper 256 bits onto the lower
// half, then finish as a 4-long reduction.
void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti64x4_high(vtmp2, src2);
  reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
  reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2246
// Materialize an opmask register with the low `len` bits set (clobbers temp).
void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
  mov64(temp, -1L);
  bzhiq(temp, temp, len);   // zero all bits at positions >= len
  kmovql(dst, temp);
}
2252
// Ordered reduction of 2 floats: dst = (dst op src[0]) op src[1], using the
// scalar ops in reduce_operation_128 so the combining order is strict.
void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  reduce_operation_128(T_FLOAT, opcode, dst, src);
  pshufd(vtmp, src, 0x1);
  reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
}
2258
// Ordered reduction of 4 floats: elements 0 and 1 via reduce2F, then fold in
// elements 2 and 3 one at a time (strict left-to-right order).
void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  reduce2F(opcode, dst, src, vtmp);
  pshufd(vtmp, src, 0x2);
  reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
  pshufd(vtmp, src, 0x3);
  reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
}
2266
// Ordered reduction of 8 floats: the low 4 elements first, then the high
// 128-bit lane, keeping strict element order.
void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce4F(opcode, dst, src, vtmp2);
  vextractf128_high(vtmp2, src);
  reduce4F(opcode, dst, vtmp2, vtmp1);
}
2272
// Ordered reduction of 16 floats: the low 8 elements first, then the upper
// 256 bits, keeping strict element order.
void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce8F(opcode, dst, src, vtmp1, vtmp2);
  vextracti64x4_high(vtmp1, src);
  reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
}
2278
// Unordered reduction of 2 floats: dst = src[1] op src[0]. Unlike the ordered
// reduce2F, dst is a pure output here (no accumulator input).
void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) {
  pshufd(dst, src, 0x1);
  reduce_operation_128(T_FLOAT, opcode, dst, src);
}
2283
// Unordered reduction of 4 floats: fold the high pair onto the low pair
// element-wise, then combine the remaining two elements.
void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  pshufd(vtmp, src, 0xE);
  unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src);
  unorderedReduce2F(opcode, dst, vtmp);
}
2289
// Unordered reduction of 8 floats: fold the high 128-bit lane onto the low
// one element-wise, then reduce the remaining 4 elements.
void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextractf128_high(vtmp1, src);
  unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src);
  unorderedReduce4F(opcode, dst, vtmp1, vtmp2);
}
2295
// Unordered reduction of 16 floats: fold the upper 256 bits onto the lower
// half element-wise, then reduce the remaining 8 elements.
void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextractf64x4_high(vtmp2, src);
  unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src);
  unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2);
}
2301
// Ordered reduction of 2 doubles: dst = (dst op src[0]) op src[1], using the
// scalar ops in reduce_operation_128 so the combining order is strict.
void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  reduce_operation_128(T_DOUBLE, opcode, dst, src);
  pshufd(vtmp, src, 0xE);
  reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
}
2307
// Ordered reduction of 4 doubles: the low 2 elements first, then the high
// 128-bit lane, keeping strict element order.
void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce2D(opcode, dst, src, vtmp2);
  vextractf128_high(vtmp2, src);
  reduce2D(opcode, dst, vtmp2, vtmp1);
}
2313
// Ordered reduction of 8 doubles: the low 4 elements first, then the upper
// 256 bits, keeping strict element order.
void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce4D(opcode, dst, src, vtmp1, vtmp2);
  vextracti64x4_high(vtmp1, src);
  reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
}
2319
// Unordered reduction of 2 doubles: dst = src[1] op src[0]. dst is a pure
// output (the T_DOUBLE ops in reduce_operation_128 are scalar addsd/mulsd).
void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) {
  pshufd(dst, src, 0xE);
  reduce_operation_128(T_DOUBLE, opcode, dst, src);
}
2324
// Unordered reduction of 4 doubles: fold the high 128-bit lane onto the low
// one element-wise, then combine the remaining two elements.
void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  vextractf128_high(vtmp, src);
  unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src);
  unorderedReduce2D(opcode, dst, vtmp);
}
2330
// Unordered reduction of 8 doubles: fold the upper 256 bits onto the lower
// half element-wise, then reduce the remaining 4 elements.
void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextractf64x4_high(vtmp2, src);
  unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src);
  unorderedReduce4D(opcode, dst, vtmp2, vtmp1);
}
2336
// Masked vector load (memory -> register); forwards to the shared
// MacroAssembler implementation.
void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
  MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
}
2340
// Masked vector store (register -> memory); forwards to the shared
// MacroAssembler implementation.
void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
  MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
}
2344
// Masked register-to-register vector move; forwards to the shared
// MacroAssembler implementation.
void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len) {
  MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
}
2348
2349 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
2350 int vec_enc) {
2351 switch(elem_bt) {
2352 case T_INT:
2353 case T_FLOAT:
2354 vmaskmovps(dst, src, mask, vec_enc);
2355 break;
2356 case T_LONG:
2357 case T_DOUBLE:
2358 vmaskmovpd(dst, src, mask, vec_enc);
2359 break;
2360 default:
2361 fatal("Unsupported type %s", type2name(elem_bt));
2362 break;
2363 }
2364 }
2365
2366 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
2367 int vec_enc) {
2368 switch(elem_bt) {
2369 case T_INT:
2370 case T_FLOAT:
2371 vmaskmovps(dst, src, mask, vec_enc);
2372 break;
2373 case T_LONG:
2374 case T_DOUBLE:
2375 vmaskmovpd(dst, src, mask, vec_enc);
2376 break;
2377 default:
2378 fatal("Unsupported type %s", type2name(elem_bt));
2379 break;
2380 }
2381 }
2382
// Min/max reduction over a vector of 2/4/8/16 floats by repeated halving:
// each iteration folds the upper half of the working source into the lower
// half (lane extraction for the 512/256-bit steps, in-lane permutes for the
// last two steps). When is_dst_valid, dst carries an extra input that is
// folded into the result at the end.
void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
                                          XMMRegister dst, XMMRegister src,
                                          XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                          XMMRegister xmm_0, XMMRegister xmm_1) {
  // In-lane permute immediates for the final two halvings: imm 1 swaps
  // adjacent elements (i == 0), imm 14 (0b1110) moves the high pair down.
  const int permconst[] = {1, 14};
  XMMRegister wsrc = src;
  XMMRegister wdst = xmm_0;
  // With only one scratch register, reuse it for both working dst and tmp.
  XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;

  int vlen_enc = Assembler::AVX_128bit;
  if (vlen == 16) {
    vlen_enc = Assembler::AVX_256bit;
  }

  // log2(vlen) halving steps, from widest to narrowest.
  for (int i = log2(vlen) - 1; i >=0; i--) {
    // On the last step, write straight to dst if it has no live input.
    if (i == 0 && !is_dst_valid) {
      wdst = dst;
    }
    if (i == 3) {
      vextracti64x4_high(wtmp, wsrc);
    } else if (i == 2) {
      vextracti128_high(wtmp, wsrc);
    } else { // i = [0,1]
      vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
    }

    // AVX10.2 has a direct min/max form; otherwise use the emulation that
    // needs the extra tmp/atmp/btmp scratch registers.
    if (VM_Version::supports_avx10_2()) {
      vminmax_fp(opcode, T_FLOAT, wdst, k0, wtmp, wsrc, vlen_enc);
    } else {
      vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
    }
    wsrc = wdst;
    vlen_enc = Assembler::AVX_128bit;
  }
  // Fold the extra input carried in dst into the reduced value.
  if (is_dst_valid) {
    if (VM_Version::supports_avx10_2()) {
      vminmax_fp(opcode, T_FLOAT, dst, k0, wdst, dst, Assembler::AVX_128bit);
    } else {
      vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
    }
  }
}
2425
// Min/max reduction over a vector of 2/4/8 doubles by repeated halving, same
// scheme as reduceFloatMinMax: lane extraction for the 512/256-bit steps and
// an in-lane permute for the last step. When is_dst_valid, dst carries an
// extra input that is folded into the result at the end.
void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                           XMMRegister xmm_0, XMMRegister xmm_1) {
  XMMRegister wsrc = src;
  XMMRegister wdst = xmm_0;
  // With only one scratch register, reuse it for both working dst and tmp.
  XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
  int vlen_enc = Assembler::AVX_128bit;
  if (vlen == 8) {
    vlen_enc = Assembler::AVX_256bit;
  }
  // log2(vlen) halving steps, from widest to narrowest.
  for (int i = log2(vlen) - 1; i >=0; i--) {
    // On the last step, write straight to dst if it has no live input.
    if (i == 0 && !is_dst_valid) {
      wdst = dst;
    }
    if (i == 1) {
      vextracti128_high(wtmp, wsrc);
    } else if (i == 2) {
      vextracti64x4_high(wtmp, wsrc);
    } else {
      assert(i == 0, "%d", i);
      // Swap the two remaining doubles within the lane.
      vpermilpd(wtmp, wsrc, 1, vlen_enc);
    }

    // AVX10.2 has a direct min/max form; otherwise use the emulation that
    // needs the extra tmp/atmp/btmp scratch registers.
    if (VM_Version::supports_avx10_2()) {
      vminmax_fp(opcode, T_DOUBLE, wdst, k0, wtmp, wsrc, vlen_enc);
    } else {
      vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
    }

    wsrc = wdst;
    vlen_enc = Assembler::AVX_128bit;
  }

  // Fold the extra input carried in dst into the reduced value.
  if (is_dst_valid) {
    if (VM_Version::supports_avx10_2()) {
      vminmax_fp(opcode, T_DOUBLE, dst, k0, wdst, dst, Assembler::AVX_128bit);
    } else {
      vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
    }
  }
}
2467
2468 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2469 switch (bt) {
2470 case T_BYTE: pextrb(dst, src, idx); break;
2471 case T_SHORT: pextrw(dst, src, idx); break;
2472 case T_INT: pextrd(dst, src, idx); break;
2473 case T_LONG: pextrq(dst, src, idx); break;
2474
2475 default:
2476 assert(false,"Should not reach here.");
2477 break;
2478 }
2479 }
2480
// Return a register holding the 128-bit lane of src that contains element
// `elemindex`: src itself for lane 0, otherwise the lane is extracted into
// dst. Callers index within the returned lane separately (see get_elem).
XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
  int esize = type2aelembytes(typ);
  int elem_per_lane = 16/esize;
  int lane = elemindex / elem_per_lane;
  // In-lane index; not needed for lane selection itself.
  int eindex = elemindex % elem_per_lane;

  if (lane >= 2) {
    // Lanes 2/3 only exist in 512-bit vectors (AVX-512).
    assert(UseAVX > 2, "required");
    vextractf32x4(dst, src, lane & 3);
    return dst;
  } else if (lane > 0) {
    assert(UseAVX > 0, "required");
    vextractf128(dst, src, lane);
    return dst;
  } else {
    // Element lives in the low lane: no extraction necessary.
    return src;
  }
}
2499
2500 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) {
2501 if (typ == T_BYTE) {
2502 movsbl(dst, dst);
2503 } else if (typ == T_SHORT) {
2504 movswl(dst, dst);
2505 }
2506 }
2507
// Extract integral element `elemindex` of src into GP register dst, sign-
// extending sub-int types. Only the in-lane index is used here; lane
// selection is done via get_lane (see above).
void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
  int esize = type2aelembytes(typ);
  int elem_per_lane = 16/esize;
  int eindex = elemindex % elem_per_lane;
  assert(is_integral_type(typ),"required");

  if (eindex == 0) {
    // Element 0 can be moved out directly, avoiding pextr*.
    if (typ == T_LONG) {
      movq(dst, src);
    } else {
      movdl(dst, src);
      movsxl(typ, dst);
    }
  } else {
    extract(typ, dst, src, eindex);
    movsxl(typ, dst);
  }
}
2526
// Extract FP element `elemindex` of src into XMM register dst, leaving the
// value in lane 0 with the upper bits zeroed. Only the in-lane index is used
// here; lane selection is done via get_lane. vtmp is required only in the
// no-AVX T_FLOAT path.
void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
  int esize = type2aelembytes(typ);
  int elem_per_lane = 16/esize;
  int eindex = elemindex % elem_per_lane;
  assert((typ == T_FLOAT || typ == T_DOUBLE),"required");

  if (eindex == 0) {
    // movq copies the low 64 bits and zeroes the rest.
    movq(dst, src);
  } else {
    if (typ == T_FLOAT) {
      // Shuffle the wanted float into position 0.
      if (UseAVX == 0) {
        movdqu(dst, src);
        shufps(dst, dst, eindex);
      } else {
        vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
      }
    } else {
      // Shift the wanted double down to the low qword.
      if (UseAVX == 0) {
        movdqu(dst, src);
        psrldq(dst, eindex*esize);
      } else {
        vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
      }
      // Clear everything above the low 64 bits.
      movq(dst, dst);
    }
  }
  // Zero upper bits
  if (typ == T_FLOAT) {
    // The shuffles above leave garbage above bit 31; mask it off.
    if (UseAVX == 0) {
      assert(vtmp != xnoreg, "required.");
      movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
      pand(dst, vtmp);
    } else {
      vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
    }
  }
}
2564
2565 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2566 switch(typ) {
2567 case T_BYTE:
2568 case T_BOOLEAN:
2569 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2570 break;
2571 case T_SHORT:
2572 case T_CHAR:
2573 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2574 break;
2575 case T_INT:
2576 case T_FLOAT:
2577 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2578 break;
2579 case T_LONG:
2580 case T_DOUBLE:
2581 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2582 break;
2583 default:
2584 assert(false,"Should not reach here.");
2585 break;
2586 }
2587 }
2588
// AVX-512 masked compare of src1 against a memory operand under ksmask,
// writing the result mask to kdmask. rscratch is needed when src2 may not be
// reachable with RIP-relative addressing (see assert). Element width follows
// the basic type; all compares are signed.
void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src2), "missing");

  switch(typ) {
    case T_BOOLEAN:
    case T_BYTE:
      evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
      break;
    case T_CHAR:
    case T_SHORT:
      evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
      break;
    case T_INT:
    case T_FLOAT:
      evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
      break;
    case T_LONG:
    case T_DOUBLE:
      evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}
2614
2615 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2616 switch(typ) {
2617 case T_BYTE:
2618 evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2619 break;
2620 case T_SHORT:
2621 evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2622 break;
2623 case T_INT:
2624 case T_FLOAT:
2625 evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2626 break;
2627 case T_LONG:
2628 case T_DOUBLE:
2629 evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2630 break;
2631 default:
2632 assert(false,"Should not reach here.");
2633 break;
2634 }
2635 }
2636
void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) {
  // Emit a vector test (ptest/vptest/vtestps) of src1 against src2, setting
  // ZF/CF for a subsequent conditional branch. For vectors shorter than
  // 16 bytes the valid low lanes of src1 are duplicated into vtmp so the
  // full 128-bit test only ever sees valid data.
  assert(vlen_in_bytes <= 32, "");
  int esize = type2aelembytes(bt);
  if (vlen_in_bytes == 32) {
    // 256-bit path; src1 is tested in place, no temp needed.
    assert(vtmp == xnoreg, "required.");
    if (esize >= 4) {
      vtestps(src1, src2, AVX_256bit);
    } else {
      vptest(src1, src2, AVX_256bit);
    }
    return;
  }
  if (vlen_in_bytes < 16) {
    // Duplicate the lower part to fill the whole register,
    // Don't need to do so for src2
    assert(vtmp != xnoreg, "required");
    // 0x00 broadcasts dword 0 (4-byte vector); 0x04 yields {d0,d1,d0,d0},
    // filling all four dwords from the valid low 8 bytes.
    int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04;
    pshufd(vtmp, src1, shuffle_imm);
  } else {
    // Exactly 128 bits valid: test src1 directly.
    assert(vtmp == xnoreg, "required");
    vtmp = src1;
  }
  if (esize >= 4 && VM_Version::supports_avx()) {
    vtestps(vtmp, src2, AVX_128bit);
  } else {
    ptest(vtmp, src2);
  }
}
2665
2666 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
2667 #ifdef ASSERT
2668 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2669 bool is_bw_supported = VM_Version::supports_avx512bw();
2670 if (is_bw && !is_bw_supported) {
2671 assert(vlen_enc != Assembler::AVX_512bit, "required");
2672 assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
2673 "XMM register should be 0-15");
2674 }
2675 #endif // ASSERT
2676 switch (elem_bt) {
2677 case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
2678 case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
2679 case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
2680 case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
2681 case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
2682 case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
2683 default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2684 }
2685 }
2686
// Broadcast the scalar held in general-purpose register 'src' into every
// lane of vector register 'dst', lane width given by elem_bt.
void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
  assert(UseAVX >= 2, "required");
  bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
  bool is_vl = vlen_enc != Assembler::AVX_512bit;
  if ((UseAVX > 2) &&
      (!is_bw || VM_Version::supports_avx512bw()) &&
      (!is_vl || VM_Version::supports_avx512vl())) {
    // AVX-512 GPR-source broadcast: single instruction, no GPR->XMM staging.
    // Requires AVX512BW for sub-word lanes and AVX512VL for 128/256-bit
    // vector lengths.
    switch (elem_bt) {
      case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
      case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
      case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
      case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
      default: fatal("Unsupported type %s", type2name(elem_bt)); return;
    }
  } else {
    // AVX2 fallback: stage the scalar into the XMM register, then broadcast
    // from there. Only non-EVEX registers (xmm0-xmm15) are encodable here.
    assert(vlen_enc != Assembler::AVX_512bit, "required");
    assert((dst->encoding() < 16),"XMM register should be 0-15");
    switch (elem_bt) {
      case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
      case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
      case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
      case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
      case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
      case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
      default: fatal("Unsupported type %s", type2name(elem_bt)); return;
    }
  }
}
2715
2716 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
2717 switch (to_elem_bt) {
2718 case T_SHORT:
2719 vpmovsxbw(dst, src, vlen_enc);
2720 break;
2721 case T_INT:
2722 vpmovsxbd(dst, src, vlen_enc);
2723 break;
2724 case T_FLOAT:
2725 vpmovsxbd(dst, src, vlen_enc);
2726 vcvtdq2ps(dst, dst, vlen_enc);
2727 break;
2728 case T_LONG:
2729 vpmovsxbq(dst, src, vlen_enc);
2730 break;
2731 case T_DOUBLE: {
2732 int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
2733 vpmovsxbd(dst, src, mid_vlen_enc);
2734 vcvtdq2pd(dst, dst, vlen_enc);
2735 break;
2736 }
2737 default:
2738 fatal("Unsupported type %s", type2name(to_elem_bt));
2739 break;
2740 }
2741 }
2742
2743 //-------------------------------------------------------------------------------------------
2744
// IndexOf for constant substrings with size >= 8 chars
// which don't need to be loaded through stack.
//
// Inputs: string (str1, cnt1 elements), constant substring (str2, length
// int_cnt2), encoding 'ae' (LL / UU / UL). Output: 'result' holds the
// element index of the first occurrence, or -1 if none. 'vec' caches the
// substring; 'tmp' (must be rcx) receives pcmpestri's match index.
void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
                                         Register cnt1, Register cnt2,
                                         int int_cnt2, Register result,
                                         XMMRegister vec, Register tmp,
                                         int ae) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");

  // This method uses the pcmpestri instruction with bound registers
  // inputs:
  // xmm - substring
  // rax - substring length (elements count)
  // mem - scanned string
  // rdx - string length (elements count)
  // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
  // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
  // outputs:
  // rcx - matched index in string
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
  int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
  Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
  Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;

  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
        RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
        MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;
  assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");

  // Load substring.
  if (ae == StrIntrinsicNode::UL) {
    pmovzxbw(vec, Address(str2, 0)); // latin1 substring zero-extended to UTF-16
  } else {
    movdqu(vec, Address(str2, 0));
  }
  movl(cnt2, int_cnt2);
  movptr(result, str1); // string addr

  if (int_cnt2 > stride) {
    jmpb(SCAN_TO_SUBSTR);

    // Reload substr for rescan, this code
    // is executed only for large substrings (> 8 chars)
    bind(RELOAD_SUBSTR);
    if (ae == StrIntrinsicNode::UL) {
      pmovzxbw(vec, Address(str2, 0));
    } else {
      movdqu(vec, Address(str2, 0));
    }
    negptr(cnt2); // Jumped here with negative cnt2, convert to positive

    bind(RELOAD_STR);
    // We came here after the beginning of the substring was
    // matched but the rest of it was not so we need to search
    // again. Start from the next element after the previous match.

    // cnt2 is number of substring remaining elements and
    // cnt1 is number of string remaining elements when cmp failed.
    // Restored cnt1 = cnt1 - cnt2 + int_cnt2
    subl(cnt1, cnt2);
    addl(cnt1, int_cnt2);
    movl(cnt2, int_cnt2); // Now restore cnt2

    decrementl(cnt1);     // Shift to next element
    cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND); // Left less than substring

    addptr(result, (1<<scale1));

  } // (int_cnt2 > 8)

  // Scan string for start of substr in 16-byte vectors
  bind(SCAN_TO_SUBSTR);
  pcmpestri(vec, Address(result, 0), mode);
  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
  subl(cnt1, stride);
  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
  cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
  addptr(result, 16);
  jmpb(SCAN_TO_SUBSTR);

  // Found a potential substr
  bind(FOUND_CANDIDATE);
  // Matched whole vector if first element matched (tmp(rcx) == 0).
  if (int_cnt2 == stride) {
    jccb(Assembler::overflow, RET_FOUND);    // OF == 1
  } else { // int_cnt2 > 8
    jccb(Assembler::overflow, FOUND_SUBSTR);
  }
  // After pcmpestri tmp(rcx) contains matched element index
  // Compute start addr of substr
  lea(result, Address(result, tmp, scale1));

  // Make sure string is still long enough
  subl(cnt1, tmp);
  cmpl(cnt1, cnt2);
  if (int_cnt2 == stride) {
    jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
  } else { // int_cnt2 > 8
    jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
  }
  // Left less than substring.

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmp(EXIT);

  if (int_cnt2 > stride) {
    // This code is optimized for the case when whole substring
    // is matched if its head is matched.
    bind(MATCH_SUBSTR_HEAD);
    pcmpestri(vec, Address(result, 0), mode);
    // Reload only string if does not match
    jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0

    Label CONT_SCAN_SUBSTR;
    // Compare the rest of substring (> 8 chars).
    bind(FOUND_SUBSTR);
    // First 8 chars are already matched.
    negptr(cnt2);
    addptr(cnt2, stride);

    bind(SCAN_SUBSTR);
    subl(cnt1, stride);
    cmpl(cnt2, -stride); // Do not read beyond substring
    jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
    // Back-up strings to avoid reading beyond substring:
    // cnt1 = cnt1 - cnt2 + 8
    addl(cnt1, cnt2); // cnt2 is negative
    addl(cnt1, stride);
    movl(cnt2, stride); negptr(cnt2);
    bind(CONT_SCAN_SUBSTR);
    if (int_cnt2 < (int)G) {
      // Substring length fits in a 32-bit displacement.
      int tail_off1 = int_cnt2<<scale1;
      int tail_off2 = int_cnt2<<scale2;
      if (ae == StrIntrinsicNode::UL) {
        pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
      } else {
        movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
      }
      pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
    } else {
      // calculate index in register to avoid integer overflow (int_cnt2*2)
      movl(tmp, int_cnt2);
      addptr(tmp, cnt2);
      if (ae == StrIntrinsicNode::UL) {
        pmovzxbw(vec, Address(str2, tmp, scale2, 0));
      } else {
        movdqu(vec, Address(str2, tmp, scale2, 0));
      }
      pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
    }
    // Need to reload strings pointers if not matched whole vector
    jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
    addptr(cnt2, stride);
    jcc(Assembler::negative, SCAN_SUBSTR);
    // Fall through if found full substring

  } // (int_cnt2 > 8)

  bind(RET_FOUND);
  // Found result if we matched full small substring.
  // Compute substr offset
  subptr(result, str1);
  if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
    shrl(result, 1); // byte offset -> char index
  }
  bind(EXIT);

} // string_indexofC8
2923
// Small strings are loaded through stack if they cross page boundary.
//
// General indexOf: handles both non-constant substrings (int_cnt2 == -1,
// length in cnt2) and small constant substrings (0 < int_cnt2 < stride).
// Output: 'result' holds the element index of the first match, or -1.
// Strings shorter than one pcmpestri stride that sit too close to a page
// boundary are copied onto the stack so a full 16-byte load is safe.
void C2_MacroAssembler::string_indexof(Register str1, Register str2,
                                       Register cnt1, Register cnt2,
                                       int int_cnt2, Register result,
                                       XMMRegister vec, Register tmp,
                                       int ae) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");

  //
  // int_cnt2 is length of small (< 8 chars) constant substring
  // or (-1) for non constant substring in which case its length
  // is in cnt2 register.
  //
  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;
  //
  int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
  assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
  // This method uses the pcmpestri instruction with bound registers
  // inputs:
  // xmm - substring
  // rax - substring length (elements count)
  // mem - scanned string
  // rdx - string length (elements count)
  // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
  // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
  // outputs:
  // rcx - matched index in string
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
  Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
  Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;

  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
        RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
        FOUND_CANDIDATE;

  { //========================================================
    // We don't know where these strings are located
    // and we can't read beyond them. Load them through stack.
    Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;

    movptr(tmp, rsp); // save old SP

    if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
      if (int_cnt2 == (1>>scale2)) { // One byte
        assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
        load_unsigned_byte(result, Address(str2, 0));
        movdl(vec, result); // move 32 bits
      } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
        // Not enough header space in 32-bit VM: 12+3 = 15.
        // Read one byte before the string start and shift it out instead.
        movl(result, Address(str2, -1));
        shrl(result, 8);
        movdl(vec, result); // move 32 bits
      } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
        load_unsigned_short(result, Address(str2, 0));
        movdl(vec, result); // move 32 bits
      } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
        movdl(vec, Address(str2, 0)); // move 32 bits
      } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
        movq(vec, Address(str2, 0));  // move 64 bits
      } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7})
        // Array header size is 12 bytes in 32-bit VM
        // + 6 bytes for 3 chars == 18 bytes,
        // enough space to load vec and shift.
        assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
        if (ae == StrIntrinsicNode::UL) {
          int tail_off = int_cnt2-8;
          pmovzxbw(vec, Address(str2, tail_off));
          psrldq(vec, -2*tail_off);
        }
        else {
          // Load the 16 bytes ending at the substring tail, then shift the
          // substring down to the low lanes.
          int tail_off = int_cnt2*(1<<scale2);
          movdqu(vec, Address(str2, tail_off-16));
          psrldq(vec, 16-tail_off);
        }
      }
    } else { // not constant substring
      cmpl(cnt2, stride);
      jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough

      // We can read beyond string if srt+16 does not cross page boundary
      // since heaps are aligned and mapped by pages.
      assert(os::vm_page_size() < (int)G, "default page should be small");
      movl(result, str2); // We need only low 32 bits
      andl(result, ((int)os::vm_page_size()-1));
      cmpl(result, ((int)os::vm_page_size()-16));
      jccb(Assembler::belowEqual, CHECK_STR);

      // Move small strings to stack to allow load 16 bytes into vec.
      subptr(rsp, 16);
      int stk_offset = wordSize-(1<<scale2);
      push(cnt2);

      // Copy the substring backwards, element by element, onto the stack.
      bind(COPY_SUBSTR);
      if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
        load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
        movb(Address(rsp, cnt2, scale2, stk_offset), result);
      } else if (ae == StrIntrinsicNode::UU) {
        load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
        movw(Address(rsp, cnt2, scale2, stk_offset), result);
      }
      decrement(cnt2);
      jccb(Assembler::notZero, COPY_SUBSTR);

      pop(cnt2);
      movptr(str2, rsp);  // New substring address
    } // non constant

    bind(CHECK_STR);
    cmpl(cnt1, stride);
    jccb(Assembler::aboveEqual, BIG_STRINGS);

    // Check cross page boundary.
    movl(result, str1); // We need only low 32 bits
    andl(result, ((int)os::vm_page_size()-1));
    cmpl(result, ((int)os::vm_page_size()-16));
    jccb(Assembler::belowEqual, BIG_STRINGS);

    // String too close to a page boundary: copy it onto the stack too.
    subptr(rsp, 16);
    int stk_offset = -(1<<scale1);
    if (int_cnt2 < 0) { // not constant
      push(cnt2);
      stk_offset += wordSize;
    }
    movl(cnt2, cnt1);

    bind(COPY_STR);
    if (ae == StrIntrinsicNode::LL) {
      load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
      movb(Address(rsp, cnt2, scale1, stk_offset), result);
    } else {
      load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
      movw(Address(rsp, cnt2, scale1, stk_offset), result);
    }
    decrement(cnt2);
    jccb(Assembler::notZero, COPY_STR);

    if (int_cnt2 < 0) { // not constant
      pop(cnt2);
    }
    movptr(str1, rsp);  // New string address

    bind(BIG_STRINGS);
    // Load substring.
    if (int_cnt2 < 0) { // -1
      if (ae == StrIntrinsicNode::UL) {
        pmovzxbw(vec, Address(str2, 0));
      } else {
        movdqu(vec, Address(str2, 0));
      }
      push(cnt2); // substr count
      push(str2); // substr addr
      push(str1); // string addr
    } else {
      // Small (< 8 chars) constant substrings are loaded already.
      movl(cnt2, int_cnt2);
    }
    push(tmp);  // original SP

  } // Finished loading

  //========================================================
  // Start search
  //

  movptr(result, str1); // string addr

  if (int_cnt2  < 0) {  // Only for non constant substring
    jmpb(SCAN_TO_SUBSTR);

    // SP saved at sp+0
    // String saved at sp+1*wordSize
    // Substr saved at sp+2*wordSize
    // Substr count saved at sp+3*wordSize

    // Reload substr for rescan, this code
    // is executed only for large substrings (> 8 chars)
    bind(RELOAD_SUBSTR);
    movptr(str2, Address(rsp, 2*wordSize));
    movl(cnt2, Address(rsp, 3*wordSize));
    if (ae == StrIntrinsicNode::UL) {
      pmovzxbw(vec, Address(str2, 0));
    } else {
      movdqu(vec, Address(str2, 0));
    }
    // We came here after the beginning of the substring was
    // matched but the rest of it was not so we need to search
    // again. Start from the next element after the previous match.
    subptr(str1, result); // Restore counter
    if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
      shrl(str1, 1);      // byte delta -> element delta
    }
    addl(cnt1, str1);
    decrementl(cnt1);     // Shift to next element
    cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring

    addptr(result, (1<<scale1));
  } // non constant

  // Scan string for start of substr in 16-byte vectors
  bind(SCAN_TO_SUBSTR);
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  pcmpestri(vec, Address(result, 0), mode);
  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
  subl(cnt1, stride);
  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
  cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
  addptr(result, 16);

  bind(ADJUST_STR);
  cmpl(cnt1, stride); // Do not read beyond string
  jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
  // Back-up string to avoid reading beyond string.
  lea(result, Address(result, cnt1, scale1, -16));
  movl(cnt1, stride);
  jmpb(SCAN_TO_SUBSTR);

  // Found a potential substr
  bind(FOUND_CANDIDATE);
  // After pcmpestri tmp(rcx) contains matched element index

  // Make sure string is still long enough
  subl(cnt1, tmp);
  cmpl(cnt1, cnt2);
  jccb(Assembler::greaterEqual, FOUND_SUBSTR);
  // Left less than substring.

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmp(CLEANUP);

  bind(FOUND_SUBSTR);
  // Compute start addr of substr
  lea(result, Address(result, tmp, scale1));
  if (int_cnt2 > 0) { // Constant substring
    // Repeat search for small substring (< 8 chars)
    // from new point without reloading substring.
    // Have to check that we don't read beyond string.
    cmpl(tmp, stride-int_cnt2);
    jccb(Assembler::greater, ADJUST_STR);
    // Fall through if matched whole substring.
  } else { // non constant
    assert(int_cnt2 == -1, "should be != 0");

    addl(tmp, cnt2);
    // Found result if we matched whole substring.
    cmpl(tmp, stride);
    jcc(Assembler::lessEqual, RET_FOUND);

    // Repeat search for small substring (<= 8 chars)
    // from new point 'str1' without reloading substring.
    cmpl(cnt2, stride);
    // Have to check that we don't read beyond string.
    jccb(Assembler::lessEqual, ADJUST_STR);

    Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
    // Compare the rest of substring (> 8 chars).
    movptr(str1, result);

    cmpl(tmp, cnt2);
    // First 8 chars are already matched.
    jccb(Assembler::equal, CHECK_NEXT);

    bind(SCAN_SUBSTR);
    pcmpestri(vec, Address(str1, 0), mode);
    // Need to reload strings pointers if not matched whole vector
    jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0

    bind(CHECK_NEXT);
    subl(cnt2, stride);
    jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
    addptr(str1, 16);
    if (ae == StrIntrinsicNode::UL) {
      addptr(str2, 8);  // latin1 substring: 8 bytes per 8-char vector
    } else {
      addptr(str2, 16);
    }
    subl(cnt1, stride);
    cmpl(cnt2, stride); // Do not read beyond substring
    jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
    // Back-up strings to avoid reading beyond substring.

    if (ae == StrIntrinsicNode::UL) {
      lea(str2, Address(str2, cnt2, scale2, -8));
      lea(str1, Address(str1, cnt2, scale1, -16));
    } else {
      lea(str2, Address(str2, cnt2, scale2, -16));
      lea(str1, Address(str1, cnt2, scale1, -16));
    }
    subl(cnt1, cnt2);
    movl(cnt2, stride);
    addl(cnt1, stride);
    bind(CONT_SCAN_SUBSTR);
    if (ae == StrIntrinsicNode::UL) {
      pmovzxbw(vec, Address(str2, 0));
    } else {
      movdqu(vec, Address(str2, 0));
    }
    jmp(SCAN_SUBSTR);

    bind(RET_FOUND_LONG);
    movptr(str1, Address(rsp, wordSize)); // restore original string addr
  } // non constant

  bind(RET_FOUND);
  // Compute substr offset
  subptr(result, str1);
  if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
    shrl(result, 1); // byte offset -> char index
  }
  bind(CLEANUP);
  pop(rsp); // restore SP

} // string_indexof
3244
// Find the first occurrence of the UTF-16 char 'ch' in the char sequence
// (str1, cnt1 chars). On exit 'result' holds the char index of the match,
// or -1 if none. Scans 16 chars per iteration with AVX2, then 8 chars with
// SSE, then a scalar tail loop.
void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
                                            XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");

  int stride = 8;

  Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
        SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
        RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
        FOUND_SEQ_CHAR, DONE_LABEL;

  movptr(result, str1);
  if (UseAVX >= 2) {
    cmpl(cnt1, stride);
    jcc(Assembler::less, SCAN_TO_CHAR);
    cmpl(cnt1, 2*stride);
    jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
    // Broadcast 'ch' into all 16 word lanes of vec1; vec2 stays zero for
    // the vptest (carry set iff the compare mask is all-zero).
    movdl(vec1, ch);
    vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
    vpxor(vec2, vec2);
    movl(tmp, cnt1);
    andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
    andl(cnt1,0x0000000F);  //tail count (in chars)

    bind(SCAN_TO_16_CHAR_LOOP);
    vmovdqu(vec3, Address(result, 0));
    vpcmpeqw(vec3, vec3, vec1, 1);
    vptest(vec2, vec3);
    jcc(Assembler::carryClear, FOUND_CHAR);
    addptr(result, 32);
    subl(tmp, 2*stride);
    jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
    jmp(SCAN_TO_8_CHAR);
    bind(SCAN_TO_8_CHAR_INIT);
    // SSE setup: replicate 'ch' across all 8 word lanes of vec1.
    movdl(vec1, ch);
    pshuflw(vec1, vec1, 0x00);
    pshufd(vec1, vec1, 0);
    pxor(vec2, vec2);
  }
  bind(SCAN_TO_8_CHAR);
  cmpl(cnt1, stride);
  jcc(Assembler::less, SCAN_TO_CHAR);
  if (UseAVX < 2) {
    // vec1/vec2 were not initialized on the AVX2 path above.
    movdl(vec1, ch);
    pshuflw(vec1, vec1, 0x00);
    pshufd(vec1, vec1, 0);
    pxor(vec2, vec2);
  }
  movl(tmp, cnt1);
  andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
  andl(cnt1,0x00000007);  //tail count (in chars)

  bind(SCAN_TO_8_CHAR_LOOP);
  movdqu(vec3, Address(result, 0));
  pcmpeqw(vec3, vec1);
  ptest(vec2, vec3);
  jcc(Assembler::carryClear, FOUND_CHAR);
  addptr(result, 16);
  subl(tmp, stride);
  jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
  bind(SCAN_TO_CHAR);
  testl(cnt1, cnt1);
  jcc(Assembler::zero, RET_NOT_FOUND);
  // Scalar tail: compare one char at a time.
  bind(SCAN_TO_CHAR_LOOP);
  load_unsigned_short(tmp, Address(result, 0));
  cmpl(ch, tmp);
  jccb(Assembler::equal, FOUND_SEQ_CHAR);
  addptr(result, 2);
  subl(cnt1, 1);
  jccb(Assembler::zero, RET_NOT_FOUND);
  jmp(SCAN_TO_CHAR_LOOP);

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmpb(DONE_LABEL);

  bind(FOUND_CHAR);
  // Locate the matching lane: the byte mask's lowest set bit gives the
  // byte offset of the match within the just-scanned vector.
  if (UseAVX >= 2) {
    vpmovmskb(tmp, vec3);
  } else {
    pmovmskb(tmp, vec3);
  }
  bsfl(ch, tmp);
  addptr(result, ch);

  bind(FOUND_SEQ_CHAR);
  subptr(result, str1);
  shrl(result, 1); // byte offset -> char index

  bind(DONE_LABEL);
} // string_indexof_char
3337
// Latin1 variant of string_indexof_char: find the first occurrence of the
// byte 'ch' in the byte sequence (str1, cnt1 bytes). On exit 'result' holds
// the byte index of the match, or -1 if none. Scans 32 bytes per iteration
// with AVX2, then 16 bytes with SSE, then a scalar tail loop.
void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");

  int stride = 16;

  Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
        SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
        RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
        FOUND_SEQ_CHAR, DONE_LABEL;

  movptr(result, str1);
  if (UseAVX >= 2) {
    cmpl(cnt1, stride);
    jcc(Assembler::less, SCAN_TO_CHAR_INIT);
    cmpl(cnt1, stride*2);
    jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
    // Broadcast 'ch' into all 32 byte lanes of vec1; vec2 stays zero for
    // the vptest (carry set iff the compare mask is all-zero).
    movdl(vec1, ch);
    vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
    vpxor(vec2, vec2);
    movl(tmp, cnt1);
    andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
    andl(cnt1,0x0000001F);  //tail count (in chars)

    bind(SCAN_TO_32_CHAR_LOOP);
    vmovdqu(vec3, Address(result, 0));
    vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
    vptest(vec2, vec3);
    jcc(Assembler::carryClear, FOUND_CHAR);
    addptr(result, 32);
    subl(tmp, stride*2);
    jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
    jmp(SCAN_TO_16_CHAR);

    bind(SCAN_TO_16_CHAR_INIT);
    // SSE setup: pshufb with an all-zero mask replicates byte 0 of vec1.
    movdl(vec1, ch);
    pxor(vec2, vec2);
    pshufb(vec1, vec2);
  }

  bind(SCAN_TO_16_CHAR);
  cmpl(cnt1, stride);
  jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left
  if (UseAVX < 2) {
    // vec1/vec2 were not initialized on the AVX2 path above.
    movdl(vec1, ch);
    pxor(vec2, vec2);
    pshufb(vec1, vec2);
  }
  movl(tmp, cnt1);
  andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
  andl(cnt1,0x0000000F);  //tail count (in bytes)

  bind(SCAN_TO_16_CHAR_LOOP);
  movdqu(vec3, Address(result, 0));
  pcmpeqb(vec3, vec1);
  ptest(vec2, vec3);
  jcc(Assembler::carryClear, FOUND_CHAR);
  addptr(result, 16);
  subl(tmp, stride);
  jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items...

  bind(SCAN_TO_CHAR_INIT);
  testl(cnt1, cnt1);
  jcc(Assembler::zero, RET_NOT_FOUND);
  // Scalar tail: compare one byte at a time.
  bind(SCAN_TO_CHAR_LOOP);
  load_unsigned_byte(tmp, Address(result, 0));
  cmpl(ch, tmp);
  jccb(Assembler::equal, FOUND_SEQ_CHAR);
  addptr(result, 1);
  subl(cnt1, 1);
  jccb(Assembler::zero, RET_NOT_FOUND);
  jmp(SCAN_TO_CHAR_LOOP);

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmpb(DONE_LABEL);

  bind(FOUND_CHAR);
  // Locate the matching lane: the byte mask's lowest set bit gives the
  // byte offset of the match within the just-scanned vector.
  if (UseAVX >= 2) {
    vpmovmskb(tmp, vec3);
  } else {
    pmovmskb(tmp, vec3);
  }
  bsfl(ch, tmp);
  addptr(result, ch);

  bind(FOUND_SEQ_CHAR);
  subptr(result, str1);

  bind(DONE_LABEL);
} // stringL_indexof_char
3430
3431 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
3432 switch (eltype) {
3433 case T_BOOLEAN: return sizeof(jboolean);
3434 case T_BYTE: return sizeof(jbyte);
3435 case T_SHORT: return sizeof(jshort);
3436 case T_CHAR: return sizeof(jchar);
3437 case T_INT: return sizeof(jint);
3438 default:
3439 ShouldNotReachHere();
3440 return -1;
3441 }
3442 }
3443
3444 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
3445 switch (eltype) {
3446 // T_BOOLEAN used as surrogate for unsigned byte
3447 case T_BOOLEAN: movzbl(dst, src); break;
3448 case T_BYTE: movsbl(dst, src); break;
3449 case T_SHORT: movswl(dst, src); break;
3450 case T_CHAR: movzwl(dst, src); break;
3451 case T_INT: movl(dst, src); break;
3452 default:
3453 ShouldNotReachHere();
3454 }
3455 }
3456
3457 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) {
3458 load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3459 }
3460
3461 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) {
3462 load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3463 }
3464
3465 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) {
3466 const int vlen = Assembler::AVX_256bit;
3467 switch (eltype) {
3468 case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT); break;
3469 case T_BYTE: vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT); break;
3470 case T_SHORT: vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3471 case T_CHAR: vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3472 case T_INT:
3473 // do nothing
3474 break;
3475 default:
3476 ShouldNotReachHere();
3477 }
3478 }
3479
// Computes the Java Arrays.hashCode polynomial hash of the 'cnt1' elements of
// 'ary1', accumulating into 'result' (which holds the incoming initial value,
// e.g. 1). Requires AVX2. For cnt1 >= 32 a 4-way unrolled 256-bit vector loop
// processes 32 elements per iteration using the precomputed powers-of-31
// table from StubRoutines; the remaining tail (< 32 elements) is handled by a
// scalar loop unrolled by two. All of ary1, cnt1 and index are clobbered.
void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result,
                                        Register index, Register tmp2, Register tmp3, XMMRegister vnext,
                                        XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
                                        XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
                                        XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
                                        BasicType eltype) {
  ShortBranchVerifier sbv(this);
  assert(UseAVX >= 2, "AVX2 intrinsics are required");
  assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3);
  assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3);

  Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN,
        SHORT_UNROLLED_LOOP_EXIT,
        UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME,
        UNROLLED_VECTOR_LOOP_BEGIN,
        END;
  switch (eltype) {
  case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
  case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
  case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
  case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
  case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
  default:        BLOCK_COMMENT("arrays_hashcode {");                break;
  }

  // "Rename" the register groups for readability of the code below.
  const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
                    vresult[] = { vresult0, vresult1, vresult2, vresult3 },
                    vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };

  const int elsize = arrays_hashcode_elsize(eltype);

  /*
    if (cnt1 >= 2) {
      if (cnt1 >= 32) {
        UNROLLED VECTOR LOOP
      }
      UNROLLED SCALAR LOOP
    }
    SINGLE SCALAR
   */

  cmpl(cnt1, 32);
  jcc(Assembler::less, SHORT_UNROLLED_BEGIN);

  // cnt1 >= 32 && generate_vectorized_loop
  xorl(index, index);

  // vresult = IntVector.zero(I256);
  for (int idx = 0; idx < 4; idx++) {
    vpxor(vresult[idx], vresult[idx]);
  }
  // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
  Register bound = tmp2;
  Register next = tmp3;
  lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
  movl(next, Address(tmp2, 0));
  movdl(vnext, next);
  vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);

  // index = 0;
  // bound = cnt1 & ~(32 - 1);
  movl(bound, cnt1);
  andl(bound, ~(32 - 1));
  // for (; index < bound; index += 32) {
  bind(UNROLLED_VECTOR_LOOP_BEGIN);
  // result *= next;
  imull(result, next);
  // loop fission to upfront the cost of fetching from memory, OOO execution
  // can then hopefully do a better job of prefetching
  for (int idx = 0; idx < 4; idx++) {
    arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
  }
  // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
  for (int idx = 0; idx < 4; idx++) {
    vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
    arrays_hashcode_elvcast(vtmp[idx], eltype);
    vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
  }
  // index += 32;
  addl(index, 32);
  // index < bound;
  cmpl(index, bound);
  jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
  // }

  // Advance ary1 past the vector-processed prefix; the scalar tail below
  // indexes from the new base. cnt1 becomes the number of remaining elements.
  lea(ary1, Address(ary1, bound, Address::times(elsize)));
  subl(cnt1, bound);
  // release bound

  // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
  for (int idx = 0; idx < 4; idx++) {
    lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint))));
    arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT);
    vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
  }
  // result += vresult.reduceLanes(ADD);
  for (int idx = 0; idx < 4; idx++) {
    reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
  }

  // } else if (cnt1 < 32) {

  bind(SHORT_UNROLLED_BEGIN);
  // int i = 1;
  movl(index, 1);
  cmpl(index, cnt1);
  jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);

  // Two-way unrolled scalar loop:
  // result = result*31*31 + e[i-1]*31 + e[i], with e[i-1]*31 computed as
  // (e << 5) - e and 961 == 31*31.
  // for (; i < cnt1 ; i += 2) {
  bind(SHORT_UNROLLED_LOOP_BEGIN);
  movl(tmp3, 961);
  imull(result, tmp3);
  arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype);
  movl(tmp3, tmp2);
  shll(tmp3, 5);
  subl(tmp3, tmp2);
  addl(result, tmp3);
  arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype);
  addl(result, tmp3);
  addl(index, 2);
  cmpl(index, cnt1);
  jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN);

  // }
  // if (i >= cnt1) {
  bind(SHORT_UNROLLED_LOOP_EXIT);
  // Flags are from the preceding cmpl(index, cnt1): index > cnt1 means the
  // element count was even and fully consumed; otherwise one element remains.
  jccb(Assembler::greater, END);
  movl(tmp2, result);
  shll(result, 5);
  subl(result, tmp2);
  arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype);
  addl(result, tmp3);
  // }
  bind(END);

  BLOCK_COMMENT("} // arrays_hashcode");

} // arrays_hashcode
3619
3620 // helper function for string_compare
3621 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3622 Address::ScaleFactor scale, Address::ScaleFactor scale1,
3623 Address::ScaleFactor scale2, Register index, int ae) {
3624 if (ae == StrIntrinsicNode::LL) {
3625 load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3626 load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3627 } else if (ae == StrIntrinsicNode::UU) {
3628 load_unsigned_short(elem1, Address(str1, index, scale, 0));
3629 load_unsigned_short(elem2, Address(str2, index, scale, 0));
3630 } else {
3631 load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3632 load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3633 }
3634 }
3635
// Compare strings, used for char[] and byte[].
// Emits code that lexicographically compares cnt1 elements of str1 against
// cnt2 elements of str2 and leaves the (int) comparison result in 'result':
// negative/zero/positive as in String.compareTo. 'ae' selects the encoding
// pair (LL, UU, LU or UL); for UL the operands arrive pre-swapped and the
// result is negated at the end. The length difference is kept on the stack
// (push/pop) across the body. With SSE4.2, pcmpestri imposes fixed register
// roles (result == rax, cnt2 == rdx, cnt1 == rcx) — see the asserts below.
// All of cnt1, cnt2 and the string pointers are clobbered.
void C2_MacroAssembler::string_compare(Register str1, Register str2,
                                       Register cnt1, Register cnt2, Register result,
                                       XMMRegister vec1, int ae, KRegister mask) {
  ShortBranchVerifier sbv(this);
  Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
  Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
  int stride, stride2, adr_stride, adr_stride1, adr_stride2;
  int stride2x2 = 0x40;
  Address::ScaleFactor scale = Address::no_scale;
  Address::ScaleFactor scale1 = Address::no_scale;
  Address::ScaleFactor scale2 = Address::no_scale;

  if (ae != StrIntrinsicNode::LL) {
    stride2x2 = 0x20;
  }

  if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
    shrl(cnt2, 1);
  }
  // Compute the minimum of the string lengths and the
  // difference of the string lengths (stack).
  // Do the conditional move stuff
  movl(result, cnt1);
  subl(cnt1, cnt2);
  push(cnt1);
  cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2)

  // Is the minimum length zero?
  testl(cnt2, cnt2);
  jcc(Assembler::zero, LENGTH_DIFF_LABEL);
  if (ae == StrIntrinsicNode::LL) {
    // Load first bytes
    load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
    load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
  } else if (ae == StrIntrinsicNode::UU) {
    // Load first characters
    load_unsigned_short(result, Address(str1, 0));
    load_unsigned_short(cnt1, Address(str2, 0));
  } else {
    load_unsigned_byte(result, Address(str1, 0));
    load_unsigned_short(cnt1, Address(str2, 0));
  }
  subl(result, cnt1);
  jcc(Assembler::notZero,  POP_LABEL);

  if (ae == StrIntrinsicNode::UU) {
    // Divide length by 2 to get number of chars
    shrl(cnt2, 1);
  }
  cmpl(cnt2, 1);
  jcc(Assembler::equal, LENGTH_DIFF_LABEL);

  // Check if the strings start at the same location and setup scale and stride
  if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
    cmpptr(str1, str2);
    jcc(Assembler::equal, LENGTH_DIFF_LABEL);
    if (ae == StrIntrinsicNode::LL) {
      scale = Address::times_1;
      stride = 16;
    } else {
      scale = Address::times_2;
      stride = 8;
    }
  } else {
    scale1 = Address::times_1;
    scale2 = Address::times_2;
    // scale not used
    stride = 8;
  }

  if (UseAVX >= 2 && UseSSE42Intrinsics) {
    Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
    Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
    Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
    Label COMPARE_TAIL_LONG;
    Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3

    // pcmpestri mode 0x19: unsigned-short elements, equal-each aggregation,
    // negated polarity (search for mismatch). For LL clear bit 0 to switch
    // the element size to unsigned bytes.
    int pcmpmask = 0x19;
    if (ae == StrIntrinsicNode::LL) {
      pcmpmask &= ~0x01;
    }

    // Setup to compare 16-chars (32-bytes) vectors,
    // start from first character again because it has aligned address.
    if (ae == StrIntrinsicNode::LL) {
      stride2 = 32;
    } else {
      stride2 = 16;
    }
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      adr_stride = stride << scale;
    } else {
      adr_stride1 = 8;  //stride << scale1;
      adr_stride2 = 16; //stride << scale2;
    }

    assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
    // rax and rdx are used by pcmpestri as elements counters
    movl(result, cnt2);
    andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
    jcc(Assembler::zero, COMPARE_TAIL_LONG);

    // fast path : compare first 2 8-char vectors.
    bind(COMPARE_16_CHARS);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, 0));
    } else {
      pmovzxbw(vec1, Address(str1, 0));
    }
    pcmpestri(vec1, Address(str2, 0), pcmpmask);
    jccb(Assembler::below, COMPARE_INDEX_CHAR);

    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, adr_stride));
      pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
    } else {
      pmovzxbw(vec1, Address(str1, adr_stride1));
      pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
    }
    jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
    addl(cnt1, stride);

    // Compare the characters at index in cnt1
    bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
    load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
    subl(result, cnt2);
    jmp(POP_LABEL);

    // Setup the registers to start vector comparison loop
    bind(COMPARE_WIDE_VECTORS);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    subl(result, stride2);
    subl(cnt2, stride2);
    jcc(Assembler::zero, COMPARE_WIDE_TAIL);
    negptr(result);

    //  In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
    bind(COMPARE_WIDE_VECTORS_LOOP);

    if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
      cmpl(cnt2, stride2x2);
      jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
      testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40

      bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
      if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
        evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
        evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
      } else {
        vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
        evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
      }
      kortestql(mask, mask);
      jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
      addptr(result, stride2x2);  // update since we already compared at this addr
      subl(cnt2, stride2x2);      // and sub the size too
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);

      vpxor(vec1, vec1);
      jmpb(COMPARE_WIDE_TAIL);
    }//if (VM_Version::supports_avx512vlbw())

    bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      vmovdqu(vec1, Address(str1, result, scale));
      vpxor(vec1, Address(str2, result, scale));
    } else {
      vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
      vpxor(vec1, Address(str2, result, scale2));
    }
    vptest(vec1, vec1);
    jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
    addptr(result, stride2);
    subl(cnt2, stride2);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);

    // compare wide vectors tail
    bind(COMPARE_WIDE_TAIL);
    testptr(result, result);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);

    movl(result, stride2);
    movl(cnt2, result);
    negptr(result);
    jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);

    // Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors.
    bind(VECTOR_NOT_EQUAL);
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    jmp(COMPARE_16_CHARS);

    // Compare tail chars, length between 1 to 15 chars
    bind(COMPARE_TAIL_LONG);
    movl(cnt2, result);
    cmpl(cnt2, stride);
    jcc(Assembler::less, COMPARE_SMALL_STR);

    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, 0));
    } else {
      pmovzxbw(vec1, Address(str1, 0));
    }
    pcmpestri(vec1, Address(str2, 0), pcmpmask);
    jcc(Assembler::below, COMPARE_INDEX_CHAR);
    subptr(cnt2, stride);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    negptr(cnt2);
    jmpb(WHILE_HEAD_LABEL);

    bind(COMPARE_SMALL_STR);
  } else if (UseSSE42Intrinsics) {
    Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
    int pcmpmask = 0x19;
    // Setup to compare 8-char (16-byte) vectors,
    // start from first character again because it has aligned address.
    movl(result, cnt2);
    andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
    if (ae == StrIntrinsicNode::LL) {
      pcmpmask &= ~0x01;
    }
    jcc(Assembler::zero, COMPARE_TAIL);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    negptr(result);

    // pcmpestri
    //   inputs:
    //     vec1- substring
    //     rax - negative string length (elements count)
    //     mem - scanned string
    //     rdx - string length (elements count)
    //     pcmpmask - cmp mode: 11000 (string compare with negated result)
    //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
    //   outputs:
    //     rcx - first mismatched element index
    assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");

    bind(COMPARE_WIDE_VECTORS);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, result, scale));
      pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
    } else {
      pmovzxbw(vec1, Address(str1, result, scale1));
      pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
    }
    // After pcmpestri cnt1(rcx) contains mismatched element index

    jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
    addptr(result, stride);
    subptr(cnt2, stride);
    jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

    // compare wide vectors tail
    testptr(result, result);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);

    movl(cnt2, stride);
    movl(result, stride);
    negptr(result);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, result, scale));
      pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
    } else {
      pmovzxbw(vec1, Address(str1, result, scale1));
      pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
    }
    jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);

    // Mismatched characters in the vectors
    bind(VECTOR_NOT_EQUAL);
    addptr(cnt1, result);
    load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
    subl(result, cnt2);
    jmpb(POP_LABEL);

    bind(COMPARE_TAIL); // limit is zero
    movl(cnt2, result);
    // Fallthru to tail compare
  }
  // Shift str2 and str1 to the end of the arrays, negate min
  if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
    lea(str1, Address(str1, cnt2, scale));
    lea(str2, Address(str2, cnt2, scale));
  } else {
    lea(str1, Address(str1, cnt2, scale1));
    lea(str2, Address(str2, cnt2, scale2));
  }
  decrementl(cnt2);  // first character was compared already
  negptr(cnt2);

  // Compare the rest of the elements
  bind(WHILE_HEAD_LABEL);
  load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
  subl(result, cnt1);
  jccb(Assembler::notZero, POP_LABEL);
  increment(cnt2);
  jccb(Assembler::notZero, WHILE_HEAD_LABEL);

  // Strings are equal up to min length.  Return the length difference.
  bind(LENGTH_DIFF_LABEL);
  pop(result);
  if (ae == StrIntrinsicNode::UU) {
    // Divide diff by 2 to get number of chars
    sarl(result, 1);
  }
  jmpb(DONE_LABEL);

  if (VM_Version::supports_avx512vlbw()) {

    bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);

    // The mask has a 0 bit at the first mismatching byte position; invert it
    // and scan for the lowest set bit to get the byte offset of the mismatch.
    kmovql(cnt1, mask);
    notq(cnt1);
    bsfq(cnt2, cnt1);
    if (ae != StrIntrinsicNode::LL) {
      // Divide diff by 2 to get number of chars
      sarl(cnt2, 1);
    }
    addq(result, cnt2);
    if (ae == StrIntrinsicNode::LL) {
      load_unsigned_byte(cnt1, Address(str2, result));
      load_unsigned_byte(result, Address(str1, result));
    } else if (ae == StrIntrinsicNode::UU) {
      load_unsigned_short(cnt1, Address(str2, result, scale));
      load_unsigned_short(result, Address(str1, result, scale));
    } else {
      load_unsigned_short(cnt1, Address(str2, result, scale2));
      load_unsigned_byte(result, Address(str1, result, scale1));
    }
    subl(result, cnt1);
    jmpb(POP_LABEL);
  }//if (VM_Version::supports_avx512vlbw())

  // Discard the stored length difference
  bind(POP_LABEL);
  pop(cnt1);

  // That's it
  bind(DONE_LABEL);
  if(ae == StrIntrinsicNode::UL) {
    negl(result);
  }

}
4010
// Search for Non-ASCII character (Negative byte value) in a byte array,
// return the index of the first such character, otherwise the length
// of the array segment searched.
// ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
//   @IntrinsicCandidate
//   public static int countPositives(byte[] ba, int off, int len) {
//     for (int i = off; i < off + len; i++) {
//       if (ba[i] < 0) {
//         return i - off;
//       }
//     }
//     return len;
//   }
// Bytes are scanned in 64/32/16-byte vector blocks (AVX-512/AVX2/SSE4.2),
// then a 4-byte scalar loop, then a 2-byte and a 1-byte tail; 'result' starts
// as a copy of 'len' and is adjusted down to the mismatch index when a
// negative byte is found. All of ary1, len and tmp1 are clobbered.
void C2_MacroAssembler::count_positives(Register ary1, Register len,
                                        Register result, Register tmp1,
                                        XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
  // rsi: byte array
  // rcx: len
  // rax: result
  ShortBranchVerifier sbv(this);
  assert_different_registers(ary1, len, result, tmp1);
  assert_different_registers(vec1, vec2);
  Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;

  movl(result, len); // copy
  // len == 0
  testl(len, len);
  jcc(Assembler::zero, DONE);

  if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
    VM_Version::supports_avx512vlbw() &&
    VM_Version::supports_bmi2()) {

    Label test_64_loop, test_tail, BREAK_LOOP;
    movl(tmp1, len);
    vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);

    andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F
    andl(len,  0xffffffc0); // vector count (in chars)
    jccb(Assembler::zero, test_tail);

    lea(ary1, Address(ary1, len, Address::times_1));
    negptr(len);

    bind(test_64_loop);
    // Check whether our 64 elements of size byte contain negatives
    evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
    kortestql(mask1, mask1);
    jcc(Assembler::notZero, BREAK_LOOP);

    addptr(len, 64);
    jccb(Assembler::notZero, test_64_loop);

    bind(test_tail);
    // bail out when there is nothing to be done
    testl(tmp1, -1);
    jcc(Assembler::zero, DONE);


    // check the tail for absence of negatives
    // ~(~0 << len) applied up to two times (for 32-bit scenario)
    {
      Register tmp3_aliased = len;
      // Build a mask with the low 'tmp1' bits set; it selects only the tail
      // bytes in the masked 64-byte compare below.
      mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
      shlxq(tmp3_aliased, tmp3_aliased, tmp1);
      notq(tmp3_aliased);
      kmovql(mask2, tmp3_aliased);
    }

    evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
    ktestq(mask1, mask2);
    jcc(Assembler::zero, DONE);

    // do a full check for negative registers in the tail
    movl(len, tmp1); // tmp1 holds low 6-bit from original len;
                     // ary1 already pointing to the right place
    jmpb(TAIL_START);

    bind(BREAK_LOOP);
    // At least one byte in the last 64 byte block was negative.
    // Set up to look at the last 64 bytes as if they were a tail
    lea(ary1, Address(ary1, len, Address::times_1));
    addptr(result, len);
    // Ignore the very last byte: if all others are positive,
    // it must be negative, so we can skip right to the 2+1 byte
    // end comparison at this point
    orl(result, 63);
    movl(len, 63);
    // Fallthru to tail compare
  } else {

    if (UseAVX >= 2) {
      // With AVX2, use 32-byte vector compare
      Label COMPARE_WIDE_VECTORS, BREAK_LOOP;

      // Compare 32-byte vectors
      testl(len, 0xffffffe0);   // vector count (in bytes)
      jccb(Assembler::zero, TAIL_START);

      andl(len, 0xffffffe0);
      lea(ary1, Address(ary1, len, Address::times_1));
      negptr(len);

      movl(tmp1, 0x80808080);   // create mask to test for Unicode chars in vector
      movdl(vec2, tmp1);
      vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);

      bind(COMPARE_WIDE_VECTORS);
      vmovdqu(vec1, Address(ary1, len, Address::times_1));
      vptest(vec1, vec2);
      jccb(Assembler::notZero, BREAK_LOOP);
      addptr(len, 32);
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

      testl(result, 0x0000001f);   // any bytes remaining?
      jcc(Assembler::zero, DONE);

      // Quick test using the already prepared vector mask
      movl(len, result);
      andl(len, 0x0000001f);
      vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
      vptest(vec1, vec2);
      jcc(Assembler::zero, DONE);
      // There are zeros, jump to the tail to determine exactly where
      jmpb(TAIL_START);

      bind(BREAK_LOOP);
      // At least one byte in the last 32-byte vector is negative.
      // Set up to look at the last 32 bytes as if they were a tail
      lea(ary1, Address(ary1, len, Address::times_1));
      addptr(result, len);
      // Ignore the very last byte: if all others are positive,
      // it must be negative, so we can skip right to the 2+1 byte
      // end comparison at this point
      orl(result, 31);
      movl(len, 31);
      // Fallthru to tail compare
    } else if (UseSSE42Intrinsics) {
      // With SSE4.2, use double quad vector compare
      Label COMPARE_WIDE_VECTORS, BREAK_LOOP;

      // Compare 16-byte vectors
      testl(len, 0xfffffff0);   // vector count (in bytes)
      jcc(Assembler::zero, TAIL_START);

      andl(len, 0xfffffff0);
      lea(ary1, Address(ary1, len, Address::times_1));
      negptr(len);

      movl(tmp1, 0x80808080);
      movdl(vec2, tmp1);
      pshufd(vec2, vec2, 0);

      bind(COMPARE_WIDE_VECTORS);
      movdqu(vec1, Address(ary1, len, Address::times_1));
      ptest(vec1, vec2);
      jccb(Assembler::notZero, BREAK_LOOP);
      addptr(len, 16);
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

      testl(result, 0x0000000f); // len is zero, any bytes remaining?
      jcc(Assembler::zero, DONE);

      // Quick test using the already prepared vector mask
      movl(len, result);
      andl(len, 0x0000000f);   // tail count (in bytes)
      movdqu(vec1, Address(ary1, len, Address::times_1, -16));
      ptest(vec1, vec2);
      jcc(Assembler::zero, DONE);
      jmpb(TAIL_START);

      bind(BREAK_LOOP);
      // At least one byte in the last 16-byte vector is negative.
      // Set up and look at the last 16 bytes as if they were a tail
      lea(ary1, Address(ary1, len, Address::times_1));
      addptr(result, len);
      // Ignore the very last byte: if all others are positive,
      // it must be negative, so we can skip right to the 2+1 byte
      // end comparison at this point
      orl(result, 15);
      movl(len, 15);
      // Fallthru to tail compare
    }
  }

  bind(TAIL_START);
  // Compare 4-byte vectors
  andl(len, 0xfffffffc); // vector count (in bytes)
  jccb(Assembler::zero, COMPARE_CHAR);

  lea(ary1, Address(ary1, len, Address::times_1));
  negptr(len);

  bind(COMPARE_VECTORS);
  movl(tmp1, Address(ary1, len, Address::times_1));
  andl(tmp1, 0x80808080);
  jccb(Assembler::notZero, TAIL_ADJUST);
  addptr(len, 4);
  jccb(Assembler::notZero, COMPARE_VECTORS);

  // Compare trailing char (final 2-3 bytes), if any
  bind(COMPARE_CHAR);

  testl(result, 0x2);   // tail  char
  jccb(Assembler::zero, COMPARE_BYTE);
  load_unsigned_short(tmp1, Address(ary1, 0));
  andl(tmp1, 0x00008080);
  jccb(Assembler::notZero, CHAR_ADJUST);
  lea(ary1, Address(ary1, 2));

  bind(COMPARE_BYTE);
  testl(result, 0x1);   // tail  byte
  jccb(Assembler::zero, DONE);
  load_unsigned_byte(tmp1, Address(ary1, 0));
  testl(tmp1, 0x00000080);
  jccb(Assembler::zero, DONE);
  subptr(result, 1);
  jmpb(DONE);

  bind(TAIL_ADJUST);
  // there are negative bits in the last 4 byte block.
  // Adjust result and check the next three bytes
  addptr(result, len);
  orl(result, 3);
  lea(ary1, Address(ary1, len, Address::times_1));
  jmpb(COMPARE_CHAR);

  bind(CHAR_ADJUST);
  // We are looking at a char + optional byte tail, and found that one
  // of the bytes in the char is negative. Adjust the result, check the
  // first byte and readjust if needed.
  andl(result, 0xfffffffc);
  testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
  jccb(Assembler::notZero, DONE);
  addptr(result, 1);

  // That's it
  bind(DONE);
  if (UseAVX >= 2) {
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    vpxor(vec2, vec2);
  }
}
4255
4256 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
4257 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
4258 Register limit, Register result, Register chr,
4259 XMMRegister vec1, XMMRegister vec2, bool is_char,
4260 KRegister mask, bool expand_ary2) {
4261 // for expand_ary2, limit is the (smaller) size of the second array.
4262 ShortBranchVerifier sbv(this);
4263 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
4264
4265 assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)),
4266 "Expansion only implemented for AVX2");
4267
4268 int length_offset = arrayOopDesc::length_offset_in_bytes();
4269 int base_offset = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
4270
4271 Address::ScaleFactor scaleFactor = expand_ary2 ? Address::times_2 : Address::times_1;
4272 int scaleIncr = expand_ary2 ? 8 : 16;
4273
4274 if (is_array_equ) {
4275 // Check the input args
4276 cmpoop(ary1, ary2);
4277 jcc(Assembler::equal, TRUE_LABEL);
4278
4279 // Need additional checks for arrays_equals.
4280 testptr(ary1, ary1);
4281 jcc(Assembler::zero, FALSE_LABEL);
4282 testptr(ary2, ary2);
4283 jcc(Assembler::zero, FALSE_LABEL);
4284
4285 // Check the lengths
4286 movl(limit, Address(ary1, length_offset));
4287 cmpl(limit, Address(ary2, length_offset));
4288 jcc(Assembler::notEqual, FALSE_LABEL);
4289 }
4290
4291 // count == 0
4292 testl(limit, limit);
4293 jcc(Assembler::zero, TRUE_LABEL);
4294
4295 if (is_array_equ) {
4296 // Load array address
4297 lea(ary1, Address(ary1, base_offset));
4298 lea(ary2, Address(ary2, base_offset));
4299 }
4300
4301 if (is_array_equ && is_char) {
4302 // arrays_equals when used for char[].
4303 shll(limit, 1); // byte count != 0
4304 }
4305 movl(result, limit); // copy
4306
4307 if (UseAVX >= 2) {
4308 // With AVX2, use 32-byte vector compare
4309 Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16;
4310
4311 // Compare 32-byte vectors
4312 if (expand_ary2) {
4313 andl(result, 0x0000000f); // tail count (in bytes)
4314 andl(limit, 0xfffffff0); // vector count (in bytes)
4315 jcc(Assembler::zero, COMPARE_TAIL);
4316 } else {
4317 andl(result, 0x0000001f); // tail count (in bytes)
4318 andl(limit, 0xffffffe0); // vector count (in bytes)
4319 jcc(Assembler::zero, COMPARE_TAIL_16);
4320 }
4321
4322 lea(ary1, Address(ary1, limit, scaleFactor));
4323 lea(ary2, Address(ary2, limit, Address::times_1));
4324 negptr(limit);
4325
4326 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
4327 Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
4328
4329 cmpl(limit, -64);
4330 jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
4331
4332 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
4333
4334 evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
4335 evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
4336 kortestql(mask, mask);
4337 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare
4338 addptr(limit, 64); // update since we already compared at this addr
4339 cmpl(limit, -64);
4340 jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
4341
4342 // At this point we may still need to compare -limit+result bytes.
4343 // We could execute the next two instruction and just continue via non-wide path:
4344 // cmpl(limit, 0);
4345 // jcc(Assembler::equal, COMPARE_TAIL); // true
4346 // But since we stopped at the points ary{1,2}+limit which are
4347 // not farther than 64 bytes from the ends of arrays ary{1,2}+result
4348 // (|limit| <= 32 and result < 32),
4349 // we may just compare the last 64 bytes.
4350 //
4351 addptr(result, -64); // it is safe, bc we just came from this area
4352 evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
4353 evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
4354 kortestql(mask, mask);
4355 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare
4356
4357 jmp(TRUE_LABEL);
4358
4359 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
4360
4361 }//if (VM_Version::supports_avx512vlbw())
4362
4363 bind(COMPARE_WIDE_VECTORS);
4364 vmovdqu(vec1, Address(ary1, limit, scaleFactor));
4365 if (expand_ary2) {
4366 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit);
4367 } else {
4368 vmovdqu(vec2, Address(ary2, limit, Address::times_1));
4369 }
4370 vpxor(vec1, vec2);
4371
4372 vptest(vec1, vec1);
4373 jcc(Assembler::notZero, FALSE_LABEL);
4374 addptr(limit, scaleIncr * 2);
4375 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4376
4377 testl(result, result);
4378 jcc(Assembler::zero, TRUE_LABEL);
4379
4380 vmovdqu(vec1, Address(ary1, result, scaleFactor, -32));
4381 if (expand_ary2) {
4382 vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit);
4383 } else {
4384 vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
4385 }
4386 vpxor(vec1, vec2);
4387
4388 vptest(vec1, vec1);
4389 jcc(Assembler::notZero, FALSE_LABEL);
4390 jmp(TRUE_LABEL);
4391
4392 bind(COMPARE_TAIL_16); // limit is zero
4393 movl(limit, result);
4394
4395 // Compare 16-byte chunks
4396 andl(result, 0x0000000f); // tail count (in bytes)
4397 andl(limit, 0xfffffff0); // vector count (in bytes)
4398 jcc(Assembler::zero, COMPARE_TAIL);
4399
4400 lea(ary1, Address(ary1, limit, scaleFactor));
4401 lea(ary2, Address(ary2, limit, Address::times_1));
4402 negptr(limit);
4403
4404 bind(COMPARE_WIDE_VECTORS_16);
4405 movdqu(vec1, Address(ary1, limit, scaleFactor));
4406 if (expand_ary2) {
4407 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit);
4408 } else {
4409 movdqu(vec2, Address(ary2, limit, Address::times_1));
4410 }
4411 pxor(vec1, vec2);
4412
4413 ptest(vec1, vec1);
4414 jcc(Assembler::notZero, FALSE_LABEL);
4415 addptr(limit, scaleIncr);
4416 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16);
4417
4418 bind(COMPARE_TAIL); // limit is zero
4419 movl(limit, result);
4420 // Fallthru to tail compare
4421 } else if (UseSSE42Intrinsics) {
4422 // With SSE4.2, use double quad vector compare
4423 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4424
4425 // Compare 16-byte vectors
4426 andl(result, 0x0000000f); // tail count (in bytes)
4427 andl(limit, 0xfffffff0); // vector count (in bytes)
4428 jcc(Assembler::zero, COMPARE_TAIL);
4429
4430 lea(ary1, Address(ary1, limit, Address::times_1));
4431 lea(ary2, Address(ary2, limit, Address::times_1));
4432 negptr(limit);
4433
4434 bind(COMPARE_WIDE_VECTORS);
4435 movdqu(vec1, Address(ary1, limit, Address::times_1));
4436 movdqu(vec2, Address(ary2, limit, Address::times_1));
4437 pxor(vec1, vec2);
4438
4439 ptest(vec1, vec1);
4440 jcc(Assembler::notZero, FALSE_LABEL);
4441 addptr(limit, 16);
4442 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4443
4444 testl(result, result);
4445 jcc(Assembler::zero, TRUE_LABEL);
4446
4447 movdqu(vec1, Address(ary1, result, Address::times_1, -16));
4448 movdqu(vec2, Address(ary2, result, Address::times_1, -16));
4449 pxor(vec1, vec2);
4450
4451 ptest(vec1, vec1);
4452 jccb(Assembler::notZero, FALSE_LABEL);
4453 jmpb(TRUE_LABEL);
4454
4455 bind(COMPARE_TAIL); // limit is zero
4456 movl(limit, result);
4457 // Fallthru to tail compare
4458 }
4459
4460 // Compare 4-byte vectors
4461 if (expand_ary2) {
4462 testl(result, result);
4463 jccb(Assembler::zero, TRUE_LABEL);
4464 } else {
4465 andl(limit, 0xfffffffc); // vector count (in bytes)
4466 jccb(Assembler::zero, COMPARE_CHAR);
4467 }
4468
4469 lea(ary1, Address(ary1, limit, scaleFactor));
4470 lea(ary2, Address(ary2, limit, Address::times_1));
4471 negptr(limit);
4472
4473 bind(COMPARE_VECTORS);
4474 if (expand_ary2) {
4475 // There are no "vector" operations for bytes to shorts
4476 movzbl(chr, Address(ary2, limit, Address::times_1));
4477 cmpw(Address(ary1, limit, Address::times_2), chr);
4478 jccb(Assembler::notEqual, FALSE_LABEL);
4479 addptr(limit, 1);
4480 jcc(Assembler::notZero, COMPARE_VECTORS);
4481 jmp(TRUE_LABEL);
4482 } else {
4483 movl(chr, Address(ary1, limit, Address::times_1));
4484 cmpl(chr, Address(ary2, limit, Address::times_1));
4485 jccb(Assembler::notEqual, FALSE_LABEL);
4486 addptr(limit, 4);
4487 jcc(Assembler::notZero, COMPARE_VECTORS);
4488 }
4489
4490 // Compare trailing char (final 2 bytes), if any
4491 bind(COMPARE_CHAR);
4492 testl(result, 0x2); // tail char
4493 jccb(Assembler::zero, COMPARE_BYTE);
4494 load_unsigned_short(chr, Address(ary1, 0));
4495 load_unsigned_short(limit, Address(ary2, 0));
4496 cmpl(chr, limit);
4497 jccb(Assembler::notEqual, FALSE_LABEL);
4498
4499 if (is_array_equ && is_char) {
4500 bind(COMPARE_BYTE);
4501 } else {
4502 lea(ary1, Address(ary1, 2));
4503 lea(ary2, Address(ary2, 2));
4504
4505 bind(COMPARE_BYTE);
4506 testl(result, 0x1); // tail byte
4507 jccb(Assembler::zero, TRUE_LABEL);
4508 load_unsigned_byte(chr, Address(ary1, 0));
4509 load_unsigned_byte(limit, Address(ary2, 0));
4510 cmpl(chr, limit);
4511 jccb(Assembler::notEqual, FALSE_LABEL);
4512 }
4513 bind(TRUE_LABEL);
4514 movl(result, 1); // return true
4515 jmpb(DONE);
4516
4517 bind(FALSE_LABEL);
4518 xorl(result, result); // return false
4519
4520 // That's it
4521 bind(DONE);
4522 if (UseAVX >= 2) {
4523 // clean upper bits of YMM registers
4524 vpxor(vec1, vec1);
4525 vpxor(vec2, vec2);
4526 }
4527 }
4528
4529 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) {
4530 #define __ masm.
4531 Register dst = stub.data<0>();
4532 XMMRegister src = stub.data<1>();
4533 address target = stub.data<2>();
4534 __ bind(stub.entry());
4535 __ subptr(rsp, 8);
4536 __ movdbl(Address(rsp), src);
4537 __ call(RuntimeAddress(target));
4538 // APX REX2 encoding for pop(dst) increases the stub size by 1 byte.
4539 __ pop(dst);
4540 __ jmp(stub.continuation());
4541 #undef __
4542 }
4543
// Convert a float/double in src to an int/long in dst using the truncating
// cvttss2si/cvttsd2si family. For NaN, infinity and out-of-range inputs these
// instructions produce the "indefinite" value (0x80000000 for int,
// 0x8000000000000000 for long); we compare against that pattern and, on a
// match, branch to an out-of-line C2 code stub (convertF2I_slowpath) that
// calls the appropriate f2i/f2l/d2i/d2l fixup routine.
void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) {
  assert(dst_bt == T_INT || dst_bt == T_LONG, "");
  assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, "");

  address slowpath_target;
  if (dst_bt == T_INT) {
    if (src_bt == T_FLOAT) {
      cvttss2sil(dst, src);
      // 0x80000000 is the 32-bit indefinite result; equal means fixup needed.
      cmpl(dst, 0x80000000);
      slowpath_target = StubRoutines::x86::f2i_fixup();
    } else {
      cvttsd2sil(dst, src);
      cmpl(dst, 0x80000000);
      slowpath_target = StubRoutines::x86::d2i_fixup();
    }
  } else {
    if (src_bt == T_FLOAT) {
      cvttss2siq(dst, src);
      // double_sign_flip holds the 64-bit sign-bit pattern (the long
      // indefinite value) in memory, suitable for a 64-bit compare.
      cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
      slowpath_target = StubRoutines::x86::f2l_fixup();
    } else {
      cvttsd2siq(dst, src);
      cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
      slowpath_target = StubRoutines::x86::d2l_fixup();
    }
  }

  // Using the APX extended general purpose registers increases the instruction encoding size by 1 byte.
  int max_size = 23 + (UseAPX ? 1 : 0);
  auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, max_size, convertF2I_slowpath);
  jcc(Assembler::equal, stub->entry());
  bind(stub->continuation());
}
4577
// Emit an AVX-512 masked vector shift/rotate with an immediate count (imm8).
// 'mask' is the EVEX opmask; 'merge' selects merge-masking (keep dst lanes
// where the mask bit is clear) versus zero-masking per the EVEX convention.
// eType is only consulted by the rotate helpers, which pick the element width.
void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
                                    XMMRegister src1, int imm8, bool merge, int vlen_enc) {
  switch(ideal_opc) {
    case Op_LShiftVS:
      Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_LShiftVI:
      Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_LShiftVL:
      Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_RShiftVS:
      Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_RShiftVI:
      Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_RShiftVL:
      Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_URShiftVS:
      Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_URShiftVI:
      Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_URShiftVL:
      Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_RotateRightV:
      evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_RotateLeftV:
      evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
    default:
      fatal("Unsupported operation %s", NodeClassNames[ideal_opc]);
      break;
  }
}
4608
4609 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4610 XMMRegister src2, bool is_unsigned, bool merge, int vlen_enc) {
4611 if (is_unsigned) {
4612 evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4613 } else {
4614 evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4615 }
4616 }
4617
4618 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4619 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4620 switch (elem_bt) {
4621 case T_BYTE:
4622 if (ideal_opc == Op_SaturatingAddV) {
4623 evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4624 } else {
4625 assert(ideal_opc == Op_SaturatingSubV, "");
4626 evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4627 }
4628 break;
4629 case T_SHORT:
4630 if (ideal_opc == Op_SaturatingAddV) {
4631 evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4632 } else {
4633 assert(ideal_opc == Op_SaturatingSubV, "");
4634 evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4635 }
4636 break;
4637 default:
4638 fatal("Unsupported type %s", type2name(elem_bt));
4639 break;
4640 }
4641 }
4642
4643 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4644 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4645 switch (elem_bt) {
4646 case T_BYTE:
4647 if (ideal_opc == Op_SaturatingAddV) {
4648 evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4649 } else {
4650 assert(ideal_opc == Op_SaturatingSubV, "");
4651 evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4652 }
4653 break;
4654 case T_SHORT:
4655 if (ideal_opc == Op_SaturatingAddV) {
4656 evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4657 } else {
4658 assert(ideal_opc == Op_SaturatingSubV, "");
4659 evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4660 }
4661 break;
4662 default:
4663 fatal("Unsupported type %s", type2name(elem_bt));
4664 break;
4665 }
4666 }
4667
4668 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4669 Address src2, bool is_unsigned, bool merge, int vlen_enc) {
4670 if (is_unsigned) {
4671 evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4672 } else {
4673 evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4674 }
4675 }
4676
4677 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4678 XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4679 switch (elem_bt) {
4680 case T_BYTE:
4681 if (ideal_opc == Op_SaturatingAddV) {
4682 evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4683 } else {
4684 assert(ideal_opc == Op_SaturatingSubV, "");
4685 evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4686 }
4687 break;
4688 case T_SHORT:
4689 if (ideal_opc == Op_SaturatingAddV) {
4690 evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4691 } else {
4692 assert(ideal_opc == Op_SaturatingSubV, "");
4693 evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4694 }
4695 break;
4696 default:
4697 fatal("Unsupported type %s", type2name(elem_bt));
4698 break;
4699 }
4700 }
4701
4702 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4703 XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4704 switch (elem_bt) {
4705 case T_BYTE:
4706 if (ideal_opc == Op_SaturatingAddV) {
4707 evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4708 } else {
4709 assert(ideal_opc == Op_SaturatingSubV, "");
4710 evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4711 }
4712 break;
4713 case T_SHORT:
4714 if (ideal_opc == Op_SaturatingAddV) {
4715 evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4716 } else {
4717 assert(ideal_opc == Op_SaturatingSubV, "");
4718 evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4719 }
4720 break;
4721 default:
4722 fatal("Unsupported type %s", type2name(elem_bt));
4723 break;
4724 }
4725 }
4726
// Emit an AVX-512 masked vector operation with a register second operand,
// mapping the C2 ideal opcode to the corresponding EVEX-encoded instruction.
// 'mask' is the EVEX opmask; 'merge' selects merge- vs zero-masking per the
// EVEX convention. 'is_varshift' distinguishes per-lane variable shifts from
// uniform-count shifts for the shift opcodes. eType is only consulted by the
// helpers that must pick an element width themselves (rotate, min/max,
// rearrange and the bitwise ops).
void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
                                    XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
                                    bool is_varshift) {
  switch (ideal_opc) {
    case Op_AddVB:
      evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVS:
      evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVI:
      evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVL:
      evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVF:
      evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVD:
      evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVB:
      evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVS:
      evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVI:
      evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVL:
      evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVF:
      evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVD:
      evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVS:
      evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVI:
      evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVL:
      evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVF:
      evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVD:
      evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_DivVF:
      evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_DivVD:
      evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SqrtVF:
      evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SqrtVD:
      evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
    // Unary ops take their input in src2; src1 is unused.
    case Op_AbsVB:
      evpabsb(dst, mask, src2, merge, vlen_enc); break;
    case Op_AbsVS:
      evpabsw(dst, mask, src2, merge, vlen_enc); break;
    case Op_AbsVI:
      evpabsd(dst, mask, src2, merge, vlen_enc); break;
    case Op_AbsVL:
      evpabsq(dst, mask, src2, merge, vlen_enc); break;
    case Op_FmaVF:
      evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_FmaVD:
      evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
    // Rearrange swaps operand order: src2 is the data, src1 the shuffle.
    case Op_VectorRearrange:
      evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
    case Op_LShiftVS:
      evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_LShiftVI:
      evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_LShiftVL:
      evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_RShiftVS:
      evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_RShiftVI:
      evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_RShiftVL:
      evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_URShiftVS:
      evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_URShiftVI:
      evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_URShiftVL:
      evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_RotateLeftV:
      evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_RotateRightV:
      evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MaxV:
      evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MinV:
      evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_UMinV:
      evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_UMaxV:
      evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_XorV:
      evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_OrV:
      evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AndV:
      evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    default:
      fatal("Unsupported operation %s", NodeClassNames[ideal_opc]);
      break;
  }
}
4828
// Emit an AVX-512 masked vector operation with a memory second operand,
// mapping the C2 ideal opcode to the corresponding EVEX-encoded instruction.
// This mirrors the register-register overload above but supports only the
// opcodes for which a memory form is used by the matcher. 'mask' is the EVEX
// opmask; 'merge' selects merge- vs zero-masking. eType is only consulted by
// the min/max and bitwise helpers, which pick the element width themselves.
void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
                                    XMMRegister src1, Address src2, bool merge, int vlen_enc) {
  switch (ideal_opc) {
    case Op_AddVB:
      evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVS:
      evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVI:
      evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVL:
      evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVF:
      evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVD:
      evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVB:
      evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVS:
      evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVI:
      evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVL:
      evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVF:
      evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVD:
      evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVS:
      evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVI:
      evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVL:
      evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVF:
      evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVD:
      evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_DivVF:
      evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_DivVD:
      evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_FmaVF:
      evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_FmaVD:
      evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MaxV:
      evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MinV:
      evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_UMaxV:
      evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_UMinV:
      evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_XorV:
      evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_OrV:
      evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AndV:
      evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    default:
      fatal("Unsupported operation %s", NodeClassNames[ideal_opc]);
      break;
  }
}
4893
4894 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
4895 KRegister src1, KRegister src2) {
4896 BasicType etype = T_ILLEGAL;
4897 switch(mask_len) {
4898 case 2:
4899 case 4:
4900 case 8: etype = T_BYTE; break;
4901 case 16: etype = T_SHORT; break;
4902 case 32: etype = T_INT; break;
4903 case 64: etype = T_LONG; break;
4904 default: fatal("Unsupported type"); break;
4905 }
4906 assert(etype != T_ILLEGAL, "");
4907 switch(ideal_opc) {
4908 case Op_AndVMask:
4909 kand(etype, dst, src1, src2); break;
4910 case Op_OrVMask:
4911 kor(etype, dst, src1, src2); break;
4912 case Op_XorVMask:
4913 kxor(etype, dst, src1, src2); break;
4914 default:
4915 fatal("Unsupported masked operation"); break;
4916 }
4917 }
4918
4919 /*
4920 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation.
4921 * If src is NaN, the result is 0.
4922 * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE,
4923 * the result is equal to the value of Integer.MIN_VALUE.
4924 * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE,
4925 * the result is equal to the value of Integer.MAX_VALUE.
4926 */
// AVX (non-EVEX) fixup for a vector float->int cast: dst already holds the raw
// vcvttps2dq result. Any lane equal to 0x80000000 (float_sign_flip) came from
// a special source value (NaN/Inf/-Inf/overflow) and is rewritten to the Java
// result: 0 for NaN, Integer.MAX_VALUE for positive specials; negative
// specials keep 0x80000000 == Integer.MIN_VALUE.
void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                   XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
                                                                   Register rscratch, AddressLiteral float_sign_flip,
                                                                   int vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
  Label done;
  // xtmp2 = mask of lanes holding the special marker value 0x80000000.
  vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
  vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
  vptest(xtmp2, xtmp2, vec_enc);
  // Fast path: no special lanes, nothing to fix up.
  jcc(Assembler::equal, done);

  // xtmp1 = ~0x80000000 = 0x7fffffff (Integer.MAX_VALUE) per lane.
  vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
  vpxor(xtmp1, xtmp1, xtmp4, vec_enc);

  // Zero the destination lanes whose source is NaN (unordered compare).
  vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
  vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
  vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);

  // Recompute the mask for remaining special value.
  vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
  // Extract SRC values corresponding to TRUE mask lanes.
  vpand(xtmp4, xtmp2, src, vec_enc);
  // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special
  // values are set.
  vpxor(xtmp3, xtmp2, xtmp4, vec_enc);

  // Blend in Integer.MAX_VALUE for positive special lanes.
  vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
  bind(done);
}
4956
// EVEX fixup for a vector float->int cast using opmask registers: dst holds
// the raw vcvttps2dq result. Lanes equal to 0x80000000 (float_sign_flip) came
// from a special source value and are rewritten to the Java result: 0 for NaN,
// Integer.MAX_VALUE for positive specials; negative specials keep MIN_VALUE.
void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                    XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                    Register rscratch, AddressLiteral float_sign_flip,
                                                                    int vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
  Label done;
  // ktmp1 = mask of lanes holding the special marker value 0x80000000.
  evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
  Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  // Fast path: no special lanes, nothing to fix up.
  jccb(Assembler::equal, done);

  // ktmp2 = NaN lanes (unordered self-compare); write 0 into those dst lanes.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);

  // ktmp1 = remaining special lanes; of those, keep lanes with src >= 0.
  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  // xtmp2 = ~xtmp1 (ternlog 0x11 computes ~B&~C with B==C): 0x80000000 ->
  // 0x7fffffff == Integer.MAX_VALUE, stored into the masked lanes.
  vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}
4978
// EVEX fixup for a vector float->long cast: dst holds the raw evcvttps2qq
// result. Lanes equal to 0x8000000000000000 (double_sign_flip) came from a
// special float source value and are rewritten to the Java result: 0 for NaN,
// Long.MAX_VALUE for positive specials; negative specials keep MIN_VALUE.
void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                     Register rscratch, AddressLiteral double_sign_flip,
                                                                     int vec_enc) {
  assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");

  Label done;
  // ktmp1 = mask of lanes holding the special marker value.
  evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
  Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  // Fast path: no special lanes, nothing to fix up.
  jccb(Assembler::equal, done);

  // ktmp2 = NaN lanes in the float source; zero those dst lanes.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);

  // ktmp1 = remaining special lanes; of those, keep lanes with src >= 0.
  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  // xtmp2 = ~xtmp1 (bitwise NOT via ternlog 0x11): sign-flip pattern becomes
  // Long.MAX_VALUE, written to the positive special lanes.
  vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}
5001
// EVEX fixup for a vector double->int cast: dst holds the raw vcvttpd2dq
// result (int lanes). Lanes equal to 0x80000000 (float_sign_flip) came from a
// special double source value and are rewritten to the Java result: 0 for NaN,
// Integer.MAX_VALUE for positive specials; negative specials keep MIN_VALUE.
void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                     Register rscratch, AddressLiteral float_sign_flip,
                                                                     int vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
  Label done;
  // ktmp1 = mask of int lanes holding the special marker value 0x80000000.
  evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
  Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  // Fast path: no special lanes, nothing to fix up.
  jccb(Assembler::equal, done);

  // ktmp2 = NaN lanes in the double source; zero those dst lanes.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);

  // ktmp1 = remaining special lanes; of those, keep lanes with src >= 0.
  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  // xtmp2 = ~xtmp1 (bitwise NOT via ternlog 0x11): 0x80000000 -> 0x7fffffff,
  // written to the positive special lanes.
  vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}
5023
5024 /*
5025 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation.
5026 * If src is NaN, the result is 0.
5027 * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
5028 * the result is equal to the value of Long.MIN_VALUE.
5029 * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
5030 * the result is equal to the value of Long.MAX_VALUE.
5031 */
// EVEX fixup for a vector double->long cast: dst holds the raw evcvttpd2qq
// result. Lanes equal to 0x8000000000000000 (double_sign_flip) came from a
// special double source value and are rewritten to the Java result: 0 for NaN,
// Long.MAX_VALUE for positive specials; negative specials keep MIN_VALUE.
void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                      Register rscratch, AddressLiteral double_sign_flip,
                                                                      int vec_enc) {
  assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");

  Label done;
  // ktmp1 = mask of lanes holding the special marker value.
  evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
  evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  // Fast path: no special lanes, nothing to fix up.
  jccb(Assembler::equal, done);

  // ktmp2 = NaN lanes in the double source; zero those dst lanes.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);

  // ktmp1 = remaining special lanes; of those, keep lanes with src >= 0.
  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  // xtmp2 = ~xtmp1 (bitwise NOT via ternlog 0x11): sign-flip pattern becomes
  // Long.MAX_VALUE, written to the positive special lanes.
  vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}
5054
5055 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
5056 XMMRegister xtmp, int index, int vec_enc) {
5057 assert(vec_enc < Assembler::AVX_512bit, "");
5058 if (vec_enc == Assembler::AVX_256bit) {
5059 vextractf128_high(xtmp, src);
5060 vshufps(dst, src, xtmp, index, vec_enc);
5061 } else {
5062 vshufps(dst, src, zero, index, vec_enc);
5063 }
5064 }
5065
// AVX (non-EVEX) fixup for a vector double->int cast: dst holds the raw
// vcvttpd2dq result packed into a 128-bit register. Lanes equal to 0x80000000
// (float_sign_flip) came from a special double source value and are rewritten
// to the Java result: 0 for NaN, Integer.MAX_VALUE for positive specials;
// negative specials keep 0x80000000 == Integer.MIN_VALUE. Source masks are
// computed at src_vec_enc width and packed down to doublewords before blending.
void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                                                    XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
                                                                    AddressLiteral float_sign_flip, int src_vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");

  Label done;
  // Compare the destination lanes with float_sign_flip
  // value to get mask for all special values.
  movdqu(xtmp1, float_sign_flip, rscratch);
  vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
  ptest(xtmp2, xtmp2);
  // Fast path: no special lanes, nothing to fix up.
  jccb(Assembler::equal, done);

  // Flip float_sign_flip to get max integer value.
  vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
  pxor(xtmp1, xtmp4);

  // Set destination lanes corresponding to unordered source lanes as zero.
  vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
  vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);

  // Shuffle mask vector and pack lower doubleword from each quadword lane.
  vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
  vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);

  // Recompute the mask for remaining special value.
  pxor(xtmp2, xtmp3);
  // Extract mask corresponding to non-negative source lanes.
  vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);

  // Shuffle mask vector and pack lower doubleword from each quadword lane.
  vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
  pand(xtmp3, xtmp2);

  // Replace destination lanes holding special value(0x80000000) with max int
  // if corresponding source lane holds a +ve value.
  vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
  bind(done);
}
5105
5106
// Narrow int vector lanes in dst down to shorts or bytes (AVX/AVX2 path).
// The lanes are first masked to the target width so the unsigned pack
// instructions cannot saturate, then packed; for 256-bit vectors the packed
// halves must additionally be gathered across the 128-bit lane boundary.
// 'zero' must hold an all-zero vector; xtmp is scratch for the cross-lane pack.
void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
                                                   XMMRegister xtmp, Register rscratch, int vec_enc) {
  switch(to_elem_bt) {
    case T_SHORT:
      assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
      // Keep only the low 16 bits of each int lane, then pack int->short.
      vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
      vpackusdw(dst, dst, zero, vec_enc);
      if (vec_enc == Assembler::AVX_256bit) {
        // vpackusdw packs within 128-bit lanes; gather the halves together.
        vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
      }
      break;
    case T_BYTE:
      assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
      // Keep only the low 8 bits of each int lane, then pack int->short->byte.
      vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
      vpackusdw(dst, dst, zero, vec_enc);
      if (vec_enc == Assembler::AVX_256bit) {
        vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
      }
      vpackuswb(dst, dst, zero, vec_enc);
      break;
    default: assert(false, "Unexpected basic type for target of vector cast int to subword: %s", type2name(to_elem_bt));
  }
}
5130
5131 /*
5132 * Algorithm for vector D2L and F2I conversions (AVX 10.2 unsupported):-
5133 * a) Perform vector D2L/F2I cast.
5134 * b) Choose fast path if none of the result vector lane contains 0x80000000 value.
5135 * It signifies that source value could be any of the special floating point
5136 * values(NaN,-Inf,Inf,Max,-Min).
5137 * c) Set destination to zero if source is NaN value.
5138 * d) Replace 0x80000000 with MaxInt if source lane contains a +ve value.
5139 */
5140
// Vector float -> int/short/byte cast, AVX/AVX2 path: truncating convert,
// fix up special-value lanes (NaN/Inf/overflow), then narrow if the target
// element is smaller than an int.
void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                           XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
                                           AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
  int to_elem_sz = type2aelembytes(to_elem_bt);
  assert(to_elem_sz <= 4, "");
  vcvttps2dq(dst, src, vec_enc);
  vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc);
  if (to_elem_sz < 4) {
    // Narrow int lanes to short/byte; xtmp4 is zeroed to serve as the
    // zero-vector operand required by the packing helper.
    vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
    vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc);
  }
}
5153
// Vector float -> int/short/byte cast, EVEX path: truncating convert, fix up
// special-value lanes using opmask registers, then narrow with the AVX-512
// down-convert instructions when the target element is smaller than an int.
void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                            XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
                                            Register rscratch, int vec_enc) {
  int to_elem_sz = type2aelembytes(to_elem_bt);
  assert(to_elem_sz <= 4, "");
  vcvttps2dq(dst, src, vec_enc);
  vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc);
  switch(to_elem_bt) {
    case T_INT:
      break;
    case T_SHORT:
      evpmovdw(dst, dst, vec_enc);
      break;
    case T_BYTE:
      evpmovdb(dst, dst, vec_enc);
      break;
    default: assert(false, "Unexpected basic type for target of vector castF2X EVEX: %s", type2name(to_elem_bt));
  }
}
5173
// Vector float -> long cast, EVEX path: truncating convert to quadwords,
// then fix up special-value lanes (NaN/Inf/overflow) to the Java results.
void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                            KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
                                            Register rscratch, int vec_enc) {
  evcvttps2qq(dst, src, vec_enc);
  vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc);
}
5180
5181 // Handling for downcasting from double to integer or sub-word types on AVX2.
5182 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5183 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5,
5184 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5185 int to_elem_sz = type2aelembytes(to_elem_bt);
5186 assert(to_elem_sz < 8, "");
5187 vcvttpd2dq(dst, src, vec_enc);
5188 vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch,
5189 float_sign_flip, vec_enc);
5190 if (to_elem_sz < 4) {
5191 // xtmp4 holds all zero lanes.
5192 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit);
5193 }
5194 }
5195
// Cast double lanes down to long, int or a sub-word integral type (EVEX).
// With AVX512DQ the cast goes through a double->long conversion followed by
// lane narrowing; without it the cast goes through double->int directly, so
// only targets of at most 4 bytes are supported on that path.
void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src,
                                            XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1,
                                            KRegister ktmp2, AddressLiteral sign_flip,
                                            Register rscratch, int vec_enc) {
  if (VM_Version::supports_avx512dq()) {
    // Truncating double->long conversion, then fix up NaN/out-of-range lanes.
    evcvttpd2qq(dst, src, vec_enc);
    vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
    // Narrow the long lanes to the requested element size.
    switch(to_elem_bt) {
      case T_LONG:
        break;
      case T_INT:
        evpmovsqd(dst, dst, vec_enc);
        break;
      case T_SHORT:
        evpmovsqd(dst, dst, vec_enc);
        evpmovdw(dst, dst, vec_enc);
        break;
      case T_BYTE:
        evpmovsqd(dst, dst, vec_enc);
        evpmovdb(dst, dst, vec_enc);
        break;
      default: assert(false, "Unexpected basic type for target of vector castD2X AVX512DQ EVEX: %s", type2name(to_elem_bt));
    }
  } else {
    assert(type2aelembytes(to_elem_bt) <= 4, "");
    // Truncating double->int conversion, then fix up NaN/out-of-range lanes.
    vcvttpd2dq(dst, src, vec_enc);
    vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
    // Narrow the int lanes to the requested sub-word element size.
    switch(to_elem_bt) {
      case T_INT:
        break;
      case T_SHORT:
        evpmovdw(dst, dst, vec_enc);
        break;
      case T_BYTE:
        evpmovdb(dst, dst, vec_enc);
        break;
      default: assert(false, "Unexpected basic type for target of vector castD2X EVEX: %s", type2name(to_elem_bt));
    }
  }
}
5236
5237 void C2_MacroAssembler::vector_castF2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5238 switch(to_elem_bt) {
5239 case T_LONG:
5240 evcvttps2qqs(dst, src, vec_enc);
5241 break;
5242 case T_INT:
5243 evcvttps2dqs(dst, src, vec_enc);
5244 break;
5245 case T_SHORT:
5246 evcvttps2dqs(dst, src, vec_enc);
5247 evpmovdw(dst, dst, vec_enc);
5248 break;
5249 case T_BYTE:
5250 evcvttps2dqs(dst, src, vec_enc);
5251 evpmovdb(dst, dst, vec_enc);
5252 break;
5253 default: assert(false, "Unexpected basic type for target of vector castF2X AVX10 (reg src): %s", type2name(to_elem_bt));
5254 }
5255 }
5256
5257 void C2_MacroAssembler::vector_castF2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, Address src, int vec_enc) {
5258 switch(to_elem_bt) {
5259 case T_LONG:
5260 evcvttps2qqs(dst, src, vec_enc);
5261 break;
5262 case T_INT:
5263 evcvttps2dqs(dst, src, vec_enc);
5264 break;
5265 case T_SHORT:
5266 evcvttps2dqs(dst, src, vec_enc);
5267 evpmovdw(dst, dst, vec_enc);
5268 break;
5269 case T_BYTE:
5270 evcvttps2dqs(dst, src, vec_enc);
5271 evpmovdb(dst, dst, vec_enc);
5272 break;
5273 default: assert(false, "Unexpected basic type for target of vector castF2X AVX10 (mem src): %s", type2name(to_elem_bt));
5274 }
5275 }
5276
5277 void C2_MacroAssembler::vector_castD2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5278 switch(to_elem_bt) {
5279 case T_LONG:
5280 evcvttpd2qqs(dst, src, vec_enc);
5281 break;
5282 case T_INT:
5283 evcvttpd2dqs(dst, src, vec_enc);
5284 break;
5285 case T_SHORT:
5286 evcvttpd2dqs(dst, src, vec_enc);
5287 evpmovdw(dst, dst, vec_enc);
5288 break;
5289 case T_BYTE:
5290 evcvttpd2dqs(dst, src, vec_enc);
5291 evpmovdb(dst, dst, vec_enc);
5292 break;
5293 default: assert(false, "Unexpected basic type for target of vector castD2X AVX10 (reg src): %s", type2name(to_elem_bt));
5294 }
5295 }
5296
5297 void C2_MacroAssembler::vector_castD2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, Address src, int vec_enc) {
5298 switch(to_elem_bt) {
5299 case T_LONG:
5300 evcvttpd2qqs(dst, src, vec_enc);
5301 break;
5302 case T_INT:
5303 evcvttpd2dqs(dst, src, vec_enc);
5304 break;
5305 case T_SHORT:
5306 evcvttpd2dqs(dst, src, vec_enc);
5307 evpmovdw(dst, dst, vec_enc);
5308 break;
5309 case T_BYTE:
5310 evcvttpd2dqs(dst, src, vec_enc);
5311 evpmovdb(dst, dst, vec_enc);
5312 break;
5313 default: assert(false, "Unexpected basic type for target of vector castD2X AVX10 (mem src): %s", type2name(to_elem_bt));
5314 }
5315 }
5316
5317 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
5318 AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5319 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
5320 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf.
5321 // and re-instantiate original MXCSR.RC mode after that.
5322 ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5323
5324 mov64(tmp, julong_cast(0.5L));
5325 evpbroadcastq(xtmp1, tmp, vec_enc);
5326 vaddpd(xtmp1, src , xtmp1, vec_enc);
5327 evcvtpd2qq(dst, xtmp1, vec_enc);
5328 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5329 double_sign_flip, vec_enc);;
5330
5331 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5332 }
5333
// Vectorized rounding of float lanes to int (EVEX): floor(val + 0.5) computed
// under a temporarily modified MXCSR rounding mode.
void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
                                                AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
                                                Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
  // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf.
  // and re-instantiate original MXCSR.RC mode after that.
  ldmxcsr(new_mxcsr, tmp /*rscratch*/);

  // Broadcast 0.5f into xtmp1 and add it to every source lane.
  movl(tmp, jint_cast(0.5));
  movq(xtmp1, tmp);
  vbroadcastss(xtmp1, xtmp1, vec_enc);
  vaddps(xtmp1, src , xtmp1, vec_enc);
  vcvtps2dq(dst, xtmp1, vec_enc);
  // Patch up lanes the conversion could not represent (NaN / out-of-range).
  vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
                                              float_sign_flip, vec_enc);

  // Restore the standard MXCSR state.
  ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
}
5351
// Vectorized rounding of float lanes to int (AVX): floor(val + 0.5) computed
// under a temporarily modified MXCSR rounding mode.
void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
                                               AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
                                               Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
  // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf.
  // and re-instantiate original MXCSR.RC mode after that.
  ldmxcsr(new_mxcsr, tmp /*rscratch*/);

  // Broadcast 0.5f into xtmp1 and add it to every source lane.
  movl(tmp, jint_cast(0.5));
  movq(xtmp1, tmp);
  vbroadcastss(xtmp1, xtmp1, vec_enc);
  vaddps(xtmp1, src , xtmp1, vec_enc);
  vcvtps2dq(dst, xtmp1, vec_enc);
  // Patch up lanes the conversion could not represent (NaN / out-of-range).
  vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc);

  // Restore the standard MXCSR state.
  ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
}
5368
5369 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5370 BasicType from_elem_bt, BasicType to_elem_bt) {
5371 switch (from_elem_bt) {
5372 case T_BYTE:
5373 switch (to_elem_bt) {
5374 case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
5375 case T_INT: vpmovzxbd(dst, src, vlen_enc); break;
5376 case T_LONG: vpmovzxbq(dst, src, vlen_enc); break;
5377 default: ShouldNotReachHere();
5378 }
5379 break;
5380 case T_SHORT:
5381 switch (to_elem_bt) {
5382 case T_INT: vpmovzxwd(dst, src, vlen_enc); break;
5383 case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
5384 default: ShouldNotReachHere();
5385 }
5386 break;
5387 case T_INT:
5388 assert(to_elem_bt == T_LONG, "");
5389 vpmovzxdq(dst, src, vlen_enc);
5390 break;
5391 default:
5392 ShouldNotReachHere();
5393 }
5394 }
5395
5396 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5397 BasicType from_elem_bt, BasicType to_elem_bt) {
5398 switch (from_elem_bt) {
5399 case T_BYTE:
5400 switch (to_elem_bt) {
5401 case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break;
5402 case T_INT: vpmovsxbd(dst, src, vlen_enc); break;
5403 case T_LONG: vpmovsxbq(dst, src, vlen_enc); break;
5404 default: ShouldNotReachHere();
5405 }
5406 break;
5407 case T_SHORT:
5408 switch (to_elem_bt) {
5409 case T_INT: vpmovsxwd(dst, src, vlen_enc); break;
5410 case T_LONG: vpmovsxwq(dst, src, vlen_enc); break;
5411 default: ShouldNotReachHere();
5412 }
5413 break;
5414 case T_INT:
5415 assert(to_elem_bt == T_LONG, "");
5416 vpmovsxdq(dst, src, vlen_enc);
5417 break;
5418 default:
5419 ShouldNotReachHere();
5420 }
5421 }
5422
// Cast a boolean-vector mask between element sizes on non-AVX512 targets.
// Widening uses sign extension (mask lanes are 0/-1); narrowing uses signed
// saturating packs, with a cross-lane vpermq fix-up for 256-bit vectors
// because vpack* operates within 128-bit lanes.
void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src,
                                         BasicType dst_bt, BasicType src_bt, int vlen) {
  // Encoding is chosen from the larger of the two element sizes.
  int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen);
  assert(vlen_enc != AVX_512bit, "");

  int dst_bt_size = type2aelembytes(dst_bt);
  int src_bt_size = type2aelembytes(src_bt);
  if (dst_bt_size > src_bt_size) {
    // Widening: sign-extend by the size ratio (0 stays 0, -1 stays -1).
    switch (dst_bt_size / src_bt_size) {
      case 2: vpmovsxbw(dst, src, vlen_enc); break;
      case 4: vpmovsxbd(dst, src, vlen_enc); break;
      case 8: vpmovsxbq(dst, src, vlen_enc); break;
      default: ShouldNotReachHere();
    }
  } else {
    assert(dst_bt_size < src_bt_size, "");
    // Narrowing: saturating pack halves the element size each step; for
    // 256-bit vectors the packed halves must be re-interleaved with vpermq.
    switch (src_bt_size / dst_bt_size) {
      case 2: {
        if (vlen_enc == AVX_128bit) {
          vpacksswb(dst, src, src, vlen_enc);
        } else {
          vpacksswb(dst, src, src, vlen_enc);
          // Gather the low quadwords of both 128-bit lanes (selector 0x08).
          vpermq(dst, dst, 0x08, vlen_enc);
        }
        break;
      }
      case 4: {
        if (vlen_enc == AVX_128bit) {
          vpackssdw(dst, src, src, vlen_enc);
          vpacksswb(dst, dst, dst, vlen_enc);
        } else {
          vpackssdw(dst, src, src, vlen_enc);
          vpermq(dst, dst, 0x08, vlen_enc);
          vpacksswb(dst, dst, dst, AVX_128bit);
        }
        break;
      }
      case 8: {
        if (vlen_enc == AVX_128bit) {
          // First step (8->4 bytes) done with a dword shuffle, then pack twice.
          vpshufd(dst, src, 0x08, vlen_enc);
          vpackssdw(dst, dst, dst, vlen_enc);
          vpacksswb(dst, dst, dst, vlen_enc);
        } else {
          vpshufd(dst, src, 0x08, vlen_enc);
          vpermq(dst, dst, 0x08, vlen_enc);
          vpackssdw(dst, dst, dst, AVX_128bit);
          vpacksswb(dst, dst, dst, AVX_128bit);
        }
        break;
      }
      default: ShouldNotReachHere();
    }
  }
}
5477
5478 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
5479 bool merge, BasicType bt, int vlen_enc) {
5480 if (bt == T_INT) {
5481 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5482 } else {
5483 assert(bt == T_LONG, "");
5484 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5485 }
5486 }
5487
5488 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
5489 bool merge, BasicType bt, int vlen_enc) {
5490 if (bt == T_INT) {
5491 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5492 } else {
5493 assert(bt == T_LONG, "");
5494 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5495 }
5496 }
5497
// Expand a long mask value in GPR src into a byte-per-lane mask vector in dst:
// each mask bit becomes one 0/1 byte, materialized 8 lanes (one quadword) at a
// time via pdepq against the 0x01 byte pattern.
void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
                                               Register rtmp2, XMMRegister xtmp, int mask_len,
                                               int vec_enc) {
  int index = 0;
  int vindex = 0;
  // Deposit the low 8 mask bits into the low bit of each of 8 bytes.
  mov64(rtmp1, 0x0101010101010101L);
  pdepq(rtmp1, src, rtmp1);
  if (mask_len > 8) {
    // Keep a shiftable copy of the source bits and seed the staging register.
    movq(rtmp2, src);
    vpxor(xtmp, xtmp, xtmp, vec_enc);
    movq(xtmp, rtmp1);
  }
  movq(dst, rtmp1);

  // Process the remaining mask bits 8 at a time, staging pairs of quadwords
  // in xtmp before inserting them into dst as full 128-bit lanes.
  mask_len -= 8;
  while (mask_len > 0) {
    assert ((mask_len & 0x7) == 0, "mask must be multiple of 8");
    index++;
    if ((index % 2) == 0) {
      // Starting a fresh 128-bit staging chunk.
      pxor(xtmp, xtmp);
    }
    mov64(rtmp1, 0x0101010101010101L);
    shrq(rtmp2, 8);
    pdepq(rtmp1, rtmp2, rtmp1);
    pinsrq(xtmp, rtmp1, index % 2);
    vindex = index / 2;
    if (vindex) {
      // Write entire 16 byte vector when both 64 bit
      // lanes are update to save redundant instructions.
      if (index % 2) {
        vinsertf128(dst, dst, xtmp, vindex);
      }
    } else {
      vmovdqu(dst, xtmp);
    }
    mask_len -= 8;
  }
}
5536
// Reduce a scalar mask value held in tmp according to opc:
//   TrueCount  -> population count
//   LastTrue   -> index of highest set bit, or -1 when the mask is zero
//   FirstTrue  -> index of lowest set bit, or masklen when the mask is zero
//   ToLong     -> the mask value itself (already in dst == tmp)
// Result is left in dst; tmp is clobbered.
void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
  switch(opc) {
    case Op_VectorMaskTrueCount:
      popcntq(dst, tmp);
      break;
    case Op_VectorMaskLastTrue:
      if (VM_Version::supports_lzcnt()) {
        // lzcnt yields 64 for a zero input, making 63 - 64 = -1 naturally.
        lzcntq(tmp, tmp);
        movl(dst, 63);
        subl(dst, tmp);
      } else {
        // bsr leaves dst untouched on zero input; preload -1 and cmov.
        movl(dst, -1);
        bsrq(tmp, tmp);
        cmov32(Assembler::notZero, dst, tmp);
      }
      break;
    case Op_VectorMaskFirstTrue:
      if (VM_Version::supports_bmi1()) {
        if (masklen < 32) {
          // Set a sentinel bit at position masklen so an empty mask
          // yields masklen rather than the tzcnt zero-input value.
          orl(tmp, 1 << masklen);
          tzcntl(dst, tmp);
        } else if (masklen == 32) {
          tzcntl(dst, tmp);
        } else {
          assert(masklen == 64, "");
          tzcntq(dst, tmp);
        }
      } else {
        if (masklen < 32) {
          orl(tmp, 1 << masklen);
          bsfl(dst, tmp);
        } else {
          assert(masklen == 32 || masklen == 64, "");
          // bsf leaves dst untouched on zero input; preload masklen and cmov.
          movl(dst, masklen);
          if (masklen == 32) {
            bsfl(tmp, tmp);
          } else {
            bsfq(tmp, tmp);
          }
          cmov32(Assembler::notZero, dst, tmp);
        }
      }
      break;
    case Op_VectorMaskToLong:
      assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
      break;
    default: assert(false, "Unhandled mask operation");
  }
}
5586
// Mask reduction (true-count / first / last / to-long) for an opmask (KRegister)
// source: move the mask bits into GPR tmp, clip stray high bits if needed, and
// delegate the actual reduction to vector_mask_operation_helper.
void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
                                              int masklen, int masksize, int vec_enc) {
  assert(VM_Version::supports_popcnt(), "");

  // Without AVX512BW only 16-bit opmask moves are available.
  if(VM_Version::supports_avx512bw()) {
    kmovql(tmp, mask);
  } else {
    assert(masklen <= 16, "");
    kmovwl(tmp, mask);
  }

  // Mask generated out of partial vector comparisons/replicate/mask manipulation
  // operations needs to be clipped.
  // FirstTrue is exempt because its sentinel-bit handling tolerates extra bits.
  if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
    andq(tmp, (1 << masklen) - 1);
  }

  vector_mask_operation_helper(opc, dst, tmp, masklen);
}
5606
// Mask reduction for a vector-register mask source (pre-AVX512 masks): extract
// the per-lane mask bits into GPR tmp with the movmsk family, clip stray bits
// where the extraction produced more bits than mask lanes, then delegate to
// vector_mask_operation_helper.
void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
                                              Register tmp, int masklen, BasicType bt, int vec_enc) {
  assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) ||
         (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), "");
  assert(VM_Version::supports_popcnt(), "");

  bool need_clip = false;
  switch(bt) {
    case T_BOOLEAN:
      // While masks of other types contain 0, -1; boolean masks contain lane values of 0, 1
      // Negate (0 - lane) to turn 0/1 lanes into 0/-1 before vpmovmskb.
      vpxor(xtmp, xtmp, xtmp, vec_enc);
      vpsubb(xtmp, xtmp, mask, vec_enc);
      vpmovmskb(tmp, xtmp, vec_enc);
      need_clip = masklen < 16;
      break;
    case T_BYTE:
      vpmovmskb(tmp, mask, vec_enc);
      need_clip = masklen < 16;
      break;
    case T_SHORT:
      // Pack word lanes down to bytes so vpmovmskb yields one bit per lane.
      vpacksswb(xtmp, mask, mask, vec_enc);
      if (masklen >= 16) {
        // Re-interleave the packed halves of the 256-bit vector.
        vpermpd(xtmp, xtmp, 8, vec_enc);
      }
      vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
      need_clip = masklen < 16;
      break;
    case T_INT:
    case T_FLOAT:
      vmovmskps(tmp, mask, vec_enc);
      need_clip = masklen < 4;
      break;
    case T_LONG:
    case T_DOUBLE:
      vmovmskpd(tmp, mask, vec_enc);
      need_clip = masklen < 2;
      break;
    default: assert(false, "Unhandled type, %s", type2name(bt));
  }

  // Mask generated out of partial vector comparisons/replicate/mask manipulation
  // operations needs to be clipped.
  // FirstTrue is exempt because its sentinel-bit handling tolerates extra bits.
  if (need_clip && opc != Op_VectorMaskFirstTrue) {
    // need_clip implies masklen < 32
    andq(tmp, (1 << masklen) - 1);
  }

  vector_mask_operation_helper(opc, dst, tmp, masklen);
}
5656
// Compress the set bits of opmask src towards bit 0 (mask counterpart of a
// lane-compress): with k set bits in the low mask_len bits of src, dst gets
// its k lowest bits set. Implemented via pextq of an all-ones pattern.
void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
                                             Register rtmp2, int mask_len) {
  kmov(rtmp1, src);
  // Keep only the low mask_len bits of the mask.
  andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
  mov64(rtmp2, -1L);
  // Extract one 1-bit per set mask bit, packed contiguously from bit 0.
  pextq(rtmp2, rtmp2, rtmp1);
  kmov(dst, rtmp2);
}
5665
// AVX2 fallback for lane compress/expand (no AVX512 vcompress/vexpand):
// derive a row index from the mask bits, load the matching permutation row
// from a precomputed stub table, permute the source, and zero the lanes the
// table marks as invalid. Only 4- and 8-byte element types are supported.
void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src,
                                                    XMMRegister mask, Register rtmp, Register rscratch,
                                                    XMMRegister permv, XMMRegister xtmp, BasicType bt,
                                                    int vec_enc) {
  assert(type2aelembytes(bt) >= 4, "");
  assert(opcode == Op_CompressV || opcode == Op_ExpandV, "");
  address compress_perm_table = nullptr;
  address expand_perm_table = nullptr;
  // Pick the element-size-specific tables and extract the mask bits into rtmp.
  if (type2aelembytes(bt) == 8) {
    compress_perm_table = StubRoutines::x86::compress_perm_table64();
    expand_perm_table = StubRoutines::x86::expand_perm_table64();
    vmovmskpd(rtmp, mask, vec_enc);
  } else {
    compress_perm_table = StubRoutines::x86::compress_perm_table32();
    expand_perm_table = StubRoutines::x86::expand_perm_table32();
    vmovmskps(rtmp, mask, vec_enc);
  }
  shlq(rtmp, 5); // for 32 byte permute row.
  if (opcode == Op_CompressV) {
    lea(rscratch, ExternalAddress(compress_perm_table));
  } else {
    lea(rscratch, ExternalAddress(expand_perm_table));
  }
  addptr(rtmp, rscratch);
  vmovdqu(permv, Address(rtmp));
  vpermps(dst, permv, src, Assembler::AVX_256bit);
  vpxor(xtmp, xtmp, xtmp, vec_enc);
  // Blend the result with zero vector using permute mask, each column entry
  // in a permute table row contains either a valid permute index or a -1 (default)
  // value, this can potentially be used as a blending mask after
  // compressing/expanding the source vector lanes.
  vblendvps(dst, dst, xtmp, permv, vec_enc, true, permv);
}
5699
5700 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
5701 bool merge, BasicType bt, int vec_enc) {
5702 if (opcode == Op_CompressV) {
5703 switch(bt) {
5704 case T_BYTE:
5705 evpcompressb(dst, mask, src, merge, vec_enc);
5706 break;
5707 case T_CHAR:
5708 case T_SHORT:
5709 evpcompressw(dst, mask, src, merge, vec_enc);
5710 break;
5711 case T_INT:
5712 evpcompressd(dst, mask, src, merge, vec_enc);
5713 break;
5714 case T_FLOAT:
5715 evcompressps(dst, mask, src, merge, vec_enc);
5716 break;
5717 case T_LONG:
5718 evpcompressq(dst, mask, src, merge, vec_enc);
5719 break;
5720 case T_DOUBLE:
5721 evcompresspd(dst, mask, src, merge, vec_enc);
5722 break;
5723 default:
5724 fatal("Unsupported type %s", type2name(bt));
5725 break;
5726 }
5727 } else {
5728 assert(opcode == Op_ExpandV, "");
5729 switch(bt) {
5730 case T_BYTE:
5731 evpexpandb(dst, mask, src, merge, vec_enc);
5732 break;
5733 case T_CHAR:
5734 case T_SHORT:
5735 evpexpandw(dst, mask, src, merge, vec_enc);
5736 break;
5737 case T_INT:
5738 evpexpandd(dst, mask, src, merge, vec_enc);
5739 break;
5740 case T_FLOAT:
5741 evexpandps(dst, mask, src, merge, vec_enc);
5742 break;
5743 case T_LONG:
5744 evpexpandq(dst, mask, src, merge, vec_enc);
5745 break;
5746 case T_DOUBLE:
5747 evexpandpd(dst, mask, src, merge, vec_enc);
5748 break;
5749 default:
5750 fatal("Unsupported type %s", type2name(bt));
5751 break;
5752 }
5753 }
5754 }
5755
// Vectorized Math.signum for double/float lanes (EVEX): produce -1/+1 by sign,
// then restore the original lane value for NaN and +/-0.0 inputs.
void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
                                           KRegister ktmp1, int vec_enc) {
  if (opcode == Op_SignumVD) {
    // dst = 0.0 - 1.0 = -1.0 in every lane.
    vsubpd(dst, zero, one, vec_enc);
    // if src < 0 ? -1 : 1
    evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
    evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
    // if src == NaN, -0.0 or 0.0 return src.
    // (EQ_UQ matches both zeroes and is true for unordered, i.e. NaN.)
    evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
    evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
  } else {
    assert(opcode == Op_SignumVF, "");
    // dst = 0.0f - 1.0f = -1.0f in every lane.
    vsubps(dst, zero, one, vec_enc);
    // if src < 0 ? -1 : 1
    evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
    evblendmps(dst, ktmp1, one, dst, true, vec_enc);
    // if src == NaN, -0.0 or 0.0 return src.
    evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
    evblendmps(dst, ktmp1, dst, src, true, vec_enc);
  }
}
5777
// Vectorized Math.signum for double/float lanes (AVX): produce -1/+1 by sign,
// then restore the original lane value for NaN and +/-0.0 inputs.
void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
                                          XMMRegister xtmp1, int vec_enc) {
  if (opcode == Op_SignumVD) {
    // dst = 0.0 - 1.0 = -1.0 in every lane.
    vsubpd(dst, zero, one, vec_enc);
    // if src < 0 ? -1 : 1
    vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1);
    // if src == NaN, -0.0 or 0.0 return src.
    // (EQ_UQ matches both zeroes and is true for unordered, i.e. NaN.)
    vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
    vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
  } else {
    assert(opcode == Op_SignumVF, "");
    // dst = 0.0f - 1.0f = -1.0f in every lane.
    vsubps(dst, zero, one, vec_enc);
    // if src < 0 ? -1 : 1
    vblendvps(dst, one, dst, src, vec_enc, true, xtmp1);
    // if src == NaN, -0.0 or 0.0 return src.
    vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
    vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
  }
}
5797
// Materialize an opmask with the low mask_len bits taken from GPR src: move
// the value into dst and right-shift away any bits above mask_len.
void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
  if (VM_Version::supports_avx512bw()) {
    if (mask_len > 32) {
      kmovql(dst, src);
    } else {
      kmovdl(dst, src);
      if (mask_len != 32) {
        // Drop the 32 - mask_len bits above the mask.
        kshiftrdl(dst, dst, 32 - mask_len);
      }
    }
  } else {
    // Without AVX512BW only 16-bit opmask operations are available.
    assert(mask_len <= 16, "");
    kmovwl(dst, src);
    if (mask_len != 16) {
      // Drop the 16 - mask_len bits above the mask.
      kshiftrwl(dst, dst, 16 - mask_len);
    }
  }
}
5816
5817 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5818 int lane_size = type2aelembytes(bt);
5819 if ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5820 (is_subword_type(bt) && VM_Version::supports_avx512vlbw())) {
5821 movptr(rtmp, imm32);
5822 switch(lane_size) {
5823 case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5824 case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5825 case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5826 case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
5827 fatal("Unsupported lane size %d", lane_size);
5828 break;
5829 }
5830 } else {
5831 movptr(rtmp, imm32);
5832 movq(dst, rtmp);
5833 switch(lane_size) {
5834 case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5835 case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5836 case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5837 case 8 : vpbroadcastq(dst, dst, vec_enc); break;
5838 fatal("Unsupported lane size %d", lane_size);
5839 break;
5840 }
5841 }
5842 }
5843
5844 //
5845 // Following is lookup table based popcount computation algorithm:-
5846 // Index Bit set count
5847 // [ 0000 -> 0,
5848 // 0001 -> 1,
5849 // 0010 -> 1,
5850 // 0011 -> 2,
5851 // 0100 -> 1,
5852 // 0101 -> 2,
5853 // 0110 -> 2,
5854 // 0111 -> 3,
5855 // 1000 -> 1,
5856 // 1001 -> 2,
5857 // 1010 -> 3,
5858 // 1011 -> 3,
5859 // 1100 -> 2,
5860 // 1101 -> 3,
5861 // 1111 -> 4 ]
5862 // a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
5863 // shuffle indices for lookup table access.
5864 // b. Right shift each byte of vector lane by 4 positions.
5865 // c. Count the number of 1s in 4 MSB bits each byte. These bits are used as
5866 // shuffle indices for lookup table access.
5867 // d. Add the bitset count of upper and lower 4 bits of each byte.
5868 // e. Unpack double words to quad words and compute sum of absolute difference of bitset
5869 // count of all the bytes of a quadword.
5870 // f. Perform step e. for upper 128bit vector lane.
5871 // g. Pack the bitset count of quadwords back to double word.
5872 // h. Unpacking and packing operations are not needed for 64bit vector lane.
5873
// Per-byte popcount using the 4-bit lookup table (steps a-d of the algorithm
// described above): look up the popcount of each nibble via vpshufb and add
// the two per-nibble counts.
void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
  assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
  vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc); // nibble mask
  vpsrlw(dst, src, 4, vec_enc);                        // move upper nibbles down
  vpand(dst, dst, xtmp1, vec_enc);                     // isolate upper nibbles
  vpand(xtmp1, src, xtmp1, vec_enc);                   // isolate lower nibbles
  vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
  vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);               // LUT lookup, lower nibbles
  vpshufb(dst, xtmp2, dst, vec_enc);                   // LUT lookup, upper nibbles
  vpaddb(dst, dst, xtmp1, vec_enc);                    // per-byte popcount
}
5886
// Popcount of each 32-bit lane: byte-wise popcount followed by horizontal
// summation of the bytes of each dword (via vpsadbw on unpacked halves).
void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                            XMMRegister xtmp2, Register rtmp, int vec_enc) {
  vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
  // Following code is as per steps e,f,g and h of above algorithm.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
  vpsadbw(dst, dst, xtmp2, vec_enc);
  vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
  vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
  vpackuswb(dst, xtmp1, dst, vec_enc);
}
5898
// Popcount of each 16-bit lane: byte-wise popcount, then add the counts of
// the two bytes of each word.
void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
  vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
  // Add the popcount of upper and lower bytes of word.
  vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
  vpsrlw(dst, xtmp1, 8, vec_enc);      // upper-byte counts, shifted down
  vpand(xtmp1, xtmp1, xtmp2, vec_enc); // lower-byte counts
  vpaddw(dst, dst, xtmp1, vec_enc);
}
5908
// Popcount of each 64-bit lane: byte-wise popcount, then vpsadbw against zero
// sums the eight byte counts of each quadword into the quadword.
void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
  vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  vpsadbw(dst, xtmp1, xtmp2, vec_enc);
}
5915
5916 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5917 XMMRegister xtmp2, Register rtmp, int vec_enc) {
5918 switch(bt) {
5919 case T_LONG:
5920 vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5921 break;
5922 case T_INT:
5923 vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5924 break;
5925 case T_CHAR:
5926 case T_SHORT:
5927 vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5928 break;
5929 case T_BYTE:
5930 case T_BOOLEAN:
5931 vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5932 break;
5933 default:
5934 fatal("Unsupported type %s", type2name(bt));
5935 break;
5936 }
5937 }
5938
5939 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5940 KRegister mask, bool merge, int vec_enc) {
5941 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5942 switch(bt) {
5943 case T_LONG:
5944 assert(VM_Version::supports_avx512_vpopcntdq(), "");
5945 evpopcntq(dst, mask, src, merge, vec_enc);
5946 break;
5947 case T_INT:
5948 assert(VM_Version::supports_avx512_vpopcntdq(), "");
5949 evpopcntd(dst, mask, src, merge, vec_enc);
5950 break;
5951 case T_CHAR:
5952 case T_SHORT:
5953 assert(VM_Version::supports_avx512_bitalg(), "");
5954 evpopcntw(dst, mask, src, merge, vec_enc);
5955 break;
5956 case T_BYTE:
5957 case T_BOOLEAN:
5958 assert(VM_Version::supports_avx512_bitalg(), "");
5959 evpopcntb(dst, mask, src, merge, vec_enc);
5960 break;
5961 default:
5962 fatal("Unsupported type %s", type2name(bt));
5963 break;
5964 }
5965 }
5966
5967 // Bit reversal algorithm first reverses the bits of each byte followed by
5968 // a byte level reversal for multi-byte primitive types (short/int/long).
5969 // Algorithm performs a lookup table access to get reverse bit sequence
5970 // corresponding to a 4 bit value. Thus a reverse bit sequence for a byte
5971 // is obtained by swapping the reverse bit sequences of upper and lower
5972 // nibble of a byte.
void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                           XMMRegister xtmp2, Register rtmp, int vec_enc) {
  // Three strategies depending on CPU features (see algorithm comment above):
  // 1. AVX512VLBW: nibble LUT lookup via vpshufb, then byte reversal.
  // 2. Plain AVX512 (512-bit, no VLBW): shift-based bit swapping.
  // 3. AVX/AVX2: same LUT approach with non-EVEX logicals.
  if (VM_Version::supports_avx512vlbw()) {

    // Get the reverse bit sequence of lower nibble of each byte.
    vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
    vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
    evpandq(dst, xtmp2, src, vec_enc);
    vpshufb(dst, xtmp1, dst, vec_enc);
    vpsllq(dst, dst, 4, vec_enc);

    // Get the reverse bit sequence of upper nibble of each byte.
    vpandn(xtmp2, xtmp2, src, vec_enc);
    vpsrlq(xtmp2, xtmp2, 4, vec_enc);
    vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);

    // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
    // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
    evporq(xtmp2, dst, xtmp2, vec_enc);
    vector_reverse_byte(bt, dst, xtmp2, vec_enc);

  } else if(vec_enc == Assembler::AVX_512bit) {
    // Shift based bit reversal.
    assert(bt == T_LONG || bt == T_INT, "");

    // Swap lower and upper nibble of each byte.
    vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);

    // Swap two least and most significant bits of each nibble.
    vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);

    // Swap adjacent pair of bits.
    // Copy intermediate result into xtmp1 since vector_swap_nbits reads src
    // while writing dst.
    evmovdqul(xtmp1, k0, dst, true, vec_enc);
    vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);

    evmovdqul(xtmp1, k0, dst, true, vec_enc);
    vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
  } else {
    vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
    vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);

    // Get the reverse bit sequence of lower nibble of each byte.
    vpand(dst, xtmp2, src, vec_enc);
    vpshufb(dst, xtmp1, dst, vec_enc);
    vpsllq(dst, dst, 4, vec_enc);

    // Get the reverse bit sequence of upper nibble of each byte.
    vpandn(xtmp2, xtmp2, src, vec_enc);
    vpsrlq(xtmp2, xtmp2, 4, vec_enc);
    vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);

    // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
    // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
    vpor(xtmp2, dst, xtmp2, vec_enc);
    vector_reverse_byte(bt, dst, xtmp2, vec_enc);
  }
}
6030
// Bit reversal within each byte via a single GF2P8AFFINEQB against a
// broadcast bit-reversal matrix, followed by byte reversal for multi-byte
// element types.
void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
                                                XMMRegister xtmp, Register rscratch) {
  assert(VM_Version::supports_gfni(), "");
  assert(rscratch != noreg || always_reachable(mask), "missing");

  // Galois field instruction based bit reversal based on following algorithm.
  // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
  vpbroadcastq(xtmp, mask, vec_enc, rscratch);
  vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
  vector_reverse_byte(bt, dst, xtmp, vec_enc);
}
6042
// Swap adjacent nbits-wide bit groups in every lane: bits selected by bitmask
// are shifted left by nbits, the complementary bits right by nbits, and the
// two halves are OR-ed back together. xtmp1 is clobbered.
void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
                                          XMMRegister xtmp1, Register rtmp, int vec_enc) {
  vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
  evpandq(dst, xtmp1, src, vec_enc);   // masked bits ...
  vpsllq(dst, dst, nbits, vec_enc);    // ... moved left
  vpandn(xtmp1, xtmp1, src, vec_enc);  // complementary bits ...
  vpsrlq(xtmp1, xtmp1, nbits, vec_enc);// ... moved right
  evporq(dst, dst, xtmp1, vec_enc);    // recombine
}
6052
// Byte reversal of each lane for 512-bit vectors without AVX512(VL)BW (no
// usable vpshufb): rotate word/dword halves into place, then swap the bytes
// within each word via vector_swap_nbits.
void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
  // Shift based bit reversal.
  assert(VM_Version::supports_evex(), "");
  switch(bt) {
    case T_LONG:
      // Swap upper and lower double word of each quad word.
      evprorq(xtmp1, k0, src, 32, true, vec_enc);
      // Then swap upper and lower word of each double word.
      evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
      vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
      break;
    case T_INT:
      // Swap upper and lower word of each double word.
      evprord(xtmp1, k0, src, 16, true, vec_enc);
      vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
      break;
    case T_CHAR:
    case T_SHORT:
      // Swap upper and lower byte of each word.
      vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
      break;
    case T_BYTE:
      // Single-byte lanes: a plain copy is the reversal.
      evmovdquq(dst, k0, src, true, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}
6082
// Reverse the byte order of each element of type bt using a table-driven
// VPSHUFB with pre-computed, type-specific shuffle indices from the stub area.
// For T_BYTE the operation degenerates to a copy.
void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
  if (bt == T_BYTE) {
    // Single-byte elements: byte reversal is the identity, pick the copy
    // instruction matching the available encoding.
    if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
      evmovdquq(dst, k0, src, true, vec_enc);
    } else {
      vmovdqu(dst, src);
    }
    return;
  }
  // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
  // pre-computed shuffle indices.
  switch(bt) {
    case T_LONG:
      vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
      break;
    case T_INT:
      vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
      break;
    case T_CHAR:
    case T_SHORT:
      vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
  // Apply the loaded per-type shuffle to reverse the bytes of each element.
  vpshufb(dst, src, dst, vec_enc);
}
6111
// Count leading zeros per element for EVEX targets. INT/LONG map directly to
// the AVX512CD VPLZCNT instructions; SHORT is widened to INT lanes (with an
// all-ones high half so the widening bits never win the count) and repacked;
// BYTE uses a 16-entry nibble lookup table combining the counts of the high
// and low nibbles of each byte.
void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
                                                        KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
  assert(is_integral_type(bt), "");
  assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
  assert(VM_Version::supports_avx512cd(), "");
  switch(bt) {
    case T_LONG:
      evplzcntq(dst, ktmp, src, merge, vec_enc);
      break;
    case T_INT:
      evplzcntd(dst, ktmp, src, merge, vec_enc);
      break;
    case T_SHORT:
      // xtmp1 = -1 (all ones), used as the low half of each widened int lane
      // so the interleaved filler bits cannot contribute to the lzcnt.
      vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
      // Widen the low words of each pair into int lanes and count.
      vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
      evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
      // Same for the high words.
      vpunpckhwd(dst, xtmp1, src, vec_enc);
      evplzcntd(dst, ktmp, dst, merge, vec_enc);
      // Narrow the int counts back to short lanes.
      vpackusdw(dst, xtmp2, dst, vec_enc);
      break;
    case T_BYTE:
      // T1 = Compute leading zero counts of 4 LSB bits of each byte by
      // accessing the lookup table.
      // T2 = Compute leading zero counts of 4 MSB bits of each byte by
      // accessing the lookup table.
      // Add T1 to T2 if 4 MSB bits of byte are all zeros.
      assert(VM_Version::supports_avx512bw(), "");
      evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
      // dst = 0x0F0F0F0F nibble mask.
      vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
      // T1: lookup on the low nibble of each byte.
      vpand(xtmp2, dst, src, vec_enc);
      vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
      // T2: lookup on the high nibble of each byte.
      vpsrlw(xtmp3, src, 4, vec_enc);
      vpand(xtmp3, dst, xtmp3, vec_enc);
      vpshufb(dst, xtmp1, xtmp3, vec_enc);
      // ktmp = mask of bytes whose high nibble is zero; only for those does
      // the low-nibble count T1 get added to T2.
      vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
      evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
      evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}
6156
// Count leading zeros of each byte lane on AVX/AVX2 targets using a 16-entry
// nibble lookup table: the counts of the high and low nibbles are combined,
// adding the low-nibble count only when the high nibble is all zeros.
// Postcondition relied on by callers: xtmp1 holds all zeros on return.
void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                            XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
  // xtmp2 = 0x0F0F0F0F nibble mask.
  vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
  // T1 = Compute leading zero counts of 4 LSB bits of each byte by
  // accessing the lookup table.
  vpand(dst, xtmp2, src, vec_enc);
  vpshufb(dst, xtmp1, dst, vec_enc);
  // T2 = Compute leading zero counts of 4 MSB bits of each byte by
  // accessing the lookup table.
  vpsrlw(xtmp3, src, 4, vec_enc);
  vpand(xtmp3, xtmp2, xtmp3, vec_enc);
  vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
  // Add T1 to T2 if 4 MSB bits of byte are all zeros.
  vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
  // xtmp3 = per-byte mask: 0xFF where the high nibble is zero.
  vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
  vpaddb(dst, dst, xtmp2, vec_enc);
  // Select T1+T2 where the high nibble was zero, otherwise just T2.
  vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
}
6176
// Count leading zeros of each 16-bit lane on AVX/AVX2 targets by first
// computing per-byte counts and then combining the two byte counts of each
// word: when the upper byte is zero its count (8) is added to the lower
// byte's count, otherwise only the upper byte's count is used.
void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
  // Add zero counts of lower byte and upper byte of a word if
  // upper byte holds a zero value.
  vpsrlw(xtmp3, src, 8, vec_enc);
  // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
  vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
  // xtmp2 = (upper-byte count << 8) + per-byte counts; its upper byte then
  // holds upper+lower counts for words with a zero upper byte.
  vpsllw(xtmp2, dst, 8, vec_enc);
  vpaddw(xtmp2, xtmp2, dst, vec_enc);
  // Choose the combined count for zero-upper-byte words, else the plain count.
  vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
  // Move the final per-word count out of the upper byte into the low byte.
  vpsrlw(dst, dst, 8, vec_enc);
}
6190
// Count leading zeros of each 32-bit lane on AVX/AVX2 targets by converting
// the value to float and extracting the (unbiased) exponent, which equals
// 31 - clz for nonzero inputs. Special-cases zero (result 32) and negative
// inputs (result 0) via blends.
void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                           XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
  // By converting the integer to a float, we can obtain the number of leading zeros based on the exponent of the float.
  // As the float exponent contains a bias of 127 for nonzero values, the bias must be removed before interpreting the
  // exponent as the leading zero count.

  // Remove the bit to the right of the highest set bit ensuring that the conversion to float cannot round up to a higher
  // power of 2, which has a higher exponent than the input. This transformation is valid as only the highest set bit
  // contributes to the leading number of zeros.
  vpsrld(dst, src, 1, vec_enc);
  vpandn(dst, dst, src, vec_enc);

  vcvtdq2ps(dst, dst, vec_enc);

  // By comparing the register to itself, all the bits in the destination are set.
  vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);

  // Move the biased exponent to the low end of the lane and mask with 0xFF to discard the sign bit.
  // (-1 >>> 24 == 0xFF per lane.)
  vpsrld(xtmp2, xtmp1, 24, vec_enc);
  vpsrld(dst, dst, 23, vec_enc);
  vpand(dst, xtmp2, dst, vec_enc);

  // Subtract 127 from the exponent, which removes the bias from the exponent.
  // (-1 >>> 25 == 127 per lane.)
  vpsrld(xtmp2, xtmp1, 25, vec_enc);
  vpsubd(dst, dst, xtmp2, vec_enc);

  // xtmp2 = 31 per lane (-1 >>> 27), used below for the negative-input case
  // and for the final 31 - exponent computation.
  vpsrld(xtmp2, xtmp1, 27, vec_enc);

  // If the original value is 0 the exponent would not have bias, so the subtraction creates a negative number. If this
  // is found in any of the lanes, replace the lane with -1 from xtmp1.
  vblendvps(dst, dst, xtmp1, dst, vec_enc, true, xtmp3);

  // If the original value is negative, replace the lane with 31.
  vblendvps(dst, dst, xtmp2, src, vec_enc, true, xtmp3);

  // Subtract the exponent from 31, giving the final result. For 0, the result is 32 as the exponent was replaced with -1,
  // and for negative numbers the result is 0 as the exponent was replaced with 31.
  vpsubd(dst, xtmp2, dst, vec_enc);
}
6230
// Count leading zeros of each 64-bit lane on AVX/AVX2 targets by computing the
// per-32-bit-half counts and combining them: result = clz(top) when the top
// half is nonzero, else 32 + clz(bottom).
// NOTE(review): rtmp is unused in this body; presumably kept so the signature
// matches the sibling byte/short helpers — confirm before removing.
void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                            XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  // Find the leading zeros of the top and bottom halves of the long individually.
  vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);

  // Move the top half result to the bottom half of xtmp1, setting the top half to 0.
  vpsrlq(xtmp1, dst, 32, vec_enc);
  // By moving the top half result to the right by 6 bits, if the top half was empty (i.e. 32 is returned) the result bit will
  // be in the most significant position of the bottom half.
  vpsrlq(xtmp2, dst, 6, vec_enc);

  // In the bottom half, add the top half and bottom half results.
  vpaddq(dst, xtmp1, dst, vec_enc);

  // For the bottom half, choose between the values using the most significant bit of xtmp2.
  // If the MSB is set, then bottom+top in dst is the resulting value. If the top half is less than 32 xtmp1 is chosen,
  // which contains only the top half result.
  // In the top half the MSB is always zero, so the value in xtmp1 is always chosen. This value is always 0, which clears
  // the lane as required.
  vblendvps(dst, xtmp1, dst, xtmp2, vec_enc, true, xtmp3);
}
6252
6253 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
6254 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6255 Register rtmp, int vec_enc) {
6256 assert(is_integral_type(bt), "unexpected type");
6257 assert(vec_enc < Assembler::AVX_512bit, "");
6258 switch(bt) {
6259 case T_LONG:
6260 vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6261 break;
6262 case T_INT:
6263 vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
6264 break;
6265 case T_SHORT:
6266 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6267 break;
6268 case T_BYTE:
6269 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6270 break;
6271 default:
6272 fatal("Unsupported type %s", type2name(bt));
6273 break;
6274 }
6275 }
6276
6277 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
6278 switch(bt) {
6279 case T_BYTE:
6280 vpsubb(dst, src1, src2, vec_enc);
6281 break;
6282 case T_SHORT:
6283 vpsubw(dst, src1, src2, vec_enc);
6284 break;
6285 case T_INT:
6286 vpsubd(dst, src1, src2, vec_enc);
6287 break;
6288 case T_LONG:
6289 vpsubq(dst, src1, src2, vec_enc);
6290 break;
6291 default:
6292 fatal("Unsupported type %s", type2name(bt));
6293 break;
6294 }
6295 }
6296
// Trailing zero count computation is based on the leading zero count operation as per
// the following equation. All AVX3 targets support the AVX512CD feature, which offers
// a direct vector instruction to compute the leading zero count.
// CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
// Count trailing zeros per element on EVEX targets (see the equation above):
// isolate the trailing-zero bits with (x - 1) & ~x, take their leading zero
// count, and subtract from the element bit width.
void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
                                                         XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "");
  // xtmp = -1
  vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
  // xtmp = xtmp + src  (i.e. src - 1)
  vpadd(bt, xtmp4, xtmp4, src, vec_enc);
  // xtmp = xtmp & ~src  (ternlog 0x40 = A & ~B) -- mask of the trailing zeros
  vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
  vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
  // xtmp4 = element bit width; CTZ = width - CLZ.
  vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
  vpsub(bt, dst, xtmp4, dst, vec_enc);
}
6315
// Trailing zero count computation for AVX2 targets is based on the popcount operation as per the following equation
// CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
// Count trailing zeros per element on AVX2 targets (see the equation above):
// x | -x sets all bits from the lowest set bit upward, so its popcount is
// width - ctz; subtract from the element bit width to recover ctz.
void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "");
  // xtmp = 0
  vpxor(xtmp3 , xtmp3, xtmp3, vec_enc);
  // xtmp = 0 - src
  vpsub(bt, xtmp3, xtmp3, src, vec_enc);
  // xtmp = xtmp | src
  vpor(xtmp3, xtmp3, src, vec_enc);
  vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
  // xtmp1 = element bit width; CTZ = width - POPC.
  vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
  vpsub(bt, dst, xtmp1, dst, vec_enc);
}
6331
// Emit unsigned 32-bit division: quotient of rax / divisor is left in rax.
// rdx is clobbered (used as the zero-extended high half for DIV).
// A fastpath avoids the expensive DIV when the divisor has its sign bit set,
// in which case the unsigned quotient can only be 0 or 1.
void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Common case: zero-extend dividend into rdx:rax and divide.
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
  movl(rdx, rax);
  subl(rdx, divisor);
  if (VM_Version::supports_bmi1()) {
    // rax = ~(dividend - divisor) & dividend in a single ANDN.
    andnl(rax, rdx, rax);
  } else {
    notl(rdx);
    andl(rax, rdx);
  }
  // Extract the sign bit as the 0/1 quotient.
  shrl(rax, 31);
  bind(done);
}
6355
// Emit unsigned 32-bit remainder: remainder of rax % divisor is left in rdx.
// rax is clobbered. A fastpath handles divisors with the sign bit set without
// issuing DIV, since the quotient is then at most 1.
void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Common case: zero-extend dividend into rdx:rax; DIV leaves remainder in rdx.
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath when divisor < 0:
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
  // rdx keeps the original dividend; rax computes the correction term.
  movl(rdx, rax);
  subl(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rax, rdx);
  } else {
    notl(rax);
    andl(rax, rdx);
  }
  // Smear the sign bit to all-ones or zero, select the divisor accordingly.
  sarl(rax, 31);
  andl(rax, divisor);
  subl(rdx, rax);
  bind(done);
}
6381
// Emit combined unsigned 32-bit divide+modulo: quotient in rax, remainder in
// rdx, tmp clobbered. Shares the negative-divisor fastpath of udivI/umodI so
// both results are produced without DIV when the divisor's sign bit is set.
void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
  Label done;
  Label neg_divisor_fastpath;

  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Common case: DIV leaves quotient in rax, remainder in rdx.
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
  // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
  // rdx keeps the original dividend for the remainder computation.
  movl(rdx, rax);
  subl(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rax, rdx);
  } else {
    notl(rax);
    andl(rax, rdx);
  }
  // tmp keeps the masked value so both shifts can be taken from it.
  movl(tmp, rax);
  shrl(rax, 31); // quotient
  sarl(tmp, 31);
  andl(tmp, divisor);
  subl(rdx, tmp); // remainder
  bind(done);
}
6412
// Emit a 32-bit bit-order reversal of src into dst. Uses GFNI's byte-level
// bit reversal plus BSWAP when available, otherwise the classic swap of
// progressively larger bit groups (1, 2, 4 bits) followed by BSWAP.
// rtmp and (on the GFNI path) xtmp1/xtmp2 are clobbered.
void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
                                 XMMRegister xtmp2, Register rtmp) {
  if(VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal based on following algorithm.
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
    // Load the 64-bit bit-reversal affine matrix and apply it per byte.
    mov64(rtmp, 0x8040201008040201L);
    movq(xtmp1, src);
    movq(xtmp2, rtmp);
    gf2p8affineqb(xtmp1, xtmp2, 0);
    movq(dst, xtmp1);
  } else {
    // Swap even and odd numbered bits.
    movl(rtmp, src);
    andl(rtmp, 0x55555555);
    shll(rtmp, 1);
    movl(dst, src);
    andl(dst, 0xAAAAAAAA);
    shrl(dst, 1);
    orl(dst, rtmp);

    // Swap LSB and MSB 2 bits of each nibble.
    movl(rtmp, dst);
    andl(rtmp, 0x33333333);
    shll(rtmp, 2);
    andl(dst, 0xCCCCCCCC);
    shrl(dst, 2);
    orl(dst, rtmp);

    // Swap LSB and MSB 4 bits of each byte.
    movl(rtmp, dst);
    andl(rtmp, 0x0F0F0F0F);
    shll(rtmp, 4);
    andl(dst, 0xF0F0F0F0);
    shrl(dst, 4);
    orl(dst, rtmp);
  }
  // Both paths have reversed bits within each byte; reverse the byte order.
  bswapl(dst);
}
6451
// Emit a 64-bit bit-order reversal of src into dst. Same structure as
// reverseI, but the non-GFNI path needs a second scratch GPR (rtmp2) to hold
// the 64-bit masks, reusing each mask's complement via NOT.
// rtmp1, rtmp2 and (on the GFNI path) xtmp1/xtmp2 are clobbered.
void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
                                 XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
  if(VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal based on following algorithm.
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
    // Load the 64-bit bit-reversal affine matrix and apply it per byte.
    mov64(rtmp1, 0x8040201008040201L);
    movq(xtmp1, src);
    movq(xtmp2, rtmp1);
    gf2p8affineqb(xtmp1, xtmp2, 0);
    movq(dst, xtmp1);
  } else {
    // Swap even and odd numbered bits.
    movq(rtmp1, src);
    mov64(rtmp2, 0x5555555555555555L);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 1);
    movq(dst, src);
    notq(rtmp2);            // complement mask: 0xAAAA...
    andq(dst, rtmp2);
    shrq(dst, 1);
    orq(dst, rtmp1);

    // Swap LSB and MSB 2 bits of each nibble.
    movq(rtmp1, dst);
    mov64(rtmp2, 0x3333333333333333L);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 2);
    notq(rtmp2);            // complement mask: 0xCCCC...
    andq(dst, rtmp2);
    shrq(dst, 2);
    orq(dst, rtmp1);

    // Swap LSB and MSB 4 bits of each byte.
    movq(rtmp1, dst);
    mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 4);
    notq(rtmp2);            // complement mask: 0xF0F0...
    andq(dst, rtmp2);
    shrq(dst, 4);
    orq(dst, rtmp1);
  }
  // Both paths have reversed bits within each byte; reverse the byte order.
  bswapq(dst);
}
6496
// Emit unsigned 64-bit division: quotient of rax / divisor is left in rax.
// rdx is clobbered. 64-bit analogue of udivI; when the divisor's sign bit is
// set the unsigned quotient can only be 0 or 1, avoiding the slow DIV.
void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Common case: zero-extend dividend into rdx:rax and divide.
  xorl(rdx, rdx);
  divq(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
  movq(rdx, rax);
  subq(rdx, divisor);
  if (VM_Version::supports_bmi1()) {
    // rax = ~(dividend - divisor) & dividend in a single ANDN.
    andnq(rax, rdx, rax);
  } else {
    notq(rdx);
    andq(rax, rdx);
  }
  // Extract the sign bit as the 0/1 quotient.
  shrq(rax, 63);
  bind(done);
}
6520
// Emit unsigned 64-bit remainder: remainder of rax % divisor is left in rdx.
// rax is clobbered. 64-bit analogue of umodI.
void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Common case: zero-extend dividend into rdx:rax; DIV leaves remainder in rdx.
  xorq(rdx, rdx);
  divq(divisor);
  jmp(done);
  bind(neg_divisor_fastpath);
  // Fastpath when divisor < 0:
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
  // rdx keeps the original dividend; rax computes the correction term.
  movq(rdx, rax);
  subq(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rax, rdx);
  } else {
    notq(rax);
    andq(rax, rdx);
  }
  // Smear the sign bit to all-ones or zero, select the divisor accordingly.
  sarq(rax, 63);
  andq(rax, divisor);
  subq(rdx, rax);
  bind(done);
}
6546
// Emit combined unsigned 64-bit divide+modulo: quotient in rax, remainder in
// rdx, tmp clobbered. 64-bit analogue of udivmodI.
void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Common case: DIV leaves quotient in rax, remainder in rdx.
  xorq(rdx, rdx);
  divq(divisor);
  jmp(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
  // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
  // rdx keeps the original dividend for the remainder computation.
  movq(rdx, rax);
  subq(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rax, rdx);
  } else {
    notq(rax);
    andq(rax, rdx);
  }
  // tmp keeps the masked value so both shifts can be taken from it.
  movq(tmp, rax);
  shrq(rax, 63); // quotient
  sarq(tmp, 63);
  andq(tmp, divisor);
  subq(rdx, tmp); // remainder
  bind(done);
}
6576
// Cross-lane byte rearrangement for 512-bit vectors: VPSHUFB only shuffles
// within 128-bit lanes, so each of the four source lanes is broadcast across
// the whole vector in turn and merged into dst under a mask selecting the
// shuffle indices that fall into that lane's 16-byte index range.
void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
                                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
                                        int vlen_enc) {
  assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are inlane operations and indices are determined using
  // lower 4 bit of each shuffle lane, thus all shuffle indices are
  // normalized to index range 0-15. This makes sure that all the multiples
  // of an index value are placed at same relative position in 128 bit
  // lane i.e. elements corresponding to shuffle indices 16, 32 and 64
  // will be 16th element in their respective 128 bit lanes.
  movl(rtmp, 16);
  evpbroadcastb(xtmp1, rtmp, vlen_enc);

  // Compute a mask for shuffle vector by comparing indices with expression INDEX < 16,
  // Broadcast first 128 bit lane across entire vector, shuffle the vector lanes using
  // original shuffle indices and move the shuffled lanes corresponding to true
  // mask to destination vector.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
  evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);

  // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
  // and broadcasting second 128 bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
  // xtmp2 = 32 per byte (16 << 1).
  vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
  // and broadcasting third 128 bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp2, Assembler::nlt, true, vlen_enc);
  // xtmp1 = 48 per byte (16 + 32).
  vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
  // and broadcasting fourth 128 bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
  // xtmp2 = 64 per byte (32 << 1).
  vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
}
6622
6623 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
6624 XMMRegister shuffle, XMMRegister src, int vlen_enc) {
6625 if (vlen_enc == AVX_128bit) {
6626 vpermilps(dst, src, shuffle, vlen_enc);
6627 } else if (bt == T_INT) {
6628 vpermd(dst, shuffle, src, vlen_enc);
6629 } else {
6630 assert(bt == T_FLOAT, "");
6631 vpermps(dst, shuffle, src, vlen_enc);
6632 }
6633 }
6634
6635 void C2_MacroAssembler::efp16sh(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
6636 switch(opcode) {
6637 case Op_AddHF: vaddsh(dst, src1, src2); break;
6638 case Op_SubHF: vsubsh(dst, src1, src2); break;
6639 case Op_MulHF: vmulsh(dst, src1, src2); break;
6640 case Op_DivHF: vdivsh(dst, src1, src2); break;
6641 default: assert(false, "%s", NodeClassNames[opcode]); break;
6642 }
6643 }
6644
6645 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6646 switch(elem_bt) {
6647 case T_BYTE:
6648 if (ideal_opc == Op_SaturatingAddV) {
6649 vpaddsb(dst, src1, src2, vlen_enc);
6650 } else {
6651 assert(ideal_opc == Op_SaturatingSubV, "");
6652 vpsubsb(dst, src1, src2, vlen_enc);
6653 }
6654 break;
6655 case T_SHORT:
6656 if (ideal_opc == Op_SaturatingAddV) {
6657 vpaddsw(dst, src1, src2, vlen_enc);
6658 } else {
6659 assert(ideal_opc == Op_SaturatingSubV, "");
6660 vpsubsw(dst, src1, src2, vlen_enc);
6661 }
6662 break;
6663 default:
6664 fatal("Unsupported type %s", type2name(elem_bt));
6665 break;
6666 }
6667 }
6668
6669 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6670 switch(elem_bt) {
6671 case T_BYTE:
6672 if (ideal_opc == Op_SaturatingAddV) {
6673 vpaddusb(dst, src1, src2, vlen_enc);
6674 } else {
6675 assert(ideal_opc == Op_SaturatingSubV, "");
6676 vpsubusb(dst, src1, src2, vlen_enc);
6677 }
6678 break;
6679 case T_SHORT:
6680 if (ideal_opc == Op_SaturatingAddV) {
6681 vpaddusw(dst, src1, src2, vlen_enc);
6682 } else {
6683 assert(ideal_opc == Op_SaturatingSubV, "");
6684 vpsubusw(dst, src1, src2, vlen_enc);
6685 }
6686 break;
6687 default:
6688 fatal("Unsupported type %s", type2name(elem_bt));
6689 break;
6690 }
6691 }
6692
// Unsigned saturating subtraction (int/long lanes) for EVEX targets:
// lanes that would underflow (src1 <u src2) are clamped to zero via a
// zero-masked subtraction.
void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                               XMMRegister src2, KRegister ktmp, int vlen_enc) {
  // For unsigned subtraction, overflow happens when magnitude of second input is greater than first input.
  // overflow_mask = Inp1 <u Inp2
  evpcmpu(elem_bt, ktmp, src2, src1, Assembler::lt, vlen_enc);
  // Res = overflow_mask ? Zero : INP1 - INP2 (non-commutative and non-associative)
  evmasked_op(elem_bt == T_INT ? Op_SubVI : Op_SubVL, elem_bt, ktmp, dst, src1, src2, false, vlen_enc, false);
}
6701
// Unsigned saturating subtraction (int/long lanes) for AVX targets without
// opmask support: the unsigned compare is emulated by biasing both inputs
// with the signed minimum, and underflowing lanes are blended to zero.
void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                              XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  // Emulate unsigned comparison using signed comparison
  // Mask = Inp1 <u Inp2 => Inp1 + MIN_VALUE < Inp2 + MIN_VALUE
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc, true);
  vpadd(elem_bt, xtmp2, src1, xtmp1, vlen_enc);
  vpadd(elem_bt, xtmp1, src2, xtmp1, vlen_enc);

  // xtmp2 = per-lane mask of underflowing lanes (src2 >s src1 after biasing).
  vpcmpgt(elem_bt, xtmp2, xtmp1, xtmp2, vlen_enc);

  // Res = INP1 - INP2 (non-commutative and non-associative)
  vpsub(elem_bt, dst, src1, src2, vlen_enc);
  // Res = Mask ? Zero : Res
  vpxor(xtmp1, xtmp1, xtmp1, vlen_enc);
  vpblendvb(dst, dst, xtmp1, xtmp2, vlen_enc);
}
6718
// Unsigned saturating addition (int/long lanes) for EVEX targets: lanes that
// overflow are clamped to the unsigned maximum (all ones). Uses the identity
// that an unsigned add overflows iff (a + b) <u (a | b).
void C2_MacroAssembler::vector_add_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                               XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp, int vlen_enc) {
  // Unsigned values ranges comprise of only +ve numbers, thus there exist only an upper bound saturation.
  // overflow_mask = (SRC1 + SRC2) <u (SRC1 | SRC2)
  // Res = Signed Add INP1, INP2
  vpadd(elem_bt, dst, src1, src2, vlen_enc);
  // T1 = SRC1 | SRC2
  vpor(xtmp1, src1, src2, vlen_enc);
  // Max_Unsigned = -1
  vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
  // Unsigned compare: Mask = Res <u T1
  evpcmpu(elem_bt, ktmp, dst, xtmp1, Assembler::lt, vlen_enc);
  // res = Mask ? Max_Unsigned : Res
  evpblend(elem_bt, dst, ktmp, dst, xtmp2, true, vlen_enc);
}
6734
6735 //
6736 // Section 2-13 Hacker's Delight list following overflow detection check for saturating
6737 // unsigned addition operation.
6738 // overflow_mask = ((a & b) | ((a | b) & ~( a + b))) >>> 31 == 1
6739 //
6740 // We empirically determined its semantic equivalence to following reduced expression
6741 // overflow_mask = (a + b) <u (a | b)
6742 //
6743 // and also verified it though Alive2 solver.
6744 // (https://alive2.llvm.org/ce/z/XDQ7dY)
6745 //
6746
// Unsigned saturating addition (int/long lanes) for AVX targets without
// opmask support. Applies the (a + b) <u (a | b) overflow check documented
// above, emulating the unsigned compare by biasing both sides with the
// signed minimum. Overflowing lanes are blended with all-ones (unsigned max),
// which vpgenmin_value left in xtmp1.
void C2_MacroAssembler::vector_add_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                              XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, int vlen_enc) {
  // Res = Signed Add INP1, INP2
  vpadd(elem_bt, dst, src1, src2, vlen_enc);
  // Compute T1 = INP1 | INP2
  vpor(xtmp3, src1, src2, vlen_enc);
  // xtmp2 = minimum signed value; as a side effect xtmp1 = -1 (all ones).
  vpgenmin_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
  // Convert T1 to signed value, T1 = T1 + MIN_VALUE
  vpadd(elem_bt, xtmp3, xtmp3, xtmp2, vlen_enc);
  // Convert Res to signed value, Res<s> = Res + MIN_VALUE
  vpadd(elem_bt, xtmp2, xtmp2, dst, vlen_enc);
  // Compute overflow detection mask = Res<1> <s T1
  if (elem_bt == T_INT) {
    vpcmpgtd(xtmp3, xtmp3, xtmp2, vlen_enc);
  } else {
    assert(elem_bt == T_LONG, "");
    vpcmpgtq(xtmp3, xtmp3, xtmp2, vlen_enc);
  }
  // Overflowing lanes take the unsigned max (-1) from xtmp1.
  vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
}
6768
// Move the sign bit of each 64-bit lane of src into the opmask ktmp. Uses the
// native VPMOVQ2M when AVX512DQ is available; otherwise emulated via an
// arithmetic right shift by 63 and a compare against -1.
// xtmp2_hold_M1 indicates the caller already loaded -1 into xtmp2,
// letting the emulation skip regenerating it.
void C2_MacroAssembler::evpmovq2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                      int vlen_enc, bool xtmp2_hold_M1) {
  if (VM_Version::supports_avx512dq()) {
    evpmovq2m(ktmp, src, vlen_enc);
  } else {
    assert(VM_Version::supports_evex(), "");
    if (!xtmp2_hold_M1) {
      // xtmp2 = -1 (all ones) in every lane.
      vpternlogq(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
    }
    // xtmp1 = sign bit smeared across each lane (-1 for negative, 0 otherwise).
    evpsraq(xtmp1, src, 63, vlen_enc);
    evpcmpeqq(ktmp, k0, xtmp1, xtmp2, vlen_enc);
  }
}
6782
// Move the sign bit of each 32-bit lane of src into the opmask ktmp.
// 32-bit counterpart of evpmovq2m_emu (see above); emulated on targets
// without AVX512DQ via an arithmetic shift and compare against -1.
void C2_MacroAssembler::evpmovd2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                      int vlen_enc, bool xtmp2_hold_M1) {
  if (VM_Version::supports_avx512dq()) {
    evpmovd2m(ktmp, src, vlen_enc);
  } else {
    assert(VM_Version::supports_evex(), "");
    if (!xtmp2_hold_M1) {
      // xtmp2 = -1 (all ones) in every lane.
      vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
    }
    // xtmp1 = sign bit smeared across each lane (-1 for negative, 0 otherwise).
    vpsrad(xtmp1, src, 31, vlen_enc);
    Assembler::evpcmpeqd(ktmp, k0, xtmp1, xtmp2, vlen_enc);
  }
}
6796
6797
// Smear the sign bit across each int/long lane: dst lane = -1 if the src lane
// is negative, 0 otherwise. For longs without EVEX there is no 64-bit
// arithmetic shift, so the 32-bit shift result's odd dwords (holding the
// sign of each long's upper half) are replicated into both dword positions.
void C2_MacroAssembler::vpsign_extend_dq(BasicType elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
  if (elem_bt == T_LONG) {
    if (VM_Version::supports_evex()) {
      evpsraq(dst, src, 63, vlen_enc);
    } else {
      vpsrad(dst, src, 31, vlen_enc);
      // 0xF5 duplicates each odd dword (the sign of the long) into the pair.
      vpshufd(dst, dst, 0xF5, vlen_enc);
    }
  } else {
    assert(elem_bt == T_INT, "");
    vpsrad(dst, src, 31, vlen_enc);
  }
}
6811
// Materialize the maximum signed value (0x7FFF...) in every int/long lane of
// dst by logically shifting an all-ones vector right by one. When
// compute_allones is true, the all-ones vector is generated into `allones`
// first (and remains there for the caller to reuse).
void C2_MacroAssembler::vpgenmax_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
  if (compute_allones) {
    if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
      vpternlogd(allones, 0xff, allones, allones, vlen_enc);
    } else {
      vpcmpeqq(allones, allones, allones, vlen_enc);
    }
  }
  if (elem_bt == T_LONG) {
    // -1 >>> 1 == 0x7FFFFFFFFFFFFFFF
    vpsrlq(dst, allones, 1, vlen_enc);
  } else {
    assert(elem_bt == T_INT, "");
    // -1 >>> 1 == 0x7FFFFFFF
    vpsrld(dst, allones, 1, vlen_enc);
  }
}
6827
// Materialize the minimum signed value (0x8000...) in every int/long lane of
// dst by shifting an all-ones vector left so only the sign bit survives.
// When compute_allones is true, the all-ones vector is generated into
// `allones` first (and remains there for the caller to reuse).
void C2_MacroAssembler::vpgenmin_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
  if (compute_allones) {
    if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
      vpternlogd(allones, 0xff, allones, allones, vlen_enc);
    } else {
      vpcmpeqq(allones, allones, allones, vlen_enc);
    }
  }
  if (elem_bt == T_LONG) {
    // -1 << 63 == 0x8000000000000000
    vpsllq(dst, allones, 63, vlen_enc);
  } else {
    assert(elem_bt == T_INT, "");
    // -1 << 31 == 0x80000000
    vpslld(dst, allones, 31, vlen_enc);
  }
}
6843
6844 void C2_MacroAssembler::evpcmpu(BasicType elem_bt, KRegister kmask, XMMRegister src1, XMMRegister src2,
6845 Assembler::ComparisonPredicate cond, int vlen_enc) {
6846 switch(elem_bt) {
6847 case T_LONG: evpcmpuq(kmask, src1, src2, cond, vlen_enc); break;
6848 case T_INT: evpcmpud(kmask, src1, src2, cond, vlen_enc); break;
6849 case T_SHORT: evpcmpuw(kmask, src1, src2, cond, vlen_enc); break;
6850 case T_BYTE: evpcmpub(kmask, src1, src2, cond, vlen_enc); break;
6851 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6852 }
6853 }
6854
6855 void C2_MacroAssembler::vpcmpgt(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6856 switch(elem_bt) {
6857 case T_LONG: vpcmpgtq(dst, src1, src2, vlen_enc); break;
6858 case T_INT: vpcmpgtd(dst, src1, src2, vlen_enc); break;
6859 case T_SHORT: vpcmpgtw(dst, src1, src2, vlen_enc); break;
6860 case T_BYTE: vpcmpgtb(dst, src1, src2, vlen_enc); break;
6861 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6862 }
6863 }
6864
6865 void C2_MacroAssembler::evpmov_vec_to_mask(BasicType elem_bt, KRegister ktmp, XMMRegister src, XMMRegister xtmp1,
6866 XMMRegister xtmp2, int vlen_enc, bool xtmp2_hold_M1) {
6867 if (elem_bt == T_LONG) {
6868 evpmovq2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6869 } else {
6870 assert(elem_bt == T_INT, "");
6871 evpmovd2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6872 }
6873 }
6874
// Signed saturating add/subtract (int/long lanes) for EVEX targets: compute
// the wrapped result, detect per-lane signed overflow, and replace overflowed
// lanes with MAX_VALUE or MIN_VALUE depending on the first input's sign.
void C2_MacroAssembler::vector_addsub_dq_saturating_evex(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                         XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
                                                         KRegister ktmp1, KRegister ktmp2, int vlen_enc) {
  assert(elem_bt == T_INT || elem_bt == T_LONG, "");
  // Addition/Subtraction happens over two's compliment representation of numbers and is agnostic to signed'ness.
  // Overflow detection based on Hacker's delight section 2-13.
  if (ideal_opc == Op_SaturatingAddV) {
    // res = src1 + src2
    vpadd(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs if result polarity does not comply with equivalent polarity inputs.
    // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
    vpxor(xtmp1, dst, src1, vlen_enc);
    vpxor(xtmp2, dst, src2, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  } else {
    assert(ideal_opc == Op_SaturatingSubV, "");
    // res = src1 - src2
    vpsub(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs when both inputs have opposite polarity and
    // result polarity does not comply with first input polarity.
    // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
    vpxor(xtmp1, src1, src2, vlen_enc);
    vpxor(xtmp2, dst, src1, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  }

  // Compute overflow detection mask.
  evpmov_vec_to_mask(elem_bt, ktmp1, xtmp2, xtmp2, xtmp1, vlen_enc);
  // Note: xtmp1 hold -1 in all its lanes after above call.

  // Compute mask based on first input polarity.
  evpmov_vec_to_mask(elem_bt, ktmp2, src1, xtmp2, xtmp1, vlen_enc, true);

  // xtmp2 = MAX_VALUE per lane, xtmp1 = MIN_VALUE per lane.
  vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);

  // Compose a vector of saturating (MAX/MIN) values, where lanes corresponding to
  // set bits in first input polarity mask holds a min value.
  evpblend(elem_bt, xtmp2, ktmp2, xtmp2, xtmp1, true, vlen_enc);
  // Blend destination lanes with saturated values using overflow detection mask.
  evpblend(elem_bt, dst, ktmp1, dst, xtmp2, true, vlen_enc);
}
6917
6918
// AVX (non-EVEX) lowering of saturating signed add/sub for T_INT/T_LONG
// lanes. Same algorithm as the EVEX variant, but since no opmask registers
// are available the overflow and polarity masks are kept as sign-extended
// vectors and applied with VPBLENDVB.
//   dst              - receives the result (holds the wrapping value first)
//   src1, src2       - input vectors
//   xtmp1..xtmp4     - vector scratch registers (clobbered)
void C2_MacroAssembler::vector_addsub_dq_saturating_avx(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                        XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
                                                        XMMRegister xtmp3, XMMRegister xtmp4, int vlen_enc) {
  assert(elem_bt == T_INT || elem_bt == T_LONG, "");
  // Addition/Subtraction happens over two's complement representation of numbers and is agnostic to signedness.
  // Overflow detection based on Hacker's Delight, section 2-13.
  if (ideal_opc == Op_SaturatingAddV) {
    // res = src1 + src2
    vpadd(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs if result polarity does not comply with equivalent polarity inputs.
    // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
    vpxor(xtmp1, dst, src1, vlen_enc);
    vpxor(xtmp2, dst, src2, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  } else {
    assert(ideal_opc == Op_SaturatingSubV, "");
    // res = src1 - src2
    vpsub(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs when both inputs have opposite polarity and
    // result polarity does not comply with first input polarity.
    // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
    vpxor(xtmp1, src1, src2, vlen_enc);
    vpxor(xtmp2, dst, src1, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  }

  // Sign-extend to compute overflow detection mask.
  vpsign_extend_dq(elem_bt, xtmp3, xtmp2, vlen_enc);

  // xtmp1 = -1 in all lanes; used as the seed for the MIN/MAX constants.
  vpcmpeqd(xtmp1, xtmp1, xtmp1, vlen_enc);
  vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc);
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);

  // Compose saturating min/max vector using first input polarity mask.
  vpsign_extend_dq(elem_bt, xtmp4, src1, vlen_enc);
  vpblendvb(xtmp1, xtmp2, xtmp1, xtmp4, vlen_enc);

  // Blend result with saturating vector using overflow detection mask.
  vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
}
6959
6960 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
6961 switch(elem_bt) {
6962 case T_BYTE:
6963 if (ideal_opc == Op_SaturatingAddV) {
6964 vpaddsb(dst, src1, src2, vlen_enc);
6965 } else {
6966 assert(ideal_opc == Op_SaturatingSubV, "");
6967 vpsubsb(dst, src1, src2, vlen_enc);
6968 }
6969 break;
6970 case T_SHORT:
6971 if (ideal_opc == Op_SaturatingAddV) {
6972 vpaddsw(dst, src1, src2, vlen_enc);
6973 } else {
6974 assert(ideal_opc == Op_SaturatingSubV, "");
6975 vpsubsw(dst, src1, src2, vlen_enc);
6976 }
6977 break;
6978 default:
6979 fatal("Unsupported type %s", type2name(elem_bt));
6980 break;
6981 }
6982 }
6983
6984 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
6985 switch(elem_bt) {
6986 case T_BYTE:
6987 if (ideal_opc == Op_SaturatingAddV) {
6988 vpaddusb(dst, src1, src2, vlen_enc);
6989 } else {
6990 assert(ideal_opc == Op_SaturatingSubV, "");
6991 vpsubusb(dst, src1, src2, vlen_enc);
6992 }
6993 break;
6994 case T_SHORT:
6995 if (ideal_opc == Op_SaturatingAddV) {
6996 vpaddusw(dst, src1, src2, vlen_enc);
6997 } else {
6998 assert(ideal_opc == Op_SaturatingSubV, "");
6999 vpsubusw(dst, src1, src2, vlen_enc);
7000 }
7001 break;
7002 default:
7003 fatal("Unsupported type %s", type2name(elem_bt));
7004 break;
7005 }
7006 }
7007
7008 void C2_MacroAssembler::select_from_two_vectors_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
7009 XMMRegister src2, int vlen_enc) {
7010 switch(elem_bt) {
7011 case T_BYTE:
7012 evpermi2b(dst, src1, src2, vlen_enc);
7013 break;
7014 case T_SHORT:
7015 evpermi2w(dst, src1, src2, vlen_enc);
7016 break;
7017 case T_INT:
7018 evpermi2d(dst, src1, src2, vlen_enc);
7019 break;
7020 case T_LONG:
7021 evpermi2q(dst, src1, src2, vlen_enc);
7022 break;
7023 case T_FLOAT:
7024 evpermi2ps(dst, src1, src2, vlen_enc);
7025 break;
7026 case T_DOUBLE:
7027 evpermi2pd(dst, src1, src2, vlen_enc);
7028 break;
7029 default:
7030 fatal("Unsupported type %s", type2name(elem_bt));
7031 break;
7032 }
7033 }
7034
7035 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, bool is_unsigned, int vlen_enc) {
7036 if (is_unsigned) {
7037 vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7038 } else {
7039 vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7040 }
7041 }
7042
7043 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, bool is_unsigned, int vlen_enc) {
7044 if (is_unsigned) {
7045 vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7046 } else {
7047 vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7048 }
7049 }
7050
7051 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
7052 switch(opcode) {
7053 case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
7054 case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
7055 case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
7056 case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
7057 default: assert(false, "%s", NodeClassNames[opcode]); break;
7058 }
7059 }
7060
7061 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
7062 switch(opcode) {
7063 case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
7064 case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
7065 case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
7066 case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
7067 default: assert(false, "%s", NodeClassNames[opcode]); break;
7068 }
7069 }
7070
// Scalar FP16 max/min: delegates to the 128-bit vector lowering with the same
// operands (ktmp, xtmp1, xtmp2 are clobbered by the callee).
void C2_MacroAssembler::scalar_max_min_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                            KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2) {
  vector_max_min_fp16(opcode, dst, src1, src2, ktmp, xtmp1, xtmp2, Assembler::AVX_128bit);
}
7075
// FP16 vector max/min. The raw vmaxph/vminph instructions simply return the
// second source on a NaN or on equal-valued signed zeros, so the inputs are
// first swapped per-lane (driven by sign bits moved into ktmp) to steer the
// instruction's tie-breaking, and NaN lanes are fixed up afterwards with a
// masked move. ktmp, xtmp1 and xtmp2 are clobbered.
void C2_MacroAssembler::vector_max_min_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                            KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  if (opcode == Op_MaxVHF || opcode == Op_MaxHF) {
    // Move sign bits of src2 to mask register.
    evpmovw2m(ktmp, src2, vlen_enc);
    // xtmp1 = src2 < 0 ? src2 : src1
    evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
    // xtmp2 = src2 < 0 ? src1 : src2
    evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
    // Idea behind above swapping is to make second source operand a +ve value.
    // As per instruction semantic, if the values being compared are both 0.0s (of either sign), the value in
    // the second source operand is returned. If only one value is a NaN (SNaN or QNaN) for this instruction,
    // the second source operand, either a NaN or a valid floating-point value, is returned
    // dst = max(xtmp1, xtmp2)
    evmaxph(dst, xtmp1, xtmp2, vlen_enc);
    // isNaN = is_unordered_quiet(xtmp1)
    evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
    // Final result is same as first source if it's a NaN value,
    // in case second operand holds a NaN value then as per above semantics
    // result is same as second operand.
    Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
  } else {
    assert(opcode == Op_MinVHF || opcode == Op_MinHF, "");
    // Move sign bits of src1 to mask register.
    evpmovw2m(ktmp, src1, vlen_enc);
    // xtmp1 = src1 < 0 ? src2 : src1
    evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
    // xtmp2 = src1 < 0 ? src1 : src2
    evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
    // Idea behind above swapping is to make second source operand a -ve value.
    // As per instruction semantics, if the values being compared are both 0.0s (of either sign), the value in
    // the second source operand is returned.
    // If only one value is a NaN (SNaN or QNaN) for this instruction, the second source operand, either a NaN
    // or a valid floating-point value, is written to the result.
    // dst = min(xtmp1, xtmp2)
    evminph(dst, xtmp1, xtmp2, vlen_enc);
    // isNaN = is_unordered_quiet(xtmp1)
    evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
    // Final result is same as first source if it's a NaN value,
    // in case second operand holds a NaN value then as per above semantics
    // result is same as second operand.
    Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
  }
}