1 /*
2 * Copyright (c) 2020, 2026, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25 #include "asm/assembler.hpp"
26 #include "asm/assembler.inline.hpp"
27 #include "gc/shared/barrierSet.hpp"
28 #include "gc/shared/barrierSetAssembler.hpp"
29 #include "oops/methodData.hpp"
30 #include "opto/c2_MacroAssembler.hpp"
31 #include "opto/intrinsicnode.hpp"
32 #include "opto/output.hpp"
33 #include "opto/opcodes.hpp"
34 #include "opto/subnode.hpp"
35 #include "runtime/globals.hpp"
36 #include "runtime/objectMonitor.hpp"
37 #include "runtime/objectMonitorTable.hpp"
38 #include "runtime/stubRoutines.hpp"
39 #include "runtime/synchronizer.hpp"
40 #include "utilities/checkedCast.hpp"
41 #include "utilities/globalDefinitions.hpp"
42 #include "utilities/powerOfTwo.hpp"
43 #include "utilities/sizes.hpp"
44
45 #ifdef PRODUCT
46 #define BLOCK_COMMENT(str) /* nothing */
47 #define STOP(error) stop(error)
48 #else
49 #define BLOCK_COMMENT(str) block_comment(str)
50 #define STOP(error) block_comment(error); stop(error)
51 #endif
52
53 // C2 compiled method's prolog code.
54 void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {
55 assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
56
57 assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
58 // Remove word for return addr
59 framesize -= wordSize;
60 stack_bang_size -= wordSize;
61
62 // Calls to C2R adapters often do not accept exceptional returns.
63 // We require that their callers must bang for them. But be careful, because
64 // some VM calls (such as call site linkage) can use several kilobytes of
65 // stack. But the stack safety zone should account for that.
66 // See bugs 4446381, 4468289, 4497237.
67 if (stack_bang_size > 0) {
68 generate_stack_overflow_check(stack_bang_size);
69
70 // We always push rbp, so that on return to interpreter rbp, will be
71 // restored correctly and we can correct the stack.
72 push(rbp);
73 // Save caller's stack pointer into RBP if the frame pointer is preserved.
74 if (PreserveFramePointer) {
75 mov(rbp, rsp);
76 }
77 // Remove word for ebp
78 framesize -= wordSize;
79
80 // Create frame
81 if (framesize) {
82 subptr(rsp, framesize);
83 }
84 } else {
85 subptr(rsp, framesize);
86
87 // Save RBP register now.
88 framesize -= wordSize;
89 movptr(Address(rsp, framesize), rbp);
90 // Save caller's stack pointer into RBP if the frame pointer is preserved.
91 if (PreserveFramePointer) {
92 movptr(rbp, rsp);
93 if (framesize > 0) {
94 addptr(rbp, framesize);
95 }
96 }
97 }
98
99 if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
100 framesize -= wordSize;
101 movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
102 }
103
104 #ifdef ASSERT
105 if (VerifyStackAtCalls) {
106 Label L;
107 push(rax);
108 mov(rax, rsp);
109 andptr(rax, StackAlignmentInBytes-1);
110 cmpptr(rax, StackAlignmentInBytes-wordSize);
111 pop(rax);
112 jcc(Assembler::equal, L);
113 STOP("Stack is not properly aligned!");
114 bind(L);
115 }
116 #endif
117
118 if (!is_stub) {
119 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
120 // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
121 Label dummy_slow_path;
122 Label dummy_continuation;
123 Label* slow_path = &dummy_slow_path;
124 Label* continuation = &dummy_continuation;
125 if (!Compile::current()->output()->in_scratch_emit_size()) {
126 // Use real labels from actual stub when not emitting code for the purpose of measuring its size
127 C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
128 Compile::current()->output()->add_stub(stub);
129 slow_path = &stub->entry();
130 continuation = &stub->continuation();
131 }
132 bs->nmethod_entry_barrier(this, slow_path, continuation);
133 }
134 }
135
136 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
137 switch (vlen_in_bytes) {
138 case 4: // fall-through
139 case 8: // fall-through
140 case 16: return Assembler::AVX_128bit;
141 case 32: return Assembler::AVX_256bit;
142 case 64: return Assembler::AVX_512bit;
143
144 default: {
145 ShouldNotReachHere();
146 return Assembler::AVX_NoVec;
147 }
148 }
149 }
150
151 // fast_lock and fast_unlock used by C2
152
153 // Because the transitions from emitted code to the runtime
154 // monitorenter/exit helper stubs are so slow it's critical that
155 // we inline both the lock-stack fast path and the inflated fast path.
156 //
157 // See also: cmpFastLock and cmpFastUnlock.
158 //
159 // What follows is a specialized inline transliteration of the code
160 // in enter() and exit(). If we're concerned about I$ bloat another
161 // option would be to emit TrySlowEnter and TrySlowExit methods
162 // at startup-time. These methods would accept arguments as
163 // (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
164 // indications in the icc.ZFlag. fast_lock and fast_unlock would simply
165 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
166 // In practice, however, the # of lock sites is bounded and is usually small.
167 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
168 // if the processor uses simple bimodal branch predictors keyed by EIP
169 // Since the helper routines would be called from multiple synchronization
170 // sites.
171 //
172 // An even better approach would be write "MonitorEnter()" and "MonitorExit()"
173 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
174 // to those specialized methods. That'd give us a mostly platform-independent
175 // implementation that the JITs could optimize and inline at their pleasure.
176 // Done correctly, the only time we'd need to cross to native could would be
177 // to park() or unpark() threads. We'd also need a few more unsafe operators
178 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
179 // (b) explicit barriers or fence operations.
180 //
181 // TODO:
182 //
183 // * Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
184 // This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
185 // Given TLAB allocation, Self is usually manifested in a register, so passing it into
186 // the lock operators would typically be faster than reifying Self.
187 //
188 // * Ideally I'd define the primitives as:
189 // fast_lock (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
190 // fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
191 // Unfortunately ADLC bugs prevent us from expressing the ideal form.
192 // Instead, we're stuck with a rather awkward and brittle register assignments below.
193 // Furthermore the register assignments are overconstrained, possibly resulting in
194 // sub-optimal code near the synchronization site.
195 //
196 // * Eliminate the sp-proximity tests and just use "== Self" tests instead.
197 // Alternately, use a better sp-proximity test.
198 //
199 // * Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
200 // Either one is sufficient to uniquely identify a thread.
201 // TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
202 //
203 // * Intrinsify notify() and notifyAll() for the common cases where the
204 // object is locked by the calling thread but the waitlist is empty.
205 // avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
206 //
207 // * use jccb and jmpb instead of jcc and jmp to improve code density.
208 // But beware of excessive branch density on AMD Opterons.
209 //
210 // * Both fast_lock and fast_unlock set the ICC.ZF to indicate success
211 // or failure of the fast path. If the fast path fails then we pass
212 // control to the slow path, typically in C. In fast_lock and
213 // fast_unlock we often branch to DONE_LABEL, just to find that C2
214 // will emit a conditional branch immediately after the node.
215 // So we have branches to branches and lots of ICC.ZF games.
216 // Instead, it might be better to have C2 pass a "FailureLabel"
217 // into fast_lock and fast_unlock. In the case of success, control
218 // will drop through the node. ICC.ZF is undefined at exit.
219 // In the case of failure, the node will branch directly to the
220 // FailureLabel
221
222 // obj: object to lock
223 // box: on-stack box address -- KILLED
224 // rax: tmp -- KILLED
225 // t : tmp -- KILLED
226 void C2_MacroAssembler::fast_lock(Register obj, Register box, Register rax_reg,
227 Register t, Register thread) {
228 assert(rax_reg == rax, "Used for CAS");
229 assert_different_registers(obj, box, rax_reg, t, thread);
230
231 // Handle inflated monitor.
232 Label inflated;
233 // Finish fast lock successfully. ZF value is irrelevant.
234 Label locked;
235 // Finish fast lock unsuccessfully. MUST jump with ZF == 0
236 Label slow_path;
237
238 if (UseObjectMonitorTable) {
239 // Clear cache in case fast locking succeeds or we need to take the slow-path.
240 movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0);
241 }
242
243 if (DiagnoseSyncOnValueBasedClasses != 0) {
244 load_klass(rax_reg, obj, t);
245 testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
246 jcc(Assembler::notZero, slow_path);
247 }
248
249 const Register mark = t;
250
251 { // Fast Lock
252
253 Label push;
254
255 const Register top = UseObjectMonitorTable ? rax_reg : box;
256
257 // Load the mark.
258 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
259
260 // Prefetch top.
261 movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
262
263 // Check for monitor (0b10).
264 testptr(mark, markWord::monitor_value);
265 jcc(Assembler::notZero, inflated);
266
267 // Check if lock-stack is full.
268 cmpl(top, LockStack::end_offset() - 1);
269 jcc(Assembler::greater, slow_path);
270
271 // Check if recursive.
272 cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
273 jccb(Assembler::equal, push);
274
275 // Try to lock. Transition lock bits 0b01 => 0b00
276 movptr(rax_reg, mark);
277 orptr(rax_reg, markWord::unlocked_value);
278 andptr(mark, ~(int32_t)markWord::unlocked_value);
279 lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
280 jcc(Assembler::notEqual, slow_path);
281
282 if (UseObjectMonitorTable) {
283 // Need to reload top, clobbered by CAS.
284 movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
285 }
286 bind(push);
287 // After successful lock, push object on lock-stack.
288 movptr(Address(thread, top), obj);
289 addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
290 jmp(locked);
291 }
292
293 { // Handle inflated monitor.
294 bind(inflated);
295
296 const Register monitor = t;
297
298 if (!UseObjectMonitorTable) {
299 assert(mark == monitor, "should be the same here");
300 } else {
301 const Register hash = t;
302 Label monitor_found;
303
304 // Look for the monitor in the om_cache.
305
306 ByteSize cache_offset = JavaThread::om_cache_oops_offset();
307 ByteSize monitor_offset = OMCache::oop_to_monitor_difference();
308 const int num_unrolled = OMCache::CAPACITY;
309 for (int i = 0; i < num_unrolled; i++) {
310 movptr(monitor, Address(thread, cache_offset + monitor_offset));
311 cmpptr(obj, Address(thread, cache_offset));
312 jccb(Assembler::equal, monitor_found);
313 cache_offset = cache_offset + OMCache::oop_to_oop_difference();
314 }
315
316 // Look for the monitor in the table.
317
318 // Get the hash code.
319 movptr(hash, Address(obj, oopDesc::mark_offset_in_bytes()));
320 shrq(hash, markWord::hash_shift);
321 andq(hash, markWord::hash_mask);
322
323 // Get the table and calculate the bucket's address.
324 lea(rax_reg, ExternalAddress(ObjectMonitorTable::current_table_address()));
325 movptr(rax_reg, Address(rax_reg));
326 andq(hash, Address(rax_reg, ObjectMonitorTable::table_capacity_mask_offset()));
327 movptr(rax_reg, Address(rax_reg, ObjectMonitorTable::table_buckets_offset()));
328
329 // Read the monitor from the bucket.
330 movptr(monitor, Address(rax_reg, hash, Address::times_ptr));
331
332 // Check if the monitor in the bucket is special (empty, tombstone or removed)
333 cmpptr(monitor, ObjectMonitorTable::SpecialPointerValues::below_is_special);
334 jcc(Assembler::below, slow_path);
335
336 // Check if object matches.
337 movptr(rax_reg, Address(monitor, ObjectMonitor::object_offset()));
338 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
339 bs_asm->try_resolve_weak_handle_in_c2(this, rax_reg, slow_path);
340 cmpptr(rax_reg, obj);
341 jcc(Assembler::notEqual, slow_path);
342
343 bind(monitor_found);
344 }
345 const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
346 const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag);
347 const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag);
348
349 Label monitor_locked;
350 // Lock the monitor.
351
352 if (UseObjectMonitorTable) {
353 // Cache the monitor for unlock before trashing box. On failure to acquire
354 // the lock, the slow path will reset the entry accordingly (see CacheSetter).
355 movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor);
356 }
357
358 // Try to CAS owner (no owner => current thread's _monitor_owner_id).
359 xorptr(rax_reg, rax_reg);
360 movptr(box, Address(thread, JavaThread::monitor_owner_id_offset()));
361 lock(); cmpxchgptr(box, owner_address);
362 jccb(Assembler::equal, monitor_locked);
363
364 // Check if recursive.
365 cmpptr(box, rax_reg);
366 jccb(Assembler::notEqual, slow_path);
367
368 // Recursive.
369 increment(recursions_address);
370
371 bind(monitor_locked);
372 }
373
374 bind(locked);
375 // Set ZF = 1
376 xorl(rax_reg, rax_reg);
377
378 #ifdef ASSERT
379 // Check that locked label is reached with ZF set.
380 Label zf_correct;
381 Label zf_bad_zero;
382 jcc(Assembler::zero, zf_correct);
383 jmp(zf_bad_zero);
384 #endif
385
386 bind(slow_path);
387 #ifdef ASSERT
388 // Check that slow_path label is reached with ZF not set.
389 jcc(Assembler::notZero, zf_correct);
390 stop("Fast Lock ZF != 0");
391 bind(zf_bad_zero);
392 stop("Fast Lock ZF != 1");
393 bind(zf_correct);
394 #endif
395 // C2 uses the value of ZF to determine the continuation.
396 }
397
398 // obj: object to lock
399 // rax: tmp -- KILLED
400 // t : tmp - cannot be obj nor rax -- KILLED
401 //
402 // Some commentary on balanced locking:
403 //
404 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
405 // Methods that don't have provably balanced locking are forced to run in the
406 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
407 // The interpreter provides two properties:
408 // I1: At return-time the interpreter automatically and quietly unlocks any
409 // objects acquired in the current activation (frame). Recall that the
410 // interpreter maintains an on-stack list of locks currently held by
411 // a frame.
412 // I2: If a method attempts to unlock an object that is not held by the
413 // frame the interpreter throws IMSX.
414 //
415 // Lets say A(), which has provably balanced locking, acquires O and then calls B().
416 // B() doesn't have provably balanced locking so it runs in the interpreter.
417 // Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
418 // is still locked by A().
419 //
420 // The only other source of unbalanced locking would be JNI. The "Java Native Interface
421 // Specification" states that an object locked by JNI's MonitorEnter should not be
422 // unlocked by "normal" java-level locking and vice-versa. The specification doesn't
423 // specify what will occur if a program engages in such mixed-mode locking, however.
424 // Arguably given that the spec legislates the JNI case as undefined our implementation
425 // could reasonably *avoid* checking owner in fast_unlock().
426 // In the interest of performance we elide m->Owner==Self check in unlock.
427 // A perfectly viable alternative is to elide the owner check except when
428 // Xcheck:jni is enabled.
429
430 void C2_MacroAssembler::fast_unlock(Register obj, Register reg_rax, Register t, Register thread) {
431 assert(reg_rax == rax, "Used for CAS");
432 assert_different_registers(obj, reg_rax, t);
433
434 // Handle inflated monitor.
435 Label inflated, inflated_check_lock_stack;
436 // Finish fast unlock successfully. MUST jump with ZF == 1
437 Label unlocked, slow_path;
438
439 const Register mark = t;
440 const Register monitor = t;
441 const Register top = UseObjectMonitorTable ? t : reg_rax;
442 const Register box = reg_rax;
443
444 Label dummy;
445 C2FastUnlockStub* stub = nullptr;
446
447 if (!Compile::current()->output()->in_scratch_emit_size()) {
448 stub = new (Compile::current()->comp_arena()) C2FastUnlockStub(obj, mark, reg_rax, thread);
449 Compile::current()->output()->add_stub(stub);
450 }
451
452 Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();
453
454 { // Fast Unlock
455
456 // Load top.
457 movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
458
459 if (!UseObjectMonitorTable) {
460 // Prefetch mark.
461 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
462 }
463
464 // Check if obj is top of lock-stack.
465 cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
466 // Top of lock stack was not obj. Must be monitor.
467 jcc(Assembler::notEqual, inflated_check_lock_stack);
468
469 // Pop lock-stack.
470 DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
471 subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
472
473 // Check if recursive.
474 cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
475 jcc(Assembler::equal, unlocked);
476
477 // We elide the monitor check, let the CAS fail instead.
478
479 if (UseObjectMonitorTable) {
480 // Load mark.
481 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
482 }
483
484 // Try to unlock. Transition lock bits 0b00 => 0b01
485 movptr(reg_rax, mark);
486 andptr(reg_rax, ~(int32_t)markWord::lock_mask);
487 orptr(mark, markWord::unlocked_value);
488 lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
489 jcc(Assembler::notEqual, push_and_slow_path);
490 jmp(unlocked);
491 }
492
493
494 { // Handle inflated monitor.
495 bind(inflated_check_lock_stack);
496 #ifdef ASSERT
497 Label check_done;
498 subl(top, oopSize);
499 cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
500 jcc(Assembler::below, check_done);
501 cmpptr(obj, Address(thread, top));
502 jcc(Assembler::notEqual, inflated_check_lock_stack);
503 stop("Fast Unlock lock on stack");
504 bind(check_done);
505 if (UseObjectMonitorTable) {
506 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
507 }
508 testptr(mark, markWord::monitor_value);
509 jcc(Assembler::notZero, inflated);
510 stop("Fast Unlock not monitor");
511 #endif
512
513 bind(inflated);
514
515 if (!UseObjectMonitorTable) {
516 assert(mark == monitor, "should be the same here");
517 } else {
518 // Uses ObjectMonitorTable. Look for the monitor in our BasicLock on the stack.
519 movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
520 // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*)
521 cmpptr(monitor, alignof(ObjectMonitor*));
522 jcc(Assembler::below, slow_path);
523 }
524 const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
525 const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag};
526 const Address succ_address{monitor, ObjectMonitor::succ_offset() - monitor_tag};
527 const Address entry_list_address{monitor, ObjectMonitor::entry_list_offset() - monitor_tag};
528 const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag};
529
530 Label recursive;
531
532 // Check if recursive.
533 cmpptr(recursions_address, 0);
534 jcc(Assembler::notZero, recursive);
535
536 // Set owner to null.
537 // Release to satisfy the JMM
538 movptr(owner_address, NULL_WORD);
539 // We need a full fence after clearing owner to avoid stranding.
540 // StoreLoad achieves this.
541 membar(StoreLoad);
542
543 // Check if the entry_list is empty.
544 cmpptr(entry_list_address, NULL_WORD);
545 jcc(Assembler::zero, unlocked); // If so we are done.
546
547 // Check if there is a successor.
548 cmpptr(succ_address, NULL_WORD);
549 jcc(Assembler::notZero, unlocked); // If so we are done.
550
551 // Save the monitor pointer in the current thread, so we can try to
552 // reacquire the lock in SharedRuntime::monitor_exit_helper().
553 if (!UseObjectMonitorTable) {
554 andptr(monitor, ~(int32_t)markWord::monitor_value);
555 }
556 movptr(Address(thread, JavaThread::unlocked_inflated_monitor_offset()), monitor);
557
558 orl(t, 1); // Fast Unlock ZF = 0
559 jmpb(slow_path);
560
561 // Recursive unlock.
562 bind(recursive);
563 decrement(recursions_address);
564 }
565
566 bind(unlocked);
567 xorl(t, t); // Fast Unlock ZF = 1
568
569 #ifdef ASSERT
570 // Check that unlocked label is reached with ZF set.
571 Label zf_correct;
572 Label zf_bad_zero;
573 jcc(Assembler::zero, zf_correct);
574 jmp(zf_bad_zero);
575 #endif
576
577 bind(slow_path);
578 if (stub != nullptr) {
579 bind(stub->slow_path_continuation());
580 }
581 #ifdef ASSERT
582 // Check that stub->continuation() label is reached with ZF not set.
583 jcc(Assembler::notZero, zf_correct);
584 stop("Fast Unlock ZF != 0");
585 bind(zf_bad_zero);
586 stop("Fast Unlock ZF != 1");
587 bind(zf_correct);
588 #endif
589 // C2 uses the value of ZF to determine the continuation.
590 }
591
592 static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) {
593 fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi);
594 }
595
596 static void reconstruct_frame_pointer_helper(MacroAssembler* masm, Register dst) {
597 const int framesize = Compile::current()->output()->frame_size_in_bytes();
598 masm->movptr(dst, rsp);
599 if (framesize > 2 * wordSize) {
600 masm->addptr(dst, framesize - 2 * wordSize);
601 }
602 }
603
604 void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) {
605 if (PreserveFramePointer) {
606 // frame pointer is valid
607 #ifdef ASSERT
608 // Verify frame pointer value in rbp.
609 reconstruct_frame_pointer_helper(this, rtmp);
610 Label L_success;
611 cmpq(rbp, rtmp);
612 jccb(Assembler::equal, L_success);
613 STOP("frame pointer mismatch");
614 bind(L_success);
615 #endif // ASSERT
616 } else {
617 reconstruct_frame_pointer_helper(this, rbp);
618 }
619 }
620
621 void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register val) {
622 jint lo = t->_lo;
623 jint hi = t->_hi;
624 assert(lo < hi, "type should not be empty or constant, idx: %u, lo: %d, hi: %d", idx, lo, hi);
625 if (t == TypeInt::INT) {
626 return;
627 }
628
629 BLOCK_COMMENT("CastII {");
630 Label fail;
631 Label succeed;
632
633 if (lo != min_jint) {
634 cmpl(val, lo);
635 jccb(Assembler::less, fail);
636 }
637 if (hi != max_jint) {
638 cmpl(val, hi);
639 jccb(Assembler::greater, fail);
640 }
641 jmpb(succeed);
642
643 bind(fail);
644 movl(c_rarg0, idx);
645 movl(c_rarg1, val);
646 movl(c_rarg2, lo);
647 movl(c_rarg3, hi);
648 reconstruct_frame_pointer(rscratch1);
649 call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_int_in_range)));
650 hlt();
651 bind(succeed);
652 BLOCK_COMMENT("} // CastII");
653 }
654
655 static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) {
656 fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi);
657 }
658
659 void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register val, Register tmp) {
660 jlong lo = t->_lo;
661 jlong hi = t->_hi;
662 assert(lo < hi, "type should not be empty or constant, idx: %u, lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, lo, hi);
663 if (t == TypeLong::LONG) {
664 return;
665 }
666
667 BLOCK_COMMENT("CastLL {");
668 Label fail;
669 Label succeed;
670
671 auto cmp_val = [&](jlong bound) {
672 if (is_simm32(bound)) {
673 cmpq(val, checked_cast<int>(bound));
674 } else {
675 mov64(tmp, bound);
676 cmpq(val, tmp);
677 }
678 };
679
680 if (lo != min_jlong) {
681 cmp_val(lo);
682 jccb(Assembler::less, fail);
683 }
684 if (hi != max_jlong) {
685 cmp_val(hi);
686 jccb(Assembler::greater, fail);
687 }
688 jmpb(succeed);
689
690 bind(fail);
691 movl(c_rarg0, idx);
692 movq(c_rarg1, val);
693 mov64(c_rarg2, lo);
694 mov64(c_rarg3, hi);
695 reconstruct_frame_pointer(rscratch1);
696 call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_long_in_range)));
697 hlt();
698 bind(succeed);
699 BLOCK_COMMENT("} // CastLL");
700 }
701
702 //-------------------------------------------------------------------------------------------
703 // Generic instructions support for use in .ad files C2 code generation
704
705 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
706 if (dst != src) {
707 movdqu(dst, src);
708 }
709 if (opcode == Op_AbsVD) {
710 andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
711 } else {
712 assert((opcode == Op_NegVD),"opcode should be Op_NegD");
713 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
714 }
715 }
716
717 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
718 if (opcode == Op_AbsVD) {
719 vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
720 } else {
721 assert((opcode == Op_NegVD),"opcode should be Op_NegD");
722 vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
723 }
724 }
725
726 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
727 if (dst != src) {
728 movdqu(dst, src);
729 }
730 if (opcode == Op_AbsVF) {
731 andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
732 } else {
733 assert((opcode == Op_NegVF),"opcode should be Op_NegF");
734 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
735 }
736 }
737
738 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
739 if (opcode == Op_AbsVF) {
740 vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
741 } else {
742 assert((opcode == Op_NegVF),"opcode should be Op_NegF");
743 vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
744 }
745 }
746
747 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
748 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
749 assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
750
751 if (opcode == Op_MinV) {
752 if (elem_bt == T_BYTE) {
753 pminsb(dst, src);
754 } else if (elem_bt == T_SHORT) {
755 pminsw(dst, src);
756 } else if (elem_bt == T_INT) {
757 pminsd(dst, src);
758 } else {
759 assert(elem_bt == T_LONG, "required");
760 assert(tmp == xmm0, "required");
761 assert_different_registers(dst, src, tmp);
762 movdqu(xmm0, dst);
763 pcmpgtq(xmm0, src);
764 blendvpd(dst, src); // xmm0 as mask
765 }
766 } else { // opcode == Op_MaxV
767 if (elem_bt == T_BYTE) {
768 pmaxsb(dst, src);
769 } else if (elem_bt == T_SHORT) {
770 pmaxsw(dst, src);
771 } else if (elem_bt == T_INT) {
772 pmaxsd(dst, src);
773 } else {
774 assert(elem_bt == T_LONG, "required");
775 assert(tmp == xmm0, "required");
776 assert_different_registers(dst, src, tmp);
777 movdqu(xmm0, src);
778 pcmpgtq(xmm0, dst);
779 blendvpd(dst, src); // xmm0 as mask
780 }
781 }
782 }
783
784 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
785 XMMRegister src1, Address src2, int vlen_enc) {
786 assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
787 if (opcode == Op_UMinV) {
788 switch(elem_bt) {
789 case T_BYTE: vpminub(dst, src1, src2, vlen_enc); break;
790 case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
791 case T_INT: vpminud(dst, src1, src2, vlen_enc); break;
792 case T_LONG: evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
793 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
794 }
795 } else {
796 assert(opcode == Op_UMaxV, "required");
797 switch(elem_bt) {
798 case T_BYTE: vpmaxub(dst, src1, src2, vlen_enc); break;
799 case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
800 case T_INT: vpmaxud(dst, src1, src2, vlen_enc); break;
801 case T_LONG: evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
802 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
803 }
804 }
805 }
806
807 void C2_MacroAssembler::vpuminmaxq(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
808 // For optimality, leverage a full vector width of 512 bits
809 // for operations over smaller vector sizes on AVX512 targets.
810 if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
811 if (opcode == Op_UMaxV) {
812 evpmaxuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
813 } else {
814 assert(opcode == Op_UMinV, "required");
815 evpminuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
816 }
817 } else {
818 // T1 = -1
819 vpcmpeqq(xtmp1, xtmp1, xtmp1, vlen_enc);
820 // T1 = -1 << 63
821 vpsllq(xtmp1, xtmp1, 63, vlen_enc);
822 // Convert SRC2 to signed value i.e. T2 = T1 + SRC2
823 vpaddq(xtmp2, xtmp1, src2, vlen_enc);
824 // Convert SRC1 to signed value i.e. T1 = T1 + SRC1
825 vpaddq(xtmp1, xtmp1, src1, vlen_enc);
826 // Mask = T2 > T1
827 vpcmpgtq(xtmp1, xtmp2, xtmp1, vlen_enc);
828 if (opcode == Op_UMaxV) {
829 // Res = Mask ? Src2 : Src1
830 vpblendvb(dst, src1, src2, xtmp1, vlen_enc);
831 } else {
832 // Res = Mask ? Src1 : Src2
833 vpblendvb(dst, src2, src1, xtmp1, vlen_enc);
834 }
835 }
836 }
837
838 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
839 XMMRegister src1, XMMRegister src2, int vlen_enc) {
840 assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
841 if (opcode == Op_UMinV) {
842 switch(elem_bt) {
843 case T_BYTE: vpminub(dst, src1, src2, vlen_enc); break;
844 case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
845 case T_INT: vpminud(dst, src1, src2, vlen_enc); break;
846 case T_LONG: evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
847 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
848 }
849 } else {
850 assert(opcode == Op_UMaxV, "required");
851 switch(elem_bt) {
852 case T_BYTE: vpmaxub(dst, src1, src2, vlen_enc); break;
853 case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
854 case T_INT: vpmaxud(dst, src1, src2, vlen_enc); break;
855 case T_LONG: evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
856 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
857 }
858 }
859 }
860
861 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
862 XMMRegister dst, XMMRegister src1, XMMRegister src2,
863 int vlen_enc) {
864 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
865
866 if (opcode == Op_MinV) {
867 if (elem_bt == T_BYTE) {
868 vpminsb(dst, src1, src2, vlen_enc);
869 } else if (elem_bt == T_SHORT) {
870 vpminsw(dst, src1, src2, vlen_enc);
871 } else if (elem_bt == T_INT) {
872 vpminsd(dst, src1, src2, vlen_enc);
873 } else {
874 assert(elem_bt == T_LONG, "required");
875 if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
876 vpminsq(dst, src1, src2, vlen_enc);
877 } else {
878 assert_different_registers(dst, src1, src2);
879 vpcmpgtq(dst, src1, src2, vlen_enc);
880 vblendvpd(dst, src1, src2, dst, vlen_enc);
881 }
882 }
883 } else { // opcode == Op_MaxV
884 if (elem_bt == T_BYTE) {
885 vpmaxsb(dst, src1, src2, vlen_enc);
886 } else if (elem_bt == T_SHORT) {
887 vpmaxsw(dst, src1, src2, vlen_enc);
888 } else if (elem_bt == T_INT) {
889 vpmaxsd(dst, src1, src2, vlen_enc);
890 } else {
891 assert(elem_bt == T_LONG, "required");
892 if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
893 vpmaxsq(dst, src1, src2, vlen_enc);
894 } else {
895 assert_different_registers(dst, src1, src2);
896 vpcmpgtq(dst, src1, src2, vlen_enc);
897 vblendvpd(dst, src2, src1, dst, vlen_enc);
898 }
899 }
900 }
901 }
902
903 // Float/Double min max
904
905 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
906 XMMRegister dst, XMMRegister a, XMMRegister b,
907 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
908 int vlen_enc) {
909 assert(UseAVX > 0, "required");
910 assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
911 opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
912 assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
913 assert_different_registers(a, tmp, atmp, btmp);
914 assert_different_registers(b, tmp, atmp, btmp);
915
916 bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
917 bool is_double_word = is_double_word_type(elem_bt);
918
919 /* Note on 'non-obvious' assembly sequence:
920 *
921 * While there are vminps/vmaxps instructions, there are two important differences between hardware
922 * and Java on how they handle floats:
923 * a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
924 * b. NaN is not necesarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
925 *
926 * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
927 * a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
928 * (only useful when signs differ, noop otherwise)
929 * b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
930
931 * Following pseudo code describes the algorithm for max[FD] (Min algorithm is on similar lines):
932 * btmp = (b < +0.0) ? a : b
933 * atmp = (b < +0.0) ? b : a
934 * Tmp = Max_Float(atmp , btmp)
935 * Res = (atmp == NaN) ? atmp : Tmp
936 */
937
938 void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
939 void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
940 void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
941 XMMRegister mask;
942
943 if (!is_double_word && is_min) {
944 mask = a;
945 vblend = &MacroAssembler::vblendvps;
946 vmaxmin = &MacroAssembler::vminps;
947 vcmp = &MacroAssembler::vcmpps;
948 } else if (!is_double_word && !is_min) {
949 mask = b;
950 vblend = &MacroAssembler::vblendvps;
951 vmaxmin = &MacroAssembler::vmaxps;
952 vcmp = &MacroAssembler::vcmpps;
953 } else if (is_double_word && is_min) {
954 mask = a;
955 vblend = &MacroAssembler::vblendvpd;
956 vmaxmin = &MacroAssembler::vminpd;
957 vcmp = &MacroAssembler::vcmppd;
958 } else {
959 assert(is_double_word && !is_min, "sanity");
960 mask = b;
961 vblend = &MacroAssembler::vblendvpd;
962 vmaxmin = &MacroAssembler::vmaxpd;
963 vcmp = &MacroAssembler::vcmppd;
964 }
965
966 // Make sure EnableX86ECoreOpts isn't disabled on register overlaps
967 XMMRegister maxmin, scratch;
968 if (dst == btmp) {
969 maxmin = btmp;
970 scratch = tmp;
971 } else {
972 maxmin = tmp;
973 scratch = btmp;
974 }
975
976 bool precompute_mask = EnableX86ECoreOpts && UseAVX>1;
977 if (precompute_mask && !is_double_word) {
978 vpsrad(tmp, mask, 32, vlen_enc);
979 mask = tmp;
980 } else if (precompute_mask && is_double_word) {
981 vpxor(tmp, tmp, tmp, vlen_enc);
982 vpcmpgtq(tmp, tmp, mask, vlen_enc);
983 mask = tmp;
984 }
985
986 (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
987 (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
988 (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
989 (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
990 (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
991 }
992
993 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
994 XMMRegister dst, XMMRegister a, XMMRegister b,
995 KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
996 int vlen_enc) {
997 assert(UseAVX > 2, "required");
998 assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
999 opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1000 assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1001 assert_different_registers(dst, a, atmp, btmp);
1002 assert_different_registers(dst, b, atmp, btmp);
1003
1004 bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1005 bool is_double_word = is_double_word_type(elem_bt);
1006 bool merge = true;
1007
1008 if (!is_double_word && is_min) {
1009 evpmovd2m(ktmp, a, vlen_enc);
1010 evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1011 evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1012 vminps(dst, atmp, btmp, vlen_enc);
1013 evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1014 evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1015 } else if (!is_double_word && !is_min) {
1016 evpmovd2m(ktmp, b, vlen_enc);
1017 evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1018 evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1019 vmaxps(dst, atmp, btmp, vlen_enc);
1020 evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1021 evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1022 } else if (is_double_word && is_min) {
1023 evpmovq2m(ktmp, a, vlen_enc);
1024 evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1025 evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1026 vminpd(dst, atmp, btmp, vlen_enc);
1027 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1028 evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1029 } else {
1030 assert(is_double_word && !is_min, "sanity");
1031 evpmovq2m(ktmp, b, vlen_enc);
1032 evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1033 evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1034 vmaxpd(dst, atmp, btmp, vlen_enc);
1035 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1036 evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1037 }
1038 }
1039
1040 void C2_MacroAssembler::vminmax_fp_avx10_2(int opc, BasicType elem_bt, XMMRegister dst, KRegister mask,
1041 XMMRegister src1, XMMRegister src2, int vlen_enc) {
1042 assert(opc == Op_MinV || opc == Op_MinReductionV ||
1043 opc == Op_MaxV || opc == Op_MaxReductionV, "sanity");
1044
1045 int imm8 = (opc == Op_MinV || opc == Op_MinReductionV) ? AVX10_2_MINMAX_MIN_COMPARE_SIGN
1046 : AVX10_2_MINMAX_MAX_COMPARE_SIGN;
1047 if (elem_bt == T_FLOAT) {
1048 evminmaxps(dst, mask, src1, src2, true, imm8, vlen_enc);
1049 } else {
1050 assert(elem_bt == T_DOUBLE, "");
1051 evminmaxpd(dst, mask, src1, src2, true, imm8, vlen_enc);
1052 }
1053 }
1054
1055 void C2_MacroAssembler::sminmax_fp_avx10_2(int opc, BasicType elem_bt, XMMRegister dst, KRegister mask,
1056 XMMRegister src1, XMMRegister src2) {
1057 assert(opc == Op_MinF || opc == Op_MaxF ||
1058 opc == Op_MinD || opc == Op_MaxD, "sanity");
1059
1060 int imm8 = (opc == Op_MinF || opc == Op_MinD) ? AVX10_2_MINMAX_MIN_COMPARE_SIGN
1061 : AVX10_2_MINMAX_MAX_COMPARE_SIGN;
1062 if (elem_bt == T_FLOAT) {
1063 evminmaxss(dst, mask, src1, src2, true, imm8);
1064 } else {
1065 assert(elem_bt == T_DOUBLE, "");
1066 evminmaxsd(dst, mask, src1, src2, true, imm8);
1067 }
1068 }
1069
1070 // Float/Double signum
1071 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
1072 assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");
1073
1074 Label DONE_LABEL;
1075
1076 // Handle special cases +0.0/-0.0 and NaN, if argument is +0.0/-0.0 or NaN, return argument
1077 // If AVX10.2 (or newer) floating point comparison instructions used, SF=1 for equal and unordered cases
1078 // If other floating point comparison instructions used, ZF=1 for equal and unordered cases
1079 if (opcode == Op_SignumF) {
1080 if (VM_Version::supports_avx10_2()) {
1081 evucomxss(dst, zero);
1082 jcc(Assembler::negative, DONE_LABEL);
1083 } else {
1084 ucomiss(dst, zero);
1085 jcc(Assembler::equal, DONE_LABEL);
1086 }
1087 movflt(dst, one);
1088 jcc(Assembler::above, DONE_LABEL);
1089 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
1090 } else if (opcode == Op_SignumD) {
1091 if (VM_Version::supports_avx10_2()) {
1092 evucomxsd(dst, zero);
1093 jcc(Assembler::negative, DONE_LABEL);
1094 } else {
1095 ucomisd(dst, zero);
1096 jcc(Assembler::equal, DONE_LABEL);
1097 }
1098 movdbl(dst, one);
1099 jcc(Assembler::above, DONE_LABEL);
1100 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
1101 }
1102
1103 bind(DONE_LABEL);
1104 }
1105
1106 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1107 if (sign) {
1108 pmovsxbw(dst, src);
1109 } else {
1110 pmovzxbw(dst, src);
1111 }
1112 }
1113
1114 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1115 if (sign) {
1116 vpmovsxbw(dst, src, vector_len);
1117 } else {
1118 vpmovzxbw(dst, src, vector_len);
1119 }
1120 }
1121
1122 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1123 if (sign) {
1124 vpmovsxbd(dst, src, vector_len);
1125 } else {
1126 vpmovzxbd(dst, src, vector_len);
1127 }
1128 }
1129
1130 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1131 if (sign) {
1132 vpmovsxwd(dst, src, vector_len);
1133 } else {
1134 vpmovzxwd(dst, src, vector_len);
1135 }
1136 }
1137
1138 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1139 int shift, int vector_len) {
1140 if (opcode == Op_RotateLeftV) {
1141 if (etype == T_INT) {
1142 evprold(dst, src, shift, vector_len);
1143 } else {
1144 assert(etype == T_LONG, "expected type T_LONG");
1145 evprolq(dst, src, shift, vector_len);
1146 }
1147 } else {
1148 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1149 if (etype == T_INT) {
1150 evprord(dst, src, shift, vector_len);
1151 } else {
1152 assert(etype == T_LONG, "expected type T_LONG");
1153 evprorq(dst, src, shift, vector_len);
1154 }
1155 }
1156 }
1157
1158 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1159 XMMRegister shift, int vector_len) {
1160 if (opcode == Op_RotateLeftV) {
1161 if (etype == T_INT) {
1162 evprolvd(dst, src, shift, vector_len);
1163 } else {
1164 assert(etype == T_LONG, "expected type T_LONG");
1165 evprolvq(dst, src, shift, vector_len);
1166 }
1167 } else {
1168 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1169 if (etype == T_INT) {
1170 evprorvd(dst, src, shift, vector_len);
1171 } else {
1172 assert(etype == T_LONG, "expected type T_LONG");
1173 evprorvq(dst, src, shift, vector_len);
1174 }
1175 }
1176 }
1177
1178 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1179 if (opcode == Op_RShiftVI) {
1180 psrad(dst, shift);
1181 } else if (opcode == Op_LShiftVI) {
1182 pslld(dst, shift);
1183 } else {
1184 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1185 psrld(dst, shift);
1186 }
1187 }
1188
1189 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1190 switch (opcode) {
1191 case Op_RShiftVI: psrad(dst, shift); break;
1192 case Op_LShiftVI: pslld(dst, shift); break;
1193 case Op_URShiftVI: psrld(dst, shift); break;
1194
1195 default: assert(false, "%s", NodeClassNames[opcode]);
1196 }
1197 }
1198
1199 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1200 if (opcode == Op_RShiftVI) {
1201 vpsrad(dst, nds, shift, vector_len);
1202 } else if (opcode == Op_LShiftVI) {
1203 vpslld(dst, nds, shift, vector_len);
1204 } else {
1205 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1206 vpsrld(dst, nds, shift, vector_len);
1207 }
1208 }
1209
1210 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1211 switch (opcode) {
1212 case Op_RShiftVI: vpsrad(dst, src, shift, vlen_enc); break;
1213 case Op_LShiftVI: vpslld(dst, src, shift, vlen_enc); break;
1214 case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1215
1216 default: assert(false, "%s", NodeClassNames[opcode]);
1217 }
1218 }
1219
1220 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1221 switch (opcode) {
1222 case Op_RShiftVB: // fall-through
1223 case Op_RShiftVS: psraw(dst, shift); break;
1224
1225 case Op_LShiftVB: // fall-through
1226 case Op_LShiftVS: psllw(dst, shift); break;
1227
1228 case Op_URShiftVS: // fall-through
1229 case Op_URShiftVB: psrlw(dst, shift); break;
1230
1231 default: assert(false, "%s", NodeClassNames[opcode]);
1232 }
1233 }
1234
1235 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1236 switch (opcode) {
1237 case Op_RShiftVB: // fall-through
1238 case Op_RShiftVS: vpsraw(dst, src, shift, vlen_enc); break;
1239
1240 case Op_LShiftVB: // fall-through
1241 case Op_LShiftVS: vpsllw(dst, src, shift, vlen_enc); break;
1242
1243 case Op_URShiftVS: // fall-through
1244 case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1245
1246 default: assert(false, "%s", NodeClassNames[opcode]);
1247 }
1248 }
1249
1250 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1251 switch (opcode) {
1252 case Op_RShiftVL: psrlq(dst, shift); break; // using srl to implement sra on pre-avs512 systems
1253 case Op_LShiftVL: psllq(dst, shift); break;
1254 case Op_URShiftVL: psrlq(dst, shift); break;
1255
1256 default: assert(false, "%s", NodeClassNames[opcode]);
1257 }
1258 }
1259
1260 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1261 if (opcode == Op_RShiftVL) {
1262 psrlq(dst, shift); // using srl to implement sra on pre-avs512 systems
1263 } else if (opcode == Op_LShiftVL) {
1264 psllq(dst, shift);
1265 } else {
1266 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1267 psrlq(dst, shift);
1268 }
1269 }
1270
1271 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1272 switch (opcode) {
1273 case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1274 case Op_LShiftVL: vpsllq(dst, src, shift, vlen_enc); break;
1275 case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1276
1277 default: assert(false, "%s", NodeClassNames[opcode]);
1278 }
1279 }
1280
1281 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1282 if (opcode == Op_RShiftVL) {
1283 evpsraq(dst, nds, shift, vector_len);
1284 } else if (opcode == Op_LShiftVL) {
1285 vpsllq(dst, nds, shift, vector_len);
1286 } else {
1287 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1288 vpsrlq(dst, nds, shift, vector_len);
1289 }
1290 }
1291
1292 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1293 switch (opcode) {
1294 case Op_RShiftVB: // fall-through
1295 case Op_RShiftVS: // fall-through
1296 case Op_RShiftVI: vpsravd(dst, src, shift, vlen_enc); break;
1297
1298 case Op_LShiftVB: // fall-through
1299 case Op_LShiftVS: // fall-through
1300 case Op_LShiftVI: vpsllvd(dst, src, shift, vlen_enc); break;
1301
1302 case Op_URShiftVB: // fall-through
1303 case Op_URShiftVS: // fall-through
1304 case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1305
1306 default: assert(false, "%s", NodeClassNames[opcode]);
1307 }
1308 }
1309
1310 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1311 switch (opcode) {
1312 case Op_RShiftVB: // fall-through
1313 case Op_RShiftVS: evpsravw(dst, src, shift, vlen_enc); break;
1314
1315 case Op_LShiftVB: // fall-through
1316 case Op_LShiftVS: evpsllvw(dst, src, shift, vlen_enc); break;
1317
1318 case Op_URShiftVB: // fall-through
1319 case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1320
1321 default: assert(false, "%s", NodeClassNames[opcode]);
1322 }
1323 }
1324
1325 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1326 assert(UseAVX >= 2, "required");
1327 switch (opcode) {
1328 case Op_RShiftVL: {
1329 if (UseAVX > 2) {
1330 assert(tmp == xnoreg, "not used");
1331 if (!VM_Version::supports_avx512vl()) {
1332 vlen_enc = Assembler::AVX_512bit;
1333 }
1334 evpsravq(dst, src, shift, vlen_enc);
1335 } else {
1336 vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1337 vpsrlvq(dst, src, shift, vlen_enc);
1338 vpsrlvq(tmp, tmp, shift, vlen_enc);
1339 vpxor(dst, dst, tmp, vlen_enc);
1340 vpsubq(dst, dst, tmp, vlen_enc);
1341 }
1342 break;
1343 }
1344 case Op_LShiftVL: {
1345 assert(tmp == xnoreg, "not used");
1346 vpsllvq(dst, src, shift, vlen_enc);
1347 break;
1348 }
1349 case Op_URShiftVL: {
1350 assert(tmp == xnoreg, "not used");
1351 vpsrlvq(dst, src, shift, vlen_enc);
1352 break;
1353 }
1354 default: assert(false, "%s", NodeClassNames[opcode]);
1355 }
1356 }
1357
1358 // Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst
1359 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1360 assert(opcode == Op_LShiftVB ||
1361 opcode == Op_RShiftVB ||
1362 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1363 bool sign = (opcode != Op_URShiftVB);
1364 assert(vector_len == 0, "required");
1365 vextendbd(sign, dst, src, 1);
1366 vpmovzxbd(vtmp, shift, 1);
1367 varshiftd(opcode, dst, dst, vtmp, 1);
1368 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
1369 vextracti128_high(vtmp, dst);
1370 vpackusdw(dst, dst, vtmp, 0);
1371 }
1372
1373 // Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst
1374 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1375 assert(opcode == Op_LShiftVB ||
1376 opcode == Op_RShiftVB ||
1377 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1378 bool sign = (opcode != Op_URShiftVB);
1379 int ext_vector_len = vector_len + 1;
1380 vextendbw(sign, dst, src, ext_vector_len);
1381 vpmovzxbw(vtmp, shift, ext_vector_len);
1382 varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1383 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
1384 if (vector_len == 0) {
1385 vextracti128_high(vtmp, dst);
1386 vpackuswb(dst, dst, vtmp, vector_len);
1387 } else {
1388 vextracti64x4_high(vtmp, dst);
1389 vpackuswb(dst, dst, vtmp, vector_len);
1390 vpermq(dst, dst, 0xD8, vector_len);
1391 }
1392 }
1393
1394 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1395 switch(typ) {
1396 case T_BYTE:
1397 pinsrb(dst, val, idx);
1398 break;
1399 case T_SHORT:
1400 pinsrw(dst, val, idx);
1401 break;
1402 case T_INT:
1403 pinsrd(dst, val, idx);
1404 break;
1405 case T_LONG:
1406 pinsrq(dst, val, idx);
1407 break;
1408 default:
1409 assert(false,"Should not reach here.");
1410 break;
1411 }
1412 }
1413
1414 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1415 switch(typ) {
1416 case T_BYTE:
1417 vpinsrb(dst, src, val, idx);
1418 break;
1419 case T_SHORT:
1420 vpinsrw(dst, src, val, idx);
1421 break;
1422 case T_INT:
1423 vpinsrd(dst, src, val, idx);
1424 break;
1425 case T_LONG:
1426 vpinsrq(dst, src, val, idx);
1427 break;
1428 default:
1429 assert(false,"Should not reach here.");
1430 break;
1431 }
1432 }
1433
1434 void C2_MacroAssembler::vgather8b_masked(BasicType elem_bt, XMMRegister dst,
1435 Register base, Register idx_base,
1436 Register mask, Register mask_idx,
1437 Register rtmp, int vlen_enc) {
1438 vpxor(dst, dst, dst, vlen_enc);
1439 if (elem_bt == T_SHORT) {
1440 for (int i = 0; i < 4; i++) {
1441 // dst[i] = mask[i] ? src[idx_base[i]] : 0
1442 Label skip_load;
1443 btq(mask, mask_idx);
1444 jccb(Assembler::carryClear, skip_load);
1445 movl(rtmp, Address(idx_base, i * 4));
1446 pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1447 bind(skip_load);
1448 incq(mask_idx);
1449 }
1450 } else {
1451 assert(elem_bt == T_BYTE, "");
1452 for (int i = 0; i < 8; i++) {
1453 // dst[i] = mask[i] ? src[idx_base[i]] : 0
1454 Label skip_load;
1455 btq(mask, mask_idx);
1456 jccb(Assembler::carryClear, skip_load);
1457 movl(rtmp, Address(idx_base, i * 4));
1458 pinsrb(dst, Address(base, rtmp), i);
1459 bind(skip_load);
1460 incq(mask_idx);
1461 }
1462 }
1463 }
1464
1465 void C2_MacroAssembler::vgather8b(BasicType elem_bt, XMMRegister dst,
1466 Register base, Register idx_base,
1467 Register rtmp, int vlen_enc) {
1468 vpxor(dst, dst, dst, vlen_enc);
1469 if (elem_bt == T_SHORT) {
1470 for (int i = 0; i < 4; i++) {
1471 // dst[i] = src[idx_base[i]]
1472 movl(rtmp, Address(idx_base, i * 4));
1473 pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1474 }
1475 } else {
1476 assert(elem_bt == T_BYTE, "");
1477 for (int i = 0; i < 8; i++) {
1478 // dst[i] = src[idx_base[i]]
1479 movl(rtmp, Address(idx_base, i * 4));
1480 pinsrb(dst, Address(base, rtmp), i);
1481 }
1482 }
1483 }
1484
1485 /*
1486 * Gather using hybrid algorithm, first partially unroll scalar loop
1487 * to accumulate values from gather indices into a quad-word(64bit) slice.
1488 * A slice may hold 8 bytes or 4 short values. This is followed by a vector
1489 * permutation to place the slice into appropriate vector lane
1490 * locations in destination vector. Following pseudo code describes the
1491 * algorithm in detail:
1492 *
1493 * DST_VEC = ZERO_VEC
1494 * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..}
1495 * TWO_VEC = {2, 2, 2, 2, 2, 2, 2, 2, 2..}
1496 * FOREACH_ITER:
1497 * TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES
1498 * TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX
1499 * DST_VEC = DST_VEC OR TEMP_PERM_VEC
1500 * PERM_INDEX = PERM_INDEX - TWO_VEC
1501 *
1502 * With each iteration, doubleword permute indices (0,1) corresponding
1503 * to gathered quadword gets right shifted by two lane positions.
1504 *
1505 */
1506 void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst,
1507 Register base, Register idx_base,
1508 Register mask, XMMRegister xtmp1,
1509 XMMRegister xtmp2, XMMRegister temp_dst,
1510 Register rtmp, Register mask_idx,
1511 Register length, int vector_len, int vlen_enc) {
1512 Label GATHER8_LOOP;
1513 assert(is_subword_type(elem_ty), "");
1514 movl(length, vector_len);
1515 vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...}
1516 vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...}
1517 vallones(xtmp2, vlen_enc);
1518 vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc);
1519 vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...}
1520 load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...}
1521
1522 bind(GATHER8_LOOP);
1523 // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES
1524 if (mask == noreg) {
1525 vgather8b(elem_ty, temp_dst, base, idx_base, rtmp, vlen_enc);
1526 } else {
1527 vgather8b_masked(elem_ty, temp_dst, base, idx_base, mask, mask_idx, rtmp, vlen_enc);
1528 }
1529 // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1)
1530 vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? vlen_enc : Assembler::AVX_256bit);
1531 // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2)
1532 vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc);
1533 // DST_VEC = DST_VEC OR TEMP_PERM_VEC
1534 vpor(dst, dst, temp_dst, vlen_enc);
1535 addptr(idx_base, 32 >> (type2aelembytes(elem_ty) - 1));
1536 subl(length, 8 >> (type2aelembytes(elem_ty) - 1));
1537 jcc(Assembler::notEqual, GATHER8_LOOP);
1538 }
1539
1540 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1541 switch(typ) {
1542 case T_INT:
1543 vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1544 break;
1545 case T_FLOAT:
1546 vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1547 break;
1548 case T_LONG:
1549 vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1550 break;
1551 case T_DOUBLE:
1552 vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1553 break;
1554 default:
1555 assert(false,"Should not reach here.");
1556 break;
1557 }
1558 }
1559
1560 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1561 switch(typ) {
1562 case T_INT:
1563 evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1564 break;
1565 case T_FLOAT:
1566 evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1567 break;
1568 case T_LONG:
1569 evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1570 break;
1571 case T_DOUBLE:
1572 evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1573 break;
1574 default:
1575 assert(false,"Should not reach here.");
1576 break;
1577 }
1578 }
1579
1580 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1581 switch(typ) {
1582 case T_INT:
1583 evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1584 break;
1585 case T_FLOAT:
1586 evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1587 break;
1588 case T_LONG:
1589 evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1590 break;
1591 case T_DOUBLE:
1592 evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1593 break;
1594 default:
1595 assert(false,"Should not reach here.");
1596 break;
1597 }
1598 }
1599
1600 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
1601 if (vlen_in_bytes <= 16) {
1602 pxor (dst, dst);
1603 psubb(dst, src);
1604 switch (elem_bt) {
1605 case T_BYTE: /* nothing to do */ break;
1606 case T_SHORT: pmovsxbw(dst, dst); break;
1607 case T_INT: pmovsxbd(dst, dst); break;
1608 case T_FLOAT: pmovsxbd(dst, dst); break;
1609 case T_LONG: pmovsxbq(dst, dst); break;
1610 case T_DOUBLE: pmovsxbq(dst, dst); break;
1611
1612 default: assert(false, "%s", type2name(elem_bt));
1613 }
1614 } else {
1615 assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
1616 int vlen_enc = vector_length_encoding(vlen_in_bytes);
1617
1618 vpxor (dst, dst, dst, vlen_enc);
1619 vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
1620
1621 switch (elem_bt) {
1622 case T_BYTE: /* nothing to do */ break;
1623 case T_SHORT: vpmovsxbw(dst, dst, vlen_enc); break;
1624 case T_INT: vpmovsxbd(dst, dst, vlen_enc); break;
1625 case T_FLOAT: vpmovsxbd(dst, dst, vlen_enc); break;
1626 case T_LONG: vpmovsxbq(dst, dst, vlen_enc); break;
1627 case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1628
1629 default: assert(false, "%s", type2name(elem_bt));
1630 }
1631 }
1632 }
1633
1634 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
1635 if (novlbwdq) {
1636 vpmovsxbd(xtmp, src, vlen_enc);
1637 evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
1638 Assembler::eq, true, vlen_enc, noreg);
1639 } else {
1640 vpxor(xtmp, xtmp, xtmp, vlen_enc);
1641 vpsubb(xtmp, xtmp, src, vlen_enc);
1642 evpmovb2m(dst, xtmp, vlen_enc);
1643 }
1644 }
1645
1646 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, Address src, int vlen_in_bytes) {
1647 if (is_integral_type(bt)) {
1648 switch (vlen_in_bytes) {
1649 case 4: movdl(dst, src); break;
1650 case 8: movq(dst, src); break;
1651 case 16: movdqu(dst, src); break;
1652 case 32: vmovdqu(dst, src); break;
1653 case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
1654 default: ShouldNotReachHere();
1655 }
1656 } else {
1657 switch (vlen_in_bytes) {
1658 case 4: movflt(dst, src); break;
1659 case 8: movdbl(dst, src); break;
1660 case 16: movups(dst, src); break;
1661 case 32: vmovups(dst, src, Assembler::AVX_256bit); break;
1662 case 64: vmovups(dst, src, Assembler::AVX_512bit); break;
1663 default: ShouldNotReachHere();
1664 }
1665 }
1666 }
1667
1668 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
1669 assert(rscratch != noreg || always_reachable(src), "missing");
1670
1671 if (reachable(src)) {
1672 load_vector(bt, dst, as_Address(src), vlen_in_bytes);
1673 } else {
1674 lea(rscratch, src);
1675 load_vector(bt, dst, Address(rscratch, 0), vlen_in_bytes);
1676 }
1677 }
1678
1679 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
1680 int vlen_enc = vector_length_encoding(vlen);
1681 if (VM_Version::supports_avx()) {
1682 if (bt == T_LONG) {
1683 if (VM_Version::supports_avx2()) {
1684 vpbroadcastq(dst, src, vlen_enc);
1685 } else {
1686 vmovddup(dst, src, vlen_enc);
1687 }
1688 } else if (bt == T_DOUBLE) {
1689 if (vlen_enc != Assembler::AVX_128bit) {
1690 vbroadcastsd(dst, src, vlen_enc, noreg);
1691 } else {
1692 vmovddup(dst, src, vlen_enc);
1693 }
1694 } else {
1695 if (VM_Version::supports_avx2() && is_integral_type(bt)) {
1696 vpbroadcastd(dst, src, vlen_enc);
1697 } else {
1698 vbroadcastss(dst, src, vlen_enc);
1699 }
1700 }
1701 } else if (VM_Version::supports_sse3()) {
1702 movddup(dst, src);
1703 } else {
1704 load_vector(bt, dst, src, vlen);
1705 }
1706 }
1707
1708 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
1709 int entry_idx = vector_iota_entry_index(bt);
1710 ExternalAddress addr(StubRoutines::x86::vector_iota_indices(entry_idx));
1711 load_vector(T_BYTE, dst, addr, vlen_in_bytes);
1712 }
1713
1714 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
1715
1716 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1717 int vector_len = Assembler::AVX_128bit;
1718
1719 switch (opcode) {
1720 case Op_AndReductionV: pand(dst, src); break;
1721 case Op_OrReductionV: por (dst, src); break;
1722 case Op_XorReductionV: pxor(dst, src); break;
1723 case Op_MinReductionV:
1724 switch (typ) {
1725 case T_BYTE: pminsb(dst, src); break;
1726 case T_SHORT: pminsw(dst, src); break;
1727 case T_INT: pminsd(dst, src); break;
1728 case T_LONG: assert(UseAVX > 2, "required");
1729 vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1730 default: assert(false, "wrong type");
1731 }
1732 break;
1733 case Op_MaxReductionV:
1734 switch (typ) {
1735 case T_BYTE: pmaxsb(dst, src); break;
1736 case T_SHORT: pmaxsw(dst, src); break;
1737 case T_INT: pmaxsd(dst, src); break;
1738 case T_LONG: assert(UseAVX > 2, "required");
1739 vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
1740 default: assert(false, "wrong type");
1741 }
1742 break;
1743 case Op_UMinReductionV:
1744 switch (typ) {
1745 case T_BYTE: vpminub(dst, dst, src, Assembler::AVX_128bit); break;
1746 case T_SHORT: vpminuw(dst, dst, src, Assembler::AVX_128bit); break;
1747 case T_INT: vpminud(dst, dst, src, Assembler::AVX_128bit); break;
1748 case T_LONG: evpminuq(dst, k0, dst, src, true, Assembler::AVX_128bit); break;
1749 default: assert(false, "wrong type");
1750 }
1751 break;
1752 case Op_UMaxReductionV:
1753 switch (typ) {
1754 case T_BYTE: vpmaxub(dst, dst, src, Assembler::AVX_128bit); break;
1755 case T_SHORT: vpmaxuw(dst, dst, src, Assembler::AVX_128bit); break;
1756 case T_INT: vpmaxud(dst, dst, src, Assembler::AVX_128bit); break;
1757 case T_LONG: evpmaxuq(dst, k0, dst, src, true, Assembler::AVX_128bit); break;
1758 default: assert(false, "wrong type");
1759 }
1760 break;
1761 case Op_AddReductionVF: addss(dst, src); break;
1762 case Op_AddReductionVD: addsd(dst, src); break;
1763 case Op_AddReductionVI:
1764 switch (typ) {
1765 case T_BYTE: paddb(dst, src); break;
1766 case T_SHORT: paddw(dst, src); break;
1767 case T_INT: paddd(dst, src); break;
1768 default: assert(false, "wrong type");
1769 }
1770 break;
1771 case Op_AddReductionVL: paddq(dst, src); break;
1772 case Op_MulReductionVF: mulss(dst, src); break;
1773 case Op_MulReductionVD: mulsd(dst, src); break;
1774 case Op_MulReductionVI:
1775 switch (typ) {
1776 case T_SHORT: pmullw(dst, src); break;
1777 case T_INT: pmulld(dst, src); break;
1778 default: assert(false, "wrong type");
1779 }
1780 break;
1781 case Op_MulReductionVL: assert(UseAVX > 2, "required");
1782 evpmullq(dst, dst, src, vector_len); break;
1783 default: assert(false, "wrong opcode");
1784 }
1785 }
1786
1787 void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1788 switch (opcode) {
1789 case Op_AddReductionVF: addps(dst, src); break;
1790 case Op_AddReductionVD: addpd(dst, src); break;
1791 case Op_MulReductionVF: mulps(dst, src); break;
1792 case Op_MulReductionVD: mulpd(dst, src); break;
1793 default: assert(false, "%s", NodeClassNames[opcode]);
1794 }
1795 }
1796
1797 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
1798 int vector_len = Assembler::AVX_256bit;
1799
1800 switch (opcode) {
1801 case Op_AndReductionV: vpand(dst, src1, src2, vector_len); break;
1802 case Op_OrReductionV: vpor (dst, src1, src2, vector_len); break;
1803 case Op_XorReductionV: vpxor(dst, src1, src2, vector_len); break;
1804 case Op_MinReductionV:
1805 switch (typ) {
1806 case T_BYTE: vpminsb(dst, src1, src2, vector_len); break;
1807 case T_SHORT: vpminsw(dst, src1, src2, vector_len); break;
1808 case T_INT: vpminsd(dst, src1, src2, vector_len); break;
1809 case T_LONG: assert(UseAVX > 2, "required");
1810 vpminsq(dst, src1, src2, vector_len); break;
1811 default: assert(false, "wrong type");
1812 }
1813 break;
1814 case Op_MaxReductionV:
1815 switch (typ) {
1816 case T_BYTE: vpmaxsb(dst, src1, src2, vector_len); break;
1817 case T_SHORT: vpmaxsw(dst, src1, src2, vector_len); break;
1818 case T_INT: vpmaxsd(dst, src1, src2, vector_len); break;
1819 case T_LONG: assert(UseAVX > 2, "required");
1820 vpmaxsq(dst, src1, src2, vector_len); break;
1821 default: assert(false, "wrong type");
1822 }
1823 break;
1824 case Op_UMinReductionV:
1825 switch (typ) {
1826 case T_BYTE: vpminub(dst, src1, src2, vector_len); break;
1827 case T_SHORT: vpminuw(dst, src1, src2, vector_len); break;
1828 case T_INT: vpminud(dst, src1, src2, vector_len); break;
1829 case T_LONG: evpminuq(dst, k0, src1, src2, true, vector_len); break;
1830 default: assert(false, "wrong type");
1831 }
1832 break;
1833 case Op_UMaxReductionV:
1834 switch (typ) {
1835 case T_BYTE: vpmaxub(dst, src1, src2, vector_len); break;
1836 case T_SHORT: vpmaxuw(dst, src1, src2, vector_len); break;
1837 case T_INT: vpmaxud(dst, src1, src2, vector_len); break;
1838 case T_LONG: evpmaxuq(dst, k0, src1, src2, true, vector_len); break;
1839 default: assert(false, "wrong type");
1840 }
1841 break;
1842 case Op_AddReductionVI:
1843 switch (typ) {
1844 case T_BYTE: vpaddb(dst, src1, src2, vector_len); break;
1845 case T_SHORT: vpaddw(dst, src1, src2, vector_len); break;
1846 case T_INT: vpaddd(dst, src1, src2, vector_len); break;
1847 default: assert(false, "wrong type");
1848 }
1849 break;
1850 case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
1851 case Op_MulReductionVI:
1852 switch (typ) {
1853 case T_SHORT: vpmullw(dst, src1, src2, vector_len); break;
1854 case T_INT: vpmulld(dst, src1, src2, vector_len); break;
1855 default: assert(false, "wrong type");
1856 }
1857 break;
1858 case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break;
1859 default: assert(false, "wrong opcode");
1860 }
1861 }
1862
1863 void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
1864 int vector_len = Assembler::AVX_256bit;
1865
1866 switch (opcode) {
1867 case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break;
1868 case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break;
1869 case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break;
1870 case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break;
1871 default: assert(false, "%s", NodeClassNames[opcode]);
1872 }
1873 }
1874
1875 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1876 XMMRegister dst, XMMRegister src,
1877 XMMRegister vtmp1, XMMRegister vtmp2) {
1878 switch (opcode) {
1879 case Op_AddReductionVF:
1880 case Op_MulReductionVF:
1881 reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1882 break;
1883
1884 case Op_AddReductionVD:
1885 case Op_MulReductionVD:
1886 reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1887 break;
1888
1889 default: assert(false, "wrong opcode");
1890 }
1891 }
1892
1893 void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen,
1894 XMMRegister dst, XMMRegister src,
1895 XMMRegister vtmp1, XMMRegister vtmp2) {
1896 switch (opcode) {
1897 case Op_AddReductionVF:
1898 case Op_MulReductionVF:
1899 unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1900 break;
1901
1902 case Op_AddReductionVD:
1903 case Op_MulReductionVD:
1904 unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1905 break;
1906
1907 default: assert(false, "%s", NodeClassNames[opcode]);
1908 }
1909 }
1910
1911 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1912 Register dst, Register src1, XMMRegister src2,
1913 XMMRegister vtmp1, XMMRegister vtmp2) {
1914 switch (vlen) {
1915 case 8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1916 case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1917 case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1918 case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1919
1920 default: assert(false, "wrong vector length");
1921 }
1922 }
1923
1924 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
1925 Register dst, Register src1, XMMRegister src2,
1926 XMMRegister vtmp1, XMMRegister vtmp2) {
1927 switch (vlen) {
1928 case 8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1929 case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1930 case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1931 case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1932
1933 default: assert(false, "wrong vector length");
1934 }
1935 }
1936
1937 void C2_MacroAssembler::reduceS(int opcode, int vlen,
1938 Register dst, Register src1, XMMRegister src2,
1939 XMMRegister vtmp1, XMMRegister vtmp2) {
1940 switch (vlen) {
1941 case 4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1942 case 8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1943 case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1944 case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1945
1946 default: assert(false, "wrong vector length");
1947 }
1948 }
1949
1950 void C2_MacroAssembler::reduceI(int opcode, int vlen,
1951 Register dst, Register src1, XMMRegister src2,
1952 XMMRegister vtmp1, XMMRegister vtmp2) {
1953 switch (vlen) {
1954 case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1955 case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1956 case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1957 case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1958
1959 default: assert(false, "wrong vector length");
1960 }
1961 }
1962
1963 void C2_MacroAssembler::reduceL(int opcode, int vlen,
1964 Register dst, Register src1, XMMRegister src2,
1965 XMMRegister vtmp1, XMMRegister vtmp2) {
1966 switch (vlen) {
1967 case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1968 case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1969 case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1970
1971 default: assert(false, "wrong vector length");
1972 }
1973 }
1974
1975 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1976 switch (vlen) {
1977 case 2:
1978 assert(vtmp2 == xnoreg, "");
1979 reduce2F(opcode, dst, src, vtmp1);
1980 break;
1981 case 4:
1982 assert(vtmp2 == xnoreg, "");
1983 reduce4F(opcode, dst, src, vtmp1);
1984 break;
1985 case 8:
1986 reduce8F(opcode, dst, src, vtmp1, vtmp2);
1987 break;
1988 case 16:
1989 reduce16F(opcode, dst, src, vtmp1, vtmp2);
1990 break;
1991 default: assert(false, "wrong vector length");
1992 }
1993 }
1994
1995 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1996 switch (vlen) {
1997 case 2:
1998 assert(vtmp2 == xnoreg, "");
1999 reduce2D(opcode, dst, src, vtmp1);
2000 break;
2001 case 4:
2002 reduce4D(opcode, dst, src, vtmp1, vtmp2);
2003 break;
2004 case 8:
2005 reduce8D(opcode, dst, src, vtmp1, vtmp2);
2006 break;
2007 default: assert(false, "wrong vector length");
2008 }
2009 }
2010
2011 void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2012 switch (vlen) {
2013 case 2:
2014 assert(vtmp1 == xnoreg, "");
2015 assert(vtmp2 == xnoreg, "");
2016 unorderedReduce2F(opcode, dst, src);
2017 break;
2018 case 4:
2019 assert(vtmp2 == xnoreg, "");
2020 unorderedReduce4F(opcode, dst, src, vtmp1);
2021 break;
2022 case 8:
2023 unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2);
2024 break;
2025 case 16:
2026 unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2);
2027 break;
2028 default: assert(false, "wrong vector length");
2029 }
2030 }
2031
2032 void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2033 switch (vlen) {
2034 case 2:
2035 assert(vtmp1 == xnoreg, "");
2036 assert(vtmp2 == xnoreg, "");
2037 unorderedReduce2D(opcode, dst, src);
2038 break;
2039 case 4:
2040 assert(vtmp2 == xnoreg, "");
2041 unorderedReduce4D(opcode, dst, src, vtmp1);
2042 break;
2043 case 8:
2044 unorderedReduce8D(opcode, dst, src, vtmp1, vtmp2);
2045 break;
2046 default: assert(false, "wrong vector length");
2047 }
2048 }
2049
2050 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2051 if (opcode == Op_AddReductionVI) {
2052 if (vtmp1 != src2) {
2053 movdqu(vtmp1, src2);
2054 }
2055 phaddd(vtmp1, vtmp1);
2056 } else {
2057 pshufd(vtmp1, src2, 0x1);
2058 reduce_operation_128(T_INT, opcode, vtmp1, src2);
2059 }
2060 movdl(vtmp2, src1);
2061 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2062 movdl(dst, vtmp1);
2063 }
2064
2065 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2066 if (opcode == Op_AddReductionVI) {
2067 if (vtmp1 != src2) {
2068 movdqu(vtmp1, src2);
2069 }
2070 phaddd(vtmp1, src2);
2071 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2072 } else {
2073 pshufd(vtmp2, src2, 0xE);
2074 reduce_operation_128(T_INT, opcode, vtmp2, src2);
2075 reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2076 }
2077 }
2078
2079 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2080 if (opcode == Op_AddReductionVI) {
2081 vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
2082 vextracti128_high(vtmp2, vtmp1);
2083 vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
2084 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2085 } else {
2086 vextracti128_high(vtmp1, src2);
2087 reduce_operation_128(T_INT, opcode, vtmp1, src2);
2088 reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2089 }
2090 }
2091
2092 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2093 vextracti64x4_high(vtmp2, src2);
2094 reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
2095 reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2096 }
2097
2098 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2099 pshufd(vtmp2, src2, 0x1);
2100 reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2101 movdqu(vtmp1, vtmp2);
2102 psrldq(vtmp1, 2);
2103 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2104 movdqu(vtmp2, vtmp1);
2105 psrldq(vtmp2, 1);
2106 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2107 movdl(vtmp2, src1);
2108 if (opcode == Op_UMinReductionV || opcode == Op_UMaxReductionV) {
2109 pmovzxbd(vtmp1, vtmp1);
2110 } else {
2111 pmovsxbd(vtmp1, vtmp1);
2112 }
2113 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2114 pextrb(dst, vtmp1, 0x0);
2115 movsbl(dst, dst);
2116 }
2117
2118 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2119 pshufd(vtmp1, src2, 0xE);
2120 reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
2121 reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2122 }
2123
2124 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2125 vextracti128_high(vtmp2, src2);
2126 reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2127 reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2128 }
2129
2130 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2131 vextracti64x4_high(vtmp1, src2);
2132 reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
2133 reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2134 }
2135
2136 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2137 pmovsxbw(vtmp2, src2);
2138 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2139 }
2140
2141 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2142 if (UseAVX > 1) {
2143 int vector_len = Assembler::AVX_256bit;
2144 vpmovsxbw(vtmp1, src2, vector_len);
2145 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2146 } else {
2147 pmovsxbw(vtmp2, src2);
2148 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2149 pshufd(vtmp2, src2, 0xe);
2150 pmovsxbw(vtmp2, vtmp2);
2151 reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2152 }
2153 }
2154
2155 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2156 if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
2157 int vector_len = Assembler::AVX_512bit;
2158 vpmovsxbw(vtmp1, src2, vector_len);
2159 reduce32S(opcode, dst, src1, vtmp1, vtmp2, vtmp1);
2160 } else {
2161 assert(UseAVX >= 2,"Should not reach here.");
2162 mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
2163 vextracti128_high(vtmp2, src2);
2164 mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2165 }
2166 }
2167
2168 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2169 mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
2170 vextracti64x4_high(vtmp2, src2);
2171 mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2172 }
2173
2174 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2175 if (opcode == Op_AddReductionVI) {
2176 if (vtmp1 != src2) {
2177 movdqu(vtmp1, src2);
2178 }
2179 phaddw(vtmp1, vtmp1);
2180 phaddw(vtmp1, vtmp1);
2181 } else {
2182 pshufd(vtmp2, src2, 0x1);
2183 reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2184 movdqu(vtmp1, vtmp2);
2185 psrldq(vtmp1, 2);
2186 reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
2187 }
2188 movdl(vtmp2, src1);
2189 if (opcode == Op_UMinReductionV || opcode == Op_UMaxReductionV) {
2190 pmovzxwd(vtmp1, vtmp1);
2191 } else {
2192 pmovsxwd(vtmp1, vtmp1);
2193 }
2194 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2195 pextrw(dst, vtmp1, 0x0);
2196 movswl(dst, dst);
2197 }
2198
2199 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2200 if (opcode == Op_AddReductionVI) {
2201 if (vtmp1 != src2) {
2202 movdqu(vtmp1, src2);
2203 }
2204 phaddw(vtmp1, src2);
2205 } else {
2206 assert_different_registers(src2, vtmp1);
2207 pshufd(vtmp1, src2, 0xE);
2208 reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
2209 }
2210 reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2211 }
2212
2213 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2214 if (opcode == Op_AddReductionVI) {
2215 int vector_len = Assembler::AVX_256bit;
2216 vphaddw(vtmp2, src2, src2, vector_len);
2217 vpermq(vtmp2, vtmp2, 0xD8, vector_len);
2218 } else {
2219 assert_different_registers(src2, vtmp2);
2220 vextracti128_high(vtmp2, src2);
2221 reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2222 }
2223 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2224 }
2225
2226 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2227 assert_different_registers(src2, vtmp1);
2228 int vector_len = Assembler::AVX_256bit;
2229 vextracti64x4_high(vtmp1, src2);
2230 reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2231 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2232 }
2233
2234 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2235 pshufd(vtmp2, src2, 0xE);
2236 reduce_operation_128(T_LONG, opcode, vtmp2, src2);
2237 movdq(vtmp1, src1);
2238 reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
2239 movdq(dst, vtmp1);
2240 }
2241
2242 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2243 vextracti128_high(vtmp1, src2);
2244 reduce_operation_128(T_LONG, opcode, vtmp1, src2);
2245 reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2246 }
2247
2248 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2249 vextracti64x4_high(vtmp2, src2);
2250 reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
2251 reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2252 }
2253
2254 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
2255 mov64(temp, -1L);
2256 bzhiq(temp, temp, len);
2257 kmovql(dst, temp);
2258 }
2259
2260 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2261 reduce_operation_128(T_FLOAT, opcode, dst, src);
2262 pshufd(vtmp, src, 0x1);
2263 reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2264 }
2265
2266 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2267 reduce2F(opcode, dst, src, vtmp);
2268 pshufd(vtmp, src, 0x2);
2269 reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2270 pshufd(vtmp, src, 0x3);
2271 reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2272 }
2273
2274 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2275 reduce4F(opcode, dst, src, vtmp2);
2276 vextractf128_high(vtmp2, src);
2277 reduce4F(opcode, dst, vtmp2, vtmp1);
2278 }
2279
2280 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2281 reduce8F(opcode, dst, src, vtmp1, vtmp2);
2282 vextracti64x4_high(vtmp1, src);
2283 reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
2284 }
2285
2286 void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) {
2287 pshufd(dst, src, 0x1);
2288 reduce_operation_128(T_FLOAT, opcode, dst, src);
2289 }
2290
2291 void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2292 pshufd(vtmp, src, 0xE);
2293 unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src);
2294 unorderedReduce2F(opcode, dst, vtmp);
2295 }
2296
2297 void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2298 vextractf128_high(vtmp1, src);
2299 unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src);
2300 unorderedReduce4F(opcode, dst, vtmp1, vtmp2);
2301 }
2302
2303 void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2304 vextractf64x4_high(vtmp2, src);
2305 unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src);
2306 unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2);
2307 }
2308
2309 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2310 reduce_operation_128(T_DOUBLE, opcode, dst, src);
2311 pshufd(vtmp, src, 0xE);
2312 reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
2313 }
2314
2315 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2316 reduce2D(opcode, dst, src, vtmp2);
2317 vextractf128_high(vtmp2, src);
2318 reduce2D(opcode, dst, vtmp2, vtmp1);
2319 }
2320
2321 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2322 reduce4D(opcode, dst, src, vtmp1, vtmp2);
2323 vextracti64x4_high(vtmp1, src);
2324 reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
2325 }
2326
2327 void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) {
2328 pshufd(dst, src, 0xE);
2329 reduce_operation_128(T_DOUBLE, opcode, dst, src);
2330 }
2331
2332 void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2333 vextractf128_high(vtmp, src);
2334 unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src);
2335 unorderedReduce2D(opcode, dst, vtmp);
2336 }
2337
2338 void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2339 vextractf64x4_high(vtmp2, src);
2340 unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src);
2341 unorderedReduce4D(opcode, dst, vtmp2, vtmp1);
2342 }
2343
2344 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
2345 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2346 }
2347
2348 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
2349 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2350 }
2351
2352 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len) {
2353 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2354 }
2355
2356 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
2357 int vec_enc) {
2358 switch(elem_bt) {
2359 case T_INT:
2360 case T_FLOAT:
2361 vmaskmovps(dst, src, mask, vec_enc);
2362 break;
2363 case T_LONG:
2364 case T_DOUBLE:
2365 vmaskmovpd(dst, src, mask, vec_enc);
2366 break;
2367 default:
2368 fatal("Unsupported type %s", type2name(elem_bt));
2369 break;
2370 }
2371 }
2372
2373 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
2374 int vec_enc) {
2375 switch(elem_bt) {
2376 case T_INT:
2377 case T_FLOAT:
2378 vmaskmovps(dst, src, mask, vec_enc);
2379 break;
2380 case T_LONG:
2381 case T_DOUBLE:
2382 vmaskmovpd(dst, src, mask, vec_enc);
2383 break;
2384 default:
2385 fatal("Unsupported type %s", type2name(elem_bt));
2386 break;
2387 }
2388 }
2389
2390 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
2391 XMMRegister dst, XMMRegister src,
2392 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2393 XMMRegister xmm_0, XMMRegister xmm_1) {
2394 const int permconst[] = {1, 14};
2395 XMMRegister wsrc = src;
2396 XMMRegister wdst = xmm_0;
2397 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2398
2399 int vlen_enc = Assembler::AVX_128bit;
2400 if (vlen == 16) {
2401 vlen_enc = Assembler::AVX_256bit;
2402 }
2403
2404 for (int i = log2(vlen) - 1; i >=0; i--) {
2405 if (i == 0 && !is_dst_valid) {
2406 wdst = dst;
2407 }
2408 if (i == 3) {
2409 vextracti64x4_high(wtmp, wsrc);
2410 } else if (i == 2) {
2411 vextracti128_high(wtmp, wsrc);
2412 } else { // i = [0,1]
2413 vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
2414 }
2415
2416 if (VM_Version::supports_avx10_2()) {
2417 vminmax_fp_avx10_2(opcode, T_FLOAT, wdst, k0, wtmp, wsrc, vlen_enc);
2418 } else {
2419 vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2420 }
2421 wsrc = wdst;
2422 vlen_enc = Assembler::AVX_128bit;
2423 }
2424 if (is_dst_valid) {
2425 if (VM_Version::supports_avx10_2()) {
2426 vminmax_fp_avx10_2(opcode, T_FLOAT, dst, k0, wdst, dst, Assembler::AVX_128bit);
2427 } else {
2428 vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2429 }
2430 }
2431 }
2432
2433 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
2434 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2435 XMMRegister xmm_0, XMMRegister xmm_1) {
2436 XMMRegister wsrc = src;
2437 XMMRegister wdst = xmm_0;
2438 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2439 int vlen_enc = Assembler::AVX_128bit;
2440 if (vlen == 8) {
2441 vlen_enc = Assembler::AVX_256bit;
2442 }
2443 for (int i = log2(vlen) - 1; i >=0; i--) {
2444 if (i == 0 && !is_dst_valid) {
2445 wdst = dst;
2446 }
2447 if (i == 1) {
2448 vextracti128_high(wtmp, wsrc);
2449 } else if (i == 2) {
2450 vextracti64x4_high(wtmp, wsrc);
2451 } else {
2452 assert(i == 0, "%d", i);
2453 vpermilpd(wtmp, wsrc, 1, vlen_enc);
2454 }
2455
2456 if (VM_Version::supports_avx10_2()) {
2457 vminmax_fp_avx10_2(opcode, T_DOUBLE, wdst, k0, wtmp, wsrc, vlen_enc);
2458 } else {
2459 vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2460 }
2461
2462 wsrc = wdst;
2463 vlen_enc = Assembler::AVX_128bit;
2464 }
2465
2466 if (is_dst_valid) {
2467 if (VM_Version::supports_avx10_2()) {
2468 vminmax_fp_avx10_2(opcode, T_DOUBLE, dst, k0, wdst, dst, Assembler::AVX_128bit);
2469 } else {
2470 vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2471 }
2472 }
2473 }
2474
2475 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2476 switch (bt) {
2477 case T_BYTE: pextrb(dst, src, idx); break;
2478 case T_SHORT: pextrw(dst, src, idx); break;
2479 case T_INT: pextrd(dst, src, idx); break;
2480 case T_LONG: pextrq(dst, src, idx); break;
2481
2482 default:
2483 assert(false,"Should not reach here.");
2484 break;
2485 }
2486 }
2487
2488 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2489 int esize = type2aelembytes(typ);
2490 int elem_per_lane = 16/esize;
2491 int lane = elemindex / elem_per_lane;
2492 int eindex = elemindex % elem_per_lane;
2493
2494 if (lane >= 2) {
2495 assert(UseAVX > 2, "required");
2496 vextractf32x4(dst, src, lane & 3);
2497 return dst;
2498 } else if (lane > 0) {
2499 assert(UseAVX > 0, "required");
2500 vextractf128(dst, src, lane);
2501 return dst;
2502 } else {
2503 return src;
2504 }
2505 }
2506
2507 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) {
2508 if (typ == T_BYTE) {
2509 movsbl(dst, dst);
2510 } else if (typ == T_SHORT) {
2511 movswl(dst, dst);
2512 }
2513 }
2514
2515 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2516 int esize = type2aelembytes(typ);
2517 int elem_per_lane = 16/esize;
2518 int eindex = elemindex % elem_per_lane;
2519 assert(is_integral_type(typ),"required");
2520
2521 if (eindex == 0) {
2522 if (typ == T_LONG) {
2523 movq(dst, src);
2524 } else {
2525 movdl(dst, src);
2526 movsxl(typ, dst);
2527 }
2528 } else {
2529 extract(typ, dst, src, eindex);
2530 movsxl(typ, dst);
2531 }
2532 }
2533
2534 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
2535 int esize = type2aelembytes(typ);
2536 int elem_per_lane = 16/esize;
2537 int eindex = elemindex % elem_per_lane;
2538 assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
2539
2540 if (eindex == 0) {
2541 movq(dst, src);
2542 } else {
2543 if (typ == T_FLOAT) {
2544 if (UseAVX == 0) {
2545 movdqu(dst, src);
2546 shufps(dst, dst, eindex);
2547 } else {
2548 vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2549 }
2550 } else {
2551 if (UseAVX == 0) {
2552 movdqu(dst, src);
2553 psrldq(dst, eindex*esize);
2554 } else {
2555 vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2556 }
2557 movq(dst, dst);
2558 }
2559 }
2560 // Zero upper bits
2561 if (typ == T_FLOAT) {
2562 if (UseAVX == 0) {
2563 assert(vtmp != xnoreg, "required.");
2564 movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
2565 pand(dst, vtmp);
2566 } else {
2567 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
2568 }
2569 }
2570 }
2571
2572 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2573 switch(typ) {
2574 case T_BYTE:
2575 case T_BOOLEAN:
2576 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2577 break;
2578 case T_SHORT:
2579 case T_CHAR:
2580 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2581 break;
2582 case T_INT:
2583 case T_FLOAT:
2584 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2585 break;
2586 case T_LONG:
2587 case T_DOUBLE:
2588 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2589 break;
2590 default:
2591 assert(false,"Should not reach here.");
2592 break;
2593 }
2594 }
2595
2596 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
2597 assert(rscratch != noreg || always_reachable(src2), "missing");
2598
2599 switch(typ) {
2600 case T_BOOLEAN:
2601 case T_BYTE:
2602 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2603 break;
2604 case T_CHAR:
2605 case T_SHORT:
2606 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2607 break;
2608 case T_INT:
2609 case T_FLOAT:
2610 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2611 break;
2612 case T_LONG:
2613 case T_DOUBLE:
2614 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2615 break;
2616 default:
2617 assert(false,"Should not reach here.");
2618 break;
2619 }
2620 }
2621
2622 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2623 switch(typ) {
2624 case T_BYTE:
2625 evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2626 break;
2627 case T_SHORT:
2628 evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2629 break;
2630 case T_INT:
2631 case T_FLOAT:
2632 evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2633 break;
2634 case T_LONG:
2635 case T_DOUBLE:
2636 evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2637 break;
2638 default:
2639 assert(false,"Should not reach here.");
2640 break;
2641 }
2642 }
2643
2644 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) {
2645 assert(vlen_in_bytes <= 32, "");
2646 int esize = type2aelembytes(bt);
2647 if (vlen_in_bytes == 32) {
2648 assert(vtmp == xnoreg, "required.");
2649 if (esize >= 4) {
2650 vtestps(src1, src2, AVX_256bit);
2651 } else {
2652 vptest(src1, src2, AVX_256bit);
2653 }
2654 return;
2655 }
2656 if (vlen_in_bytes < 16) {
2657 // Duplicate the lower part to fill the whole register,
2658 // Don't need to do so for src2
2659 assert(vtmp != xnoreg, "required");
2660 int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04;
2661 pshufd(vtmp, src1, shuffle_imm);
2662 } else {
2663 assert(vtmp == xnoreg, "required");
2664 vtmp = src1;
2665 }
2666 if (esize >= 4 && VM_Version::supports_avx()) {
2667 vtestps(vtmp, src2, AVX_128bit);
2668 } else {
2669 ptest(vtmp, src2);
2670 }
2671 }
2672
2673 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
2674 #ifdef ASSERT
2675 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2676 bool is_bw_supported = VM_Version::supports_avx512bw();
2677 if (is_bw && !is_bw_supported) {
2678 assert(vlen_enc != Assembler::AVX_512bit, "required");
2679 assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
2680 "XMM register should be 0-15");
2681 }
2682 #endif // ASSERT
2683 switch (elem_bt) {
2684 case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
2685 case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
2686 case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
2687 case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
2688 case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
2689 case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
2690 default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2691 }
2692 }
2693
2694 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
2695 assert(UseAVX >= 2, "required");
2696 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2697 bool is_vl = vlen_enc != Assembler::AVX_512bit;
2698 if ((UseAVX > 2) &&
2699 (!is_bw || VM_Version::supports_avx512bw()) &&
2700 (!is_vl || VM_Version::supports_avx512vl())) {
2701 switch (elem_bt) {
2702 case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
2703 case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
2704 case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
2705 case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
2706 default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2707 }
2708 } else {
2709 assert(vlen_enc != Assembler::AVX_512bit, "required");
2710 assert((dst->encoding() < 16),"XMM register should be 0-15");
2711 switch (elem_bt) {
2712 case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
2713 case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
2714 case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
2715 case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
2716 case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
2717 case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
2718 default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2719 }
2720 }
2721 }
2722
2723 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
2724 switch (to_elem_bt) {
2725 case T_SHORT:
2726 vpmovsxbw(dst, src, vlen_enc);
2727 break;
2728 case T_INT:
2729 vpmovsxbd(dst, src, vlen_enc);
2730 break;
2731 case T_FLOAT:
2732 vpmovsxbd(dst, src, vlen_enc);
2733 vcvtdq2ps(dst, dst, vlen_enc);
2734 break;
2735 case T_LONG:
2736 vpmovsxbq(dst, src, vlen_enc);
2737 break;
2738 case T_DOUBLE: {
2739 int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
2740 vpmovsxbd(dst, src, mid_vlen_enc);
2741 vcvtdq2pd(dst, dst, vlen_enc);
2742 break;
2743 }
2744 default:
2745 fatal("Unsupported type %s", type2name(to_elem_bt));
2746 break;
2747 }
2748 }
2749
2750 //-------------------------------------------------------------------------------------------
2751
2752 // IndexOf for constant substrings with size >= 8 chars
2753 // which don't need to be loaded through stack.
2754 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2755 Register cnt1, Register cnt2,
2756 int int_cnt2, Register result,
2757 XMMRegister vec, Register tmp,
2758 int ae) {
2759 ShortBranchVerifier sbv(this);
2760 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2761 assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2762
2763 // This method uses the pcmpestri instruction with bound registers
2764 // inputs:
2765 // xmm - substring
2766 // rax - substring length (elements count)
2767 // mem - scanned string
2768 // rdx - string length (elements count)
2769 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2770 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2771 // outputs:
2772 // rcx - matched index in string
2773 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2774 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2775 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2776 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2777 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2778
2779 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2780 RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2781 MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2782
2783 // Note, inline_string_indexOf() generates checks:
2784 // if (substr.count > string.count) return -1;
2785 // if (substr.count == 0) return 0;
2786 assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
2787
2788 // Load substring.
2789 if (ae == StrIntrinsicNode::UL) {
2790 pmovzxbw(vec, Address(str2, 0));
2791 } else {
2792 movdqu(vec, Address(str2, 0));
2793 }
2794 movl(cnt2, int_cnt2);
2795 movptr(result, str1); // string addr
2796
2797 if (int_cnt2 > stride) {
2798 jmpb(SCAN_TO_SUBSTR);
2799
2800 // Reload substr for rescan, this code
2801 // is executed only for large substrings (> 8 chars)
2802 bind(RELOAD_SUBSTR);
2803 if (ae == StrIntrinsicNode::UL) {
2804 pmovzxbw(vec, Address(str2, 0));
2805 } else {
2806 movdqu(vec, Address(str2, 0));
2807 }
2808 negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2809
2810 bind(RELOAD_STR);
2811 // We came here after the beginning of the substring was
2812 // matched but the rest of it was not so we need to search
2813 // again. Start from the next element after the previous match.
2814
2815 // cnt2 is number of substring reminding elements and
2816 // cnt1 is number of string reminding elements when cmp failed.
2817 // Restored cnt1 = cnt1 - cnt2 + int_cnt2
2818 subl(cnt1, cnt2);
2819 addl(cnt1, int_cnt2);
2820 movl(cnt2, int_cnt2); // Now restore cnt2
2821
2822 decrementl(cnt1); // Shift to next element
2823 cmpl(cnt1, cnt2);
2824 jcc(Assembler::negative, RET_NOT_FOUND); // Left less then substring
2825
2826 addptr(result, (1<<scale1));
2827
2828 } // (int_cnt2 > 8)
2829
2830 // Scan string for start of substr in 16-byte vectors
2831 bind(SCAN_TO_SUBSTR);
2832 pcmpestri(vec, Address(result, 0), mode);
2833 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1
2834 subl(cnt1, stride);
2835 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2836 cmpl(cnt1, cnt2);
2837 jccb(Assembler::negative, RET_NOT_FOUND); // Left less then substring
2838 addptr(result, 16);
2839 jmpb(SCAN_TO_SUBSTR);
2840
2841 // Found a potential substr
2842 bind(FOUND_CANDIDATE);
2843 // Matched whole vector if first element matched (tmp(rcx) == 0).
2844 if (int_cnt2 == stride) {
2845 jccb(Assembler::overflow, RET_FOUND); // OF == 1
2846 } else { // int_cnt2 > 8
2847 jccb(Assembler::overflow, FOUND_SUBSTR);
2848 }
2849 // After pcmpestri tmp(rcx) contains matched element index
2850 // Compute start addr of substr
2851 lea(result, Address(result, tmp, scale1));
2852
2853 // Make sure string is still long enough
2854 subl(cnt1, tmp);
2855 cmpl(cnt1, cnt2);
2856 if (int_cnt2 == stride) {
2857 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2858 } else { // int_cnt2 > 8
2859 jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
2860 }
2861 // Left less then substring.
2862
2863 bind(RET_NOT_FOUND);
2864 movl(result, -1);
2865 jmp(EXIT);
2866
2867 if (int_cnt2 > stride) {
2868 // This code is optimized for the case when whole substring
2869 // is matched if its head is matched.
2870 bind(MATCH_SUBSTR_HEAD);
2871 pcmpestri(vec, Address(result, 0), mode);
2872 // Reload only string if does not match
2873 jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
2874
2875 Label CONT_SCAN_SUBSTR;
2876 // Compare the rest of substring (> 8 chars).
2877 bind(FOUND_SUBSTR);
2878 // First 8 chars are already matched.
2879 negptr(cnt2);
2880 addptr(cnt2, stride);
2881
2882 bind(SCAN_SUBSTR);
2883 subl(cnt1, stride);
2884 cmpl(cnt2, -stride); // Do not read beyond substring
2885 jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
2886 // Back-up strings to avoid reading beyond substring:
2887 // cnt1 = cnt1 - cnt2 + 8
2888 addl(cnt1, cnt2); // cnt2 is negative
2889 addl(cnt1, stride);
2890 movl(cnt2, stride); negptr(cnt2);
2891 bind(CONT_SCAN_SUBSTR);
2892 if (int_cnt2 < (int)G) {
2893 int tail_off1 = int_cnt2<<scale1;
2894 int tail_off2 = int_cnt2<<scale2;
2895 if (ae == StrIntrinsicNode::UL) {
2896 pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
2897 } else {
2898 movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
2899 }
2900 pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
2901 } else {
2902 // calculate index in register to avoid integer overflow (int_cnt2*2)
2903 movl(tmp, int_cnt2);
2904 addptr(tmp, cnt2);
2905 if (ae == StrIntrinsicNode::UL) {
2906 pmovzxbw(vec, Address(str2, tmp, scale2, 0));
2907 } else {
2908 movdqu(vec, Address(str2, tmp, scale2, 0));
2909 }
2910 pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
2911 }
2912 // Need to reload strings pointers if not matched whole vector
2913 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2914 addptr(cnt2, stride);
2915 jcc(Assembler::negative, SCAN_SUBSTR);
2916 // Fall through if found full substring
2917
2918 } // (int_cnt2 > 8)
2919
2920 bind(RET_FOUND);
2921 // Found result if we matched full small substring.
2922 // Compute substr offset
2923 subptr(result, str1);
2924 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2925 shrl(result, 1); // index
2926 }
2927 bind(EXIT);
2928
2929 } // string_indexofC8
2930
2931 // Small strings are loaded through stack if they cross page boundary.
2932 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
2933 Register cnt1, Register cnt2,
2934 int int_cnt2, Register result,
2935 XMMRegister vec, Register tmp,
2936 int ae) {
2937 ShortBranchVerifier sbv(this);
2938 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2939 assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2940
2941 //
2942 // int_cnt2 is length of small (< 8 chars) constant substring
2943 // or (-1) for non constant substring in which case its length
2944 // is in cnt2 register.
2945 //
2946 // Note, inline_string_indexOf() generates checks:
2947 // if (substr.count > string.count) return -1;
2948 // if (substr.count == 0) return 0;
2949 //
2950 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2951 assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
2952 // This method uses the pcmpestri instruction with bound registers
2953 // inputs:
2954 // xmm - substring
2955 // rax - substring length (elements count)
2956 // mem - scanned string
2957 // rdx - string length (elements count)
2958 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2959 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2960 // outputs:
2961 // rcx - matched index in string
2962 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2963 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2964 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2965 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2966
2967 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
2968 RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
2969 FOUND_CANDIDATE;
2970
2971 { //========================================================
2972 // We don't know where these strings are located
2973 // and we can't read beyond them. Load them through stack.
2974 Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
2975
2976 movptr(tmp, rsp); // save old SP
2977
2978 if (int_cnt2 > 0) { // small (< 8 chars) constant substring
2979 if (int_cnt2 == (1>>scale2)) { // One byte
2980 assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
2981 load_unsigned_byte(result, Address(str2, 0));
2982 movdl(vec, result); // move 32 bits
2983 } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes
2984 // Not enough header space in 32-bit VM: 12+3 = 15.
2985 movl(result, Address(str2, -1));
2986 shrl(result, 8);
2987 movdl(vec, result); // move 32 bits
2988 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char
2989 load_unsigned_short(result, Address(str2, 0));
2990 movdl(vec, result); // move 32 bits
2991 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
2992 movdl(vec, Address(str2, 0)); // move 32 bits
2993 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
2994 movq(vec, Address(str2, 0)); // move 64 bits
2995 } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7})
2996 // Array header size is 12 bytes in 32-bit VM
2997 // + 6 bytes for 3 chars == 18 bytes,
2998 // enough space to load vec and shift.
2999 assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
3000 if (ae == StrIntrinsicNode::UL) {
3001 int tail_off = int_cnt2-8;
3002 pmovzxbw(vec, Address(str2, tail_off));
3003 psrldq(vec, -2*tail_off);
3004 }
3005 else {
3006 int tail_off = int_cnt2*(1<<scale2);
3007 movdqu(vec, Address(str2, tail_off-16));
3008 psrldq(vec, 16-tail_off);
3009 }
3010 }
3011 } else { // not constant substring
3012 cmpl(cnt2, stride);
3013 jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
3014
3015 // We can read beyond string if srt+16 does not cross page boundary
3016 // since heaps are aligned and mapped by pages.
3017 assert(os::vm_page_size() < (int)G, "default page should be small");
3018 movl(result, str2); // We need only low 32 bits
3019 andl(result, ((int)os::vm_page_size()-1));
3020 cmpl(result, ((int)os::vm_page_size()-16));
3021 jccb(Assembler::belowEqual, CHECK_STR);
3022
3023 // Move small strings to stack to allow load 16 bytes into vec.
3024 subptr(rsp, 16);
3025 int stk_offset = wordSize-(1<<scale2);
3026 push(cnt2);
3027
3028 bind(COPY_SUBSTR);
3029 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
3030 load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
3031 movb(Address(rsp, cnt2, scale2, stk_offset), result);
3032 } else if (ae == StrIntrinsicNode::UU) {
3033 load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
3034 movw(Address(rsp, cnt2, scale2, stk_offset), result);
3035 }
3036 decrement(cnt2);
3037 jccb(Assembler::notZero, COPY_SUBSTR);
3038
3039 pop(cnt2);
3040 movptr(str2, rsp); // New substring address
3041 } // non constant
3042
3043 bind(CHECK_STR);
3044 cmpl(cnt1, stride);
3045 jccb(Assembler::aboveEqual, BIG_STRINGS);
3046
3047 // Check cross page boundary.
3048 movl(result, str1); // We need only low 32 bits
3049 andl(result, ((int)os::vm_page_size()-1));
3050 cmpl(result, ((int)os::vm_page_size()-16));
3051 jccb(Assembler::belowEqual, BIG_STRINGS);
3052
3053 subptr(rsp, 16);
3054 int stk_offset = -(1<<scale1);
3055 if (int_cnt2 < 0) { // not constant
3056 push(cnt2);
3057 stk_offset += wordSize;
3058 }
3059 movl(cnt2, cnt1);
3060
3061 bind(COPY_STR);
3062 if (ae == StrIntrinsicNode::LL) {
3063 load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
3064 movb(Address(rsp, cnt2, scale1, stk_offset), result);
3065 } else {
3066 load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
3067 movw(Address(rsp, cnt2, scale1, stk_offset), result);
3068 }
3069 decrement(cnt2);
3070 jccb(Assembler::notZero, COPY_STR);
3071
3072 if (int_cnt2 < 0) { // not constant
3073 pop(cnt2);
3074 }
3075 movptr(str1, rsp); // New string address
3076
3077 bind(BIG_STRINGS);
3078 // Load substring.
3079 if (int_cnt2 < 0) { // -1
3080 if (ae == StrIntrinsicNode::UL) {
3081 pmovzxbw(vec, Address(str2, 0));
3082 } else {
3083 movdqu(vec, Address(str2, 0));
3084 }
3085 push(cnt2); // substr count
3086 push(str2); // substr addr
3087 push(str1); // string addr
3088 } else {
3089 // Small (< 8 chars) constant substrings are loaded already.
3090 movl(cnt2, int_cnt2);
3091 }
3092 push(tmp); // original SP
3093
3094 } // Finished loading
3095
3096 //========================================================
3097 // Start search
3098 //
3099
3100 movptr(result, str1); // string addr
3101
3102 if (int_cnt2 < 0) { // Only for non constant substring
3103 jmpb(SCAN_TO_SUBSTR);
3104
3105 // SP saved at sp+0
3106 // String saved at sp+1*wordSize
3107 // Substr saved at sp+2*wordSize
3108 // Substr count saved at sp+3*wordSize
3109
3110 // Reload substr for rescan, this code
3111 // is executed only for large substrings (> 8 chars)
3112 bind(RELOAD_SUBSTR);
3113 movptr(str2, Address(rsp, 2*wordSize));
3114 movl(cnt2, Address(rsp, 3*wordSize));
3115 if (ae == StrIntrinsicNode::UL) {
3116 pmovzxbw(vec, Address(str2, 0));
3117 } else {
3118 movdqu(vec, Address(str2, 0));
3119 }
3120 // We came here after the beginning of the substring was
3121 // matched but the rest of it was not so we need to search
3122 // again. Start from the next element after the previous match.
3123 subptr(str1, result); // Restore counter
3124 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3125 shrl(str1, 1);
3126 }
3127 addl(cnt1, str1);
3128 decrementl(cnt1); // Shift to next element
3129 cmpl(cnt1, cnt2);
3130 jcc(Assembler::negative, RET_NOT_FOUND); // Left less then substring
3131
3132 addptr(result, (1<<scale1));
3133 } // non constant
3134
3135 // Scan string for start of substr in 16-byte vectors
3136 bind(SCAN_TO_SUBSTR);
3137 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3138 pcmpestri(vec, Address(result, 0), mode);
3139 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1
3140 subl(cnt1, stride);
3141 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
3142 cmpl(cnt1, cnt2);
3143 jccb(Assembler::negative, RET_NOT_FOUND); // Left less then substring
3144 addptr(result, 16);
3145
3146 bind(ADJUST_STR);
3147 cmpl(cnt1, stride); // Do not read beyond string
3148 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
3149 // Back-up string to avoid reading beyond string.
3150 lea(result, Address(result, cnt1, scale1, -16));
3151 movl(cnt1, stride);
3152 jmpb(SCAN_TO_SUBSTR);
3153
3154 // Found a potential substr
3155 bind(FOUND_CANDIDATE);
3156 // After pcmpestri tmp(rcx) contains matched element index
3157
3158 // Make sure string is still long enough
3159 subl(cnt1, tmp);
3160 cmpl(cnt1, cnt2);
3161 jccb(Assembler::greaterEqual, FOUND_SUBSTR);
3162 // Left less then substring.
3163
3164 bind(RET_NOT_FOUND);
3165 movl(result, -1);
3166 jmp(CLEANUP);
3167
3168 bind(FOUND_SUBSTR);
3169 // Compute start addr of substr
3170 lea(result, Address(result, tmp, scale1));
3171 if (int_cnt2 > 0) { // Constant substring
3172 // Repeat search for small substring (< 8 chars)
3173 // from new point without reloading substring.
3174 // Have to check that we don't read beyond string.
3175 cmpl(tmp, stride-int_cnt2);
3176 jccb(Assembler::greater, ADJUST_STR);
3177 // Fall through if matched whole substring.
3178 } else { // non constant
3179 assert(int_cnt2 == -1, "should be != 0");
3180
3181 addl(tmp, cnt2);
3182 // Found result if we matched whole substring.
3183 cmpl(tmp, stride);
3184 jcc(Assembler::lessEqual, RET_FOUND);
3185
3186 // Repeat search for small substring (<= 8 chars)
3187 // from new point 'str1' without reloading substring.
3188 cmpl(cnt2, stride);
3189 // Have to check that we don't read beyond string.
3190 jccb(Assembler::lessEqual, ADJUST_STR);
3191
3192 Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
3193 // Compare the rest of substring (> 8 chars).
3194 movptr(str1, result);
3195
3196 cmpl(tmp, cnt2);
3197 // First 8 chars are already matched.
3198 jccb(Assembler::equal, CHECK_NEXT);
3199
3200 bind(SCAN_SUBSTR);
3201 pcmpestri(vec, Address(str1, 0), mode);
3202 // Need to reload strings pointers if not matched whole vector
3203 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3204
3205 bind(CHECK_NEXT);
3206 subl(cnt2, stride);
3207 jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
3208 addptr(str1, 16);
3209 if (ae == StrIntrinsicNode::UL) {
3210 addptr(str2, 8);
3211 } else {
3212 addptr(str2, 16);
3213 }
3214 subl(cnt1, stride);
3215 cmpl(cnt2, stride); // Do not read beyond substring
3216 jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
3217 // Back-up strings to avoid reading beyond substring.
3218
3219 if (ae == StrIntrinsicNode::UL) {
3220 lea(str2, Address(str2, cnt2, scale2, -8));
3221 lea(str1, Address(str1, cnt2, scale1, -16));
3222 } else {
3223 lea(str2, Address(str2, cnt2, scale2, -16));
3224 lea(str1, Address(str1, cnt2, scale1, -16));
3225 }
3226 subl(cnt1, cnt2);
3227 movl(cnt2, stride);
3228 addl(cnt1, stride);
3229 bind(CONT_SCAN_SUBSTR);
3230 if (ae == StrIntrinsicNode::UL) {
3231 pmovzxbw(vec, Address(str2, 0));
3232 } else {
3233 movdqu(vec, Address(str2, 0));
3234 }
3235 jmp(SCAN_SUBSTR);
3236
3237 bind(RET_FOUND_LONG);
3238 movptr(str1, Address(rsp, wordSize));
3239 } // non constant
3240
3241 bind(RET_FOUND);
3242 // Compute substr offset
3243 subptr(result, str1);
3244 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3245 shrl(result, 1); // index
3246 }
3247 bind(CLEANUP);
3248 pop(rsp); // restore SP
3249
3250 } // string_indexof
3251
3252 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3253 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3254 ShortBranchVerifier sbv(this);
3255 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3256
3257 int stride = 8;
3258
3259 Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
3260 SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
3261 RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
3262 FOUND_SEQ_CHAR, DONE_LABEL;
3263
3264 movptr(result, str1);
3265 if (UseAVX >= 2) {
3266 cmpl(cnt1, stride);
3267 jcc(Assembler::less, SCAN_TO_CHAR);
3268 cmpl(cnt1, 2*stride);
3269 jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
3270 movdl(vec1, ch);
3271 vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
3272 vpxor(vec2, vec2);
3273 movl(tmp, cnt1);
3274 andl(tmp, 0xFFFFFFF0); //vector count (in chars)
3275 andl(cnt1,0x0000000F); //tail count (in chars)
3276
3277 bind(SCAN_TO_16_CHAR_LOOP);
3278 vmovdqu(vec3, Address(result, 0));
3279 vpcmpeqw(vec3, vec3, vec1, 1);
3280 vptest(vec2, vec3);
3281 jcc(Assembler::carryClear, FOUND_CHAR);
3282 addptr(result, 32);
3283 subl(tmp, 2*stride);
3284 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
3285 jmp(SCAN_TO_8_CHAR);
3286 bind(SCAN_TO_8_CHAR_INIT);
3287 movdl(vec1, ch);
3288 pshuflw(vec1, vec1, 0x00);
3289 pshufd(vec1, vec1, 0);
3290 pxor(vec2, vec2);
3291 }
3292 bind(SCAN_TO_8_CHAR);
3293 cmpl(cnt1, stride);
3294 jcc(Assembler::less, SCAN_TO_CHAR);
3295 if (UseAVX < 2) {
3296 movdl(vec1, ch);
3297 pshuflw(vec1, vec1, 0x00);
3298 pshufd(vec1, vec1, 0);
3299 pxor(vec2, vec2);
3300 }
3301 movl(tmp, cnt1);
3302 andl(tmp, 0xFFFFFFF8); //vector count (in chars)
3303 andl(cnt1,0x00000007); //tail count (in chars)
3304
3305 bind(SCAN_TO_8_CHAR_LOOP);
3306 movdqu(vec3, Address(result, 0));
3307 pcmpeqw(vec3, vec1);
3308 ptest(vec2, vec3);
3309 jcc(Assembler::carryClear, FOUND_CHAR);
3310 addptr(result, 16);
3311 subl(tmp, stride);
3312 jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
3313 bind(SCAN_TO_CHAR);
3314 testl(cnt1, cnt1);
3315 jcc(Assembler::zero, RET_NOT_FOUND);
3316 bind(SCAN_TO_CHAR_LOOP);
3317 load_unsigned_short(tmp, Address(result, 0));
3318 cmpl(ch, tmp);
3319 jccb(Assembler::equal, FOUND_SEQ_CHAR);
3320 addptr(result, 2);
3321 subl(cnt1, 1);
3322 jccb(Assembler::zero, RET_NOT_FOUND);
3323 jmp(SCAN_TO_CHAR_LOOP);
3324
3325 bind(RET_NOT_FOUND);
3326 movl(result, -1);
3327 jmpb(DONE_LABEL);
3328
3329 bind(FOUND_CHAR);
3330 if (UseAVX >= 2) {
3331 vpmovmskb(tmp, vec3);
3332 } else {
3333 pmovmskb(tmp, vec3);
3334 }
3335 bsfl(ch, tmp);
3336 addptr(result, ch);
3337
3338 bind(FOUND_SEQ_CHAR);
3339 subptr(result, str1);
3340 shrl(result, 1);
3341
3342 bind(DONE_LABEL);
3343 } // string_indexof_char
3344
3345 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3346 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3347 ShortBranchVerifier sbv(this);
3348 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3349
3350 int stride = 16;
3351
3352 Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
3353 SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
3354 RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
3355 FOUND_SEQ_CHAR, DONE_LABEL;
3356
3357 movptr(result, str1);
3358 if (UseAVX >= 2) {
3359 cmpl(cnt1, stride);
3360 jcc(Assembler::less, SCAN_TO_CHAR_INIT);
3361 cmpl(cnt1, stride*2);
3362 jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
3363 movdl(vec1, ch);
3364 vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
3365 vpxor(vec2, vec2);
3366 movl(tmp, cnt1);
3367 andl(tmp, 0xFFFFFFE0); //vector count (in chars)
3368 andl(cnt1,0x0000001F); //tail count (in chars)
3369
3370 bind(SCAN_TO_32_CHAR_LOOP);
3371 vmovdqu(vec3, Address(result, 0));
3372 vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
3373 vptest(vec2, vec3);
3374 jcc(Assembler::carryClear, FOUND_CHAR);
3375 addptr(result, 32);
3376 subl(tmp, stride*2);
3377 jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
3378 jmp(SCAN_TO_16_CHAR);
3379
3380 bind(SCAN_TO_16_CHAR_INIT);
3381 movdl(vec1, ch);
3382 pxor(vec2, vec2);
3383 pshufb(vec1, vec2);
3384 }
3385
3386 bind(SCAN_TO_16_CHAR);
3387 cmpl(cnt1, stride);
3388 jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left
3389 if (UseAVX < 2) {
3390 movdl(vec1, ch);
3391 pxor(vec2, vec2);
3392 pshufb(vec1, vec2);
3393 }
3394 movl(tmp, cnt1);
3395 andl(tmp, 0xFFFFFFF0); //vector count (in bytes)
3396 andl(cnt1,0x0000000F); //tail count (in bytes)
3397
3398 bind(SCAN_TO_16_CHAR_LOOP);
3399 movdqu(vec3, Address(result, 0));
3400 pcmpeqb(vec3, vec1);
3401 ptest(vec2, vec3);
3402 jcc(Assembler::carryClear, FOUND_CHAR);
3403 addptr(result, 16);
3404 subl(tmp, stride);
3405 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items...
3406
3407 bind(SCAN_TO_CHAR_INIT);
3408 testl(cnt1, cnt1);
3409 jcc(Assembler::zero, RET_NOT_FOUND);
3410 bind(SCAN_TO_CHAR_LOOP);
3411 load_unsigned_byte(tmp, Address(result, 0));
3412 cmpl(ch, tmp);
3413 jccb(Assembler::equal, FOUND_SEQ_CHAR);
3414 addptr(result, 1);
3415 subl(cnt1, 1);
3416 jccb(Assembler::zero, RET_NOT_FOUND);
3417 jmp(SCAN_TO_CHAR_LOOP);
3418
3419 bind(RET_NOT_FOUND);
3420 movl(result, -1);
3421 jmpb(DONE_LABEL);
3422
3423 bind(FOUND_CHAR);
3424 if (UseAVX >= 2) {
3425 vpmovmskb(tmp, vec3);
3426 } else {
3427 pmovmskb(tmp, vec3);
3428 }
3429 bsfl(ch, tmp);
3430 addptr(result, ch);
3431
3432 bind(FOUND_SEQ_CHAR);
3433 subptr(result, str1);
3434
3435 bind(DONE_LABEL);
3436 } // stringL_indexof_char
3437
3438 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
3439 switch (eltype) {
3440 case T_BOOLEAN: return sizeof(jboolean);
3441 case T_BYTE: return sizeof(jbyte);
3442 case T_SHORT: return sizeof(jshort);
3443 case T_CHAR: return sizeof(jchar);
3444 case T_INT: return sizeof(jint);
3445 default:
3446 ShouldNotReachHere();
3447 return -1;
3448 }
3449 }
3450
3451 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
3452 switch (eltype) {
3453 // T_BOOLEAN used as surrogate for unsigned byte
3454 case T_BOOLEAN: movzbl(dst, src); break;
3455 case T_BYTE: movsbl(dst, src); break;
3456 case T_SHORT: movswl(dst, src); break;
3457 case T_CHAR: movzwl(dst, src); break;
3458 case T_INT: movl(dst, src); break;
3459 default:
3460 ShouldNotReachHere();
3461 }
3462 }
3463
3464 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) {
3465 load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3466 }
3467
3468 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) {
3469 load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3470 }
3471
3472 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) {
3473 const int vlen = Assembler::AVX_256bit;
3474 switch (eltype) {
3475 case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT); break;
3476 case T_BYTE: vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT); break;
3477 case T_SHORT: vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3478 case T_CHAR: vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3479 case T_INT:
3480 // do nothing
3481 break;
3482 default:
3483 ShouldNotReachHere();
3484 }
3485 }
3486
3487 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result,
3488 Register index, Register tmp2, Register tmp3, XMMRegister vnext,
3489 XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
3490 XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
3491 XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
3492 BasicType eltype) {
3493 ShortBranchVerifier sbv(this);
3494 assert(UseAVX >= 2, "AVX2 intrinsics are required");
3495 assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3);
3496 assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3);
3497
3498 Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN,
3499 SHORT_UNROLLED_LOOP_EXIT,
3500 UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME,
3501 UNROLLED_VECTOR_LOOP_BEGIN,
3502 END;
3503 switch (eltype) {
3504 case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
3505 case T_CHAR: BLOCK_COMMENT("arrays_hashcode(char) {"); break;
3506 case T_BYTE: BLOCK_COMMENT("arrays_hashcode(byte) {"); break;
3507 case T_SHORT: BLOCK_COMMENT("arrays_hashcode(short) {"); break;
3508 case T_INT: BLOCK_COMMENT("arrays_hashcode(int) {"); break;
3509 default: BLOCK_COMMENT("arrays_hashcode {"); break;
3510 }
3511
3512 // For "renaming" for readibility of the code
3513 const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
3514 vresult[] = { vresult0, vresult1, vresult2, vresult3 },
3515 vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };
3516
3517 const int elsize = arrays_hashcode_elsize(eltype);
3518
3519 /*
3520 if (cnt1 >= 2) {
3521 if (cnt1 >= 32) {
3522 UNROLLED VECTOR LOOP
3523 }
3524 UNROLLED SCALAR LOOP
3525 }
3526 SINGLE SCALAR
3527 */
3528
3529 cmpl(cnt1, 32);
3530 jcc(Assembler::less, SHORT_UNROLLED_BEGIN);
3531
3532 // cnt1 >= 32 && generate_vectorized_loop
3533 xorl(index, index);
3534
3535 // vresult = IntVector.zero(I256);
3536 for (int idx = 0; idx < 4; idx++) {
3537 vpxor(vresult[idx], vresult[idx]);
3538 }
3539 // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
3540 Register bound = tmp2;
3541 Register next = tmp3;
3542 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
3543 movl(next, Address(tmp2, 0));
3544 movdl(vnext, next);
3545 vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);
3546
3547 // index = 0;
3548 // bound = cnt1 & ~(32 - 1);
3549 movl(bound, cnt1);
3550 andl(bound, ~(32 - 1));
3551 // for (; index < bound; index += 32) {
3552 bind(UNROLLED_VECTOR_LOOP_BEGIN);
3553 // result *= next;
3554 imull(result, next);
3555 // loop fission to upfront the cost of fetching from memory, OOO execution
3556 // can then hopefully do a better job of prefetching
3557 for (int idx = 0; idx < 4; idx++) {
3558 arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
3559 }
3560 // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
3561 for (int idx = 0; idx < 4; idx++) {
3562 vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
3563 arrays_hashcode_elvcast(vtmp[idx], eltype);
3564 vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
3565 }
3566 // index += 32;
3567 addl(index, 32);
3568 // index < bound;
3569 cmpl(index, bound);
3570 jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
3571 // }
3572
3573 lea(ary1, Address(ary1, bound, Address::times(elsize)));
3574 subl(cnt1, bound);
3575 // release bound
3576
3577 // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
3578 for (int idx = 0; idx < 4; idx++) {
3579 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint))));
3580 arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT);
3581 vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
3582 }
3583 // result += vresult.reduceLanes(ADD);
3584 for (int idx = 0; idx < 4; idx++) {
3585 reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
3586 }
3587
3588 // } else if (cnt1 < 32) {
3589
3590 bind(SHORT_UNROLLED_BEGIN);
3591 // int i = 1;
3592 movl(index, 1);
3593 cmpl(index, cnt1);
3594 jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);
3595
3596 // for (; i < cnt1 ; i += 2) {
3597 bind(SHORT_UNROLLED_LOOP_BEGIN);
3598 movl(tmp3, 961);
3599 imull(result, tmp3);
3600 arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3601 movl(tmp3, tmp2);
3602 shll(tmp3, 5);
3603 subl(tmp3, tmp2);
3604 addl(result, tmp3);
3605 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype);
3606 addl(result, tmp3);
3607 addl(index, 2);
3608 cmpl(index, cnt1);
3609 jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN);
3610
3611 // }
3612 // if (i >= cnt1) {
3613 bind(SHORT_UNROLLED_LOOP_EXIT);
3614 jccb(Assembler::greater, END);
3615 movl(tmp2, result);
3616 shll(result, 5);
3617 subl(result, tmp2);
3618 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3619 addl(result, tmp3);
3620 // }
3621 bind(END);
3622
3623 BLOCK_COMMENT("} // arrays_hashcode");
3624
3625 } // arrays_hashcode
3626
3627 // helper function for string_compare
3628 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3629 Address::ScaleFactor scale, Address::ScaleFactor scale1,
3630 Address::ScaleFactor scale2, Register index, int ae) {
3631 if (ae == StrIntrinsicNode::LL) {
3632 load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3633 load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3634 } else if (ae == StrIntrinsicNode::UU) {
3635 load_unsigned_short(elem1, Address(str1, index, scale, 0));
3636 load_unsigned_short(elem2, Address(str2, index, scale, 0));
3637 } else {
3638 load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3639 load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3640 }
3641 }
3642
3643 // Compare strings, used for char[] and byte[].
3644 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3645 Register cnt1, Register cnt2, Register result,
3646 XMMRegister vec1, int ae, KRegister mask) {
3647 ShortBranchVerifier sbv(this);
3648 Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
3649 Label COMPARE_WIDE_VECTORS_LOOP_FAILED; // used only AVX3
3650 int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3651 int stride2x2 = 0x40;
3652 Address::ScaleFactor scale = Address::no_scale;
3653 Address::ScaleFactor scale1 = Address::no_scale;
3654 Address::ScaleFactor scale2 = Address::no_scale;
3655
3656 if (ae != StrIntrinsicNode::LL) {
3657 stride2x2 = 0x20;
3658 }
3659
3660 if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3661 shrl(cnt2, 1);
3662 }
3663 // Compute the minimum of the string lengths and the
3664 // difference of the string lengths (stack).
3665 // Do the conditional move stuff
3666 movl(result, cnt1);
3667 subl(cnt1, cnt2);
3668 push(cnt1);
3669 cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2)
3670
3671 // Is the minimum length zero?
3672 testl(cnt2, cnt2);
3673 jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3674 if (ae == StrIntrinsicNode::LL) {
3675 // Load first bytes
3676 load_unsigned_byte(result, Address(str1, 0)); // result = str1[0]
3677 load_unsigned_byte(cnt1, Address(str2, 0)); // cnt1 = str2[0]
3678 } else if (ae == StrIntrinsicNode::UU) {
3679 // Load first characters
3680 load_unsigned_short(result, Address(str1, 0));
3681 load_unsigned_short(cnt1, Address(str2, 0));
3682 } else {
3683 load_unsigned_byte(result, Address(str1, 0));
3684 load_unsigned_short(cnt1, Address(str2, 0));
3685 }
3686 subl(result, cnt1);
3687 jcc(Assembler::notZero, POP_LABEL);
3688
3689 if (ae == StrIntrinsicNode::UU) {
3690 // Divide length by 2 to get number of chars
3691 shrl(cnt2, 1);
3692 }
3693 cmpl(cnt2, 1);
3694 jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3695
3696 // Check if the strings start at the same location and setup scale and stride
3697 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3698 cmpptr(str1, str2);
3699 jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3700 if (ae == StrIntrinsicNode::LL) {
3701 scale = Address::times_1;
3702 stride = 16;
3703 } else {
3704 scale = Address::times_2;
3705 stride = 8;
3706 }
3707 } else {
3708 scale1 = Address::times_1;
3709 scale2 = Address::times_2;
3710 // scale not used
3711 stride = 8;
3712 }
3713
3714 if (UseAVX >= 2 && UseSSE42Intrinsics) {
3715 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3716 Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3717 Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3718 Label COMPARE_TAIL_LONG;
3719 Label COMPARE_WIDE_VECTORS_LOOP_AVX3; // used only AVX3
3720
3721 int pcmpmask = 0x19;
3722 if (ae == StrIntrinsicNode::LL) {
3723 pcmpmask &= ~0x01;
3724 }
3725
3726 // Setup to compare 16-chars (32-bytes) vectors,
3727 // start from first character again because it has aligned address.
3728 if (ae == StrIntrinsicNode::LL) {
3729 stride2 = 32;
3730 } else {
3731 stride2 = 16;
3732 }
3733 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3734 adr_stride = stride << scale;
3735 } else {
3736 adr_stride1 = 8; //stride << scale1;
3737 adr_stride2 = 16; //stride << scale2;
3738 }
3739
3740 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3741 // rax and rdx are used by pcmpestri as elements counters
3742 movl(result, cnt2);
3743 andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count
3744 jcc(Assembler::zero, COMPARE_TAIL_LONG);
3745
3746 // fast path : compare first 2 8-char vectors.
3747 bind(COMPARE_16_CHARS);
3748 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3749 movdqu(vec1, Address(str1, 0));
3750 } else {
3751 pmovzxbw(vec1, Address(str1, 0));
3752 }
3753 pcmpestri(vec1, Address(str2, 0), pcmpmask);
3754 jccb(Assembler::below, COMPARE_INDEX_CHAR);
3755
3756 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3757 movdqu(vec1, Address(str1, adr_stride));
3758 pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3759 } else {
3760 pmovzxbw(vec1, Address(str1, adr_stride1));
3761 pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3762 }
3763 jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3764 addl(cnt1, stride);
3765
3766 // Compare the characters at index in cnt1
3767 bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3768 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3769 subl(result, cnt2);
3770 jmp(POP_LABEL);
3771
3772 // Setup the registers to start vector comparison loop
3773 bind(COMPARE_WIDE_VECTORS);
3774 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3775 lea(str1, Address(str1, result, scale));
3776 lea(str2, Address(str2, result, scale));
3777 } else {
3778 lea(str1, Address(str1, result, scale1));
3779 lea(str2, Address(str2, result, scale2));
3780 }
3781 subl(result, stride2);
3782 subl(cnt2, stride2);
3783 jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3784 negptr(result);
3785
3786 // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
3787 bind(COMPARE_WIDE_VECTORS_LOOP);
3788
3789 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3790 cmpl(cnt2, stride2x2);
3791 jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3792 testl(cnt2, stride2x2-1); // cnt2 holds the vector count
3793 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // means we cannot subtract by 0x40
3794
3795 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3796 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3797 evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
3798 evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3799 } else {
3800 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
3801 evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3802 }
3803 kortestql(mask, mask);
3804 jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare
3805 addptr(result, stride2x2); // update since we already compared at this addr
3806 subl(cnt2, stride2x2); // and sub the size too
3807 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3808
3809 vpxor(vec1, vec1);
3810 jmpb(COMPARE_WIDE_TAIL);
3811 }//if (VM_Version::supports_avx512vlbw())
3812
3813 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3814 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3815 vmovdqu(vec1, Address(str1, result, scale));
3816 vpxor(vec1, Address(str2, result, scale));
3817 } else {
3818 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3819 vpxor(vec1, Address(str2, result, scale2));
3820 }
3821 vptest(vec1, vec1);
3822 jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3823 addptr(result, stride2);
3824 subl(cnt2, stride2);
3825 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3826 // clean upper bits of YMM registers
3827 vpxor(vec1, vec1);
3828
3829 // compare wide vectors tail
3830 bind(COMPARE_WIDE_TAIL);
3831 testptr(result, result);
3832 jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3833
3834 movl(result, stride2);
3835 movl(cnt2, result);
3836 negptr(result);
3837 jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3838
3839 // Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors.
3840 bind(VECTOR_NOT_EQUAL);
3841 // clean upper bits of YMM registers
3842 vpxor(vec1, vec1);
3843 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3844 lea(str1, Address(str1, result, scale));
3845 lea(str2, Address(str2, result, scale));
3846 } else {
3847 lea(str1, Address(str1, result, scale1));
3848 lea(str2, Address(str2, result, scale2));
3849 }
3850 jmp(COMPARE_16_CHARS);
3851
3852 // Compare tail chars, length between 1 to 15 chars
3853 bind(COMPARE_TAIL_LONG);
3854 movl(cnt2, result);
3855 cmpl(cnt2, stride);
3856 jcc(Assembler::less, COMPARE_SMALL_STR);
3857
3858 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3859 movdqu(vec1, Address(str1, 0));
3860 } else {
3861 pmovzxbw(vec1, Address(str1, 0));
3862 }
3863 pcmpestri(vec1, Address(str2, 0), pcmpmask);
3864 jcc(Assembler::below, COMPARE_INDEX_CHAR);
3865 subptr(cnt2, stride);
3866 jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3867 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3868 lea(str1, Address(str1, result, scale));
3869 lea(str2, Address(str2, result, scale));
3870 } else {
3871 lea(str1, Address(str1, result, scale1));
3872 lea(str2, Address(str2, result, scale2));
3873 }
3874 negptr(cnt2);
3875 jmpb(WHILE_HEAD_LABEL);
3876
3877 bind(COMPARE_SMALL_STR);
3878 } else if (UseSSE42Intrinsics) {
3879 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3880 int pcmpmask = 0x19;
3881 // Setup to compare 8-char (16-byte) vectors,
3882 // start from first character again because it has aligned address.
3883 movl(result, cnt2);
3884 andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count
3885 if (ae == StrIntrinsicNode::LL) {
3886 pcmpmask &= ~0x01;
3887 }
3888 jcc(Assembler::zero, COMPARE_TAIL);
3889 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3890 lea(str1, Address(str1, result, scale));
3891 lea(str2, Address(str2, result, scale));
3892 } else {
3893 lea(str1, Address(str1, result, scale1));
3894 lea(str2, Address(str2, result, scale2));
3895 }
3896 negptr(result);
3897
3898 // pcmpestri
3899 // inputs:
3900 // vec1- substring
3901 // rax - negative string length (elements count)
3902 // mem - scanned string
3903 // rdx - string length (elements count)
3904 // pcmpmask - cmp mode: 11000 (string compare with negated result)
3905 // + 00 (unsigned bytes) or + 01 (unsigned shorts)
3906 // outputs:
3907 // rcx - first mismatched element index
3908 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3909
3910 bind(COMPARE_WIDE_VECTORS);
3911 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3912 movdqu(vec1, Address(str1, result, scale));
3913 pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3914 } else {
3915 pmovzxbw(vec1, Address(str1, result, scale1));
3916 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3917 }
3918 // After pcmpestri cnt1(rcx) contains mismatched element index
3919
3920 jccb(Assembler::below, VECTOR_NOT_EQUAL); // CF==1
3921 addptr(result, stride);
3922 subptr(cnt2, stride);
3923 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3924
3925 // compare wide vectors tail
3926 testptr(result, result);
3927 jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3928
3929 movl(cnt2, stride);
3930 movl(result, stride);
3931 negptr(result);
3932 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3933 movdqu(vec1, Address(str1, result, scale));
3934 pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3935 } else {
3936 pmovzxbw(vec1, Address(str1, result, scale1));
3937 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3938 }
3939 jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
3940
3941 // Mismatched characters in the vectors
3942 bind(VECTOR_NOT_EQUAL);
3943 addptr(cnt1, result);
3944 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3945 subl(result, cnt2);
3946 jmpb(POP_LABEL);
3947
3948 bind(COMPARE_TAIL); // limit is zero
3949 movl(cnt2, result);
3950 // Fallthru to tail compare
3951 }
3952 // Shift str2 and str1 to the end of the arrays, negate min
3953 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3954 lea(str1, Address(str1, cnt2, scale));
3955 lea(str2, Address(str2, cnt2, scale));
3956 } else {
3957 lea(str1, Address(str1, cnt2, scale1));
3958 lea(str2, Address(str2, cnt2, scale2));
3959 }
3960 decrementl(cnt2); // first character was compared already
3961 negptr(cnt2);
3962
3963 // Compare the rest of the elements
3964 bind(WHILE_HEAD_LABEL);
3965 load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
3966 subl(result, cnt1);
3967 jccb(Assembler::notZero, POP_LABEL);
3968 increment(cnt2);
3969 jccb(Assembler::notZero, WHILE_HEAD_LABEL);
3970
3971 // Strings are equal up to min length. Return the length difference.
3972 bind(LENGTH_DIFF_LABEL);
3973 pop(result);
3974 if (ae == StrIntrinsicNode::UU) {
3975 // Divide diff by 2 to get number of chars
3976 sarl(result, 1);
3977 }
3978 jmpb(DONE_LABEL);
3979
3980 if (VM_Version::supports_avx512vlbw()) {
3981
3982 bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
3983
3984 kmovql(cnt1, mask);
3985 notq(cnt1);
3986 bsfq(cnt2, cnt1);
3987 if (ae != StrIntrinsicNode::LL) {
3988 // Divide diff by 2 to get number of chars
3989 sarl(cnt2, 1);
3990 }
3991 addq(result, cnt2);
3992 if (ae == StrIntrinsicNode::LL) {
3993 load_unsigned_byte(cnt1, Address(str2, result));
3994 load_unsigned_byte(result, Address(str1, result));
3995 } else if (ae == StrIntrinsicNode::UU) {
3996 load_unsigned_short(cnt1, Address(str2, result, scale));
3997 load_unsigned_short(result, Address(str1, result, scale));
3998 } else {
3999 load_unsigned_short(cnt1, Address(str2, result, scale2));
4000 load_unsigned_byte(result, Address(str1, result, scale1));
4001 }
4002 subl(result, cnt1);
4003 jmpb(POP_LABEL);
4004 }//if (VM_Version::supports_avx512vlbw())
4005
4006 // Discard the stored length difference
4007 bind(POP_LABEL);
4008 pop(cnt1);
4009
4010 // That's it
4011 bind(DONE_LABEL);
4012 if(ae == StrIntrinsicNode::UL) {
4013 negl(result);
4014 }
4015
4016 }
4017
4018 // Search for Non-ASCII character (Negative byte value) in a byte array,
4019 // return the index of the first such character, otherwise the length
4020 // of the array segment searched.
4021 // ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
4022 // @IntrinsicCandidate
4023 // public static int countPositives(byte[] ba, int off, int len) {
4024 // for (int i = off; i < off + len; i++) {
4025 // if (ba[i] < 0) {
4026 // return i - off;
4027 // }
4028 // }
4029 // return len;
4030 // }
4031 void C2_MacroAssembler::count_positives(Register ary1, Register len,
4032 Register result, Register tmp1,
4033 XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
4034 // rsi: byte array
4035 // rcx: len
4036 // rax: result
4037 ShortBranchVerifier sbv(this);
4038 assert_different_registers(ary1, len, result, tmp1);
4039 assert_different_registers(vec1, vec2);
4040 Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
4041
4042 movl(result, len); // copy
4043 // len == 0
4044 testl(len, len);
4045 jcc(Assembler::zero, DONE);
4046
4047 if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
4048 VM_Version::supports_avx512vlbw() &&
4049 VM_Version::supports_bmi2()) {
4050
4051 Label test_64_loop, test_tail, BREAK_LOOP;
4052 movl(tmp1, len);
4053 vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
4054
4055 andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F
4056 andl(len, 0xffffffc0); // vector count (in chars)
4057 jccb(Assembler::zero, test_tail);
4058
4059 lea(ary1, Address(ary1, len, Address::times_1));
4060 negptr(len);
4061
4062 bind(test_64_loop);
4063 // Check whether our 64 elements of size byte contain negatives
4064 evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
4065 kortestql(mask1, mask1);
4066 jcc(Assembler::notZero, BREAK_LOOP);
4067
4068 addptr(len, 64);
4069 jccb(Assembler::notZero, test_64_loop);
4070
4071 bind(test_tail);
4072 // bail out when there is nothing to be done
4073 testl(tmp1, -1);
4074 jcc(Assembler::zero, DONE);
4075
4076
4077 // check the tail for absense of negatives
4078 // ~(~0 << len) applied up to two times (for 32-bit scenario)
4079 {
4080 Register tmp3_aliased = len;
4081 mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
4082 shlxq(tmp3_aliased, tmp3_aliased, tmp1);
4083 notq(tmp3_aliased);
4084 kmovql(mask2, tmp3_aliased);
4085 }
4086
4087 evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
4088 ktestq(mask1, mask2);
4089 jcc(Assembler::zero, DONE);
4090
4091 // do a full check for negative registers in the tail
4092 movl(len, tmp1); // tmp1 holds low 6-bit from original len;
4093 // ary1 already pointing to the right place
4094 jmpb(TAIL_START);
4095
4096 bind(BREAK_LOOP);
4097 // At least one byte in the last 64 byte block was negative.
4098 // Set up to look at the last 64 bytes as if they were a tail
4099 lea(ary1, Address(ary1, len, Address::times_1));
4100 addptr(result, len);
4101 // Ignore the very last byte: if all others are positive,
4102 // it must be negative, so we can skip right to the 2+1 byte
4103 // end comparison at this point
4104 orl(result, 63);
4105 movl(len, 63);
4106 // Fallthru to tail compare
4107 } else {
4108
4109 if (UseAVX >= 2) {
4110 // With AVX2, use 32-byte vector compare
4111 Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4112
4113 // Compare 32-byte vectors
4114 testl(len, 0xffffffe0); // vector count (in bytes)
4115 jccb(Assembler::zero, TAIL_START);
4116
4117 andl(len, 0xffffffe0);
4118 lea(ary1, Address(ary1, len, Address::times_1));
4119 negptr(len);
4120
4121 movl(tmp1, 0x80808080); // create mask to test for Unicode chars in vector
4122 movdl(vec2, tmp1);
4123 vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
4124
4125 bind(COMPARE_WIDE_VECTORS);
4126 vmovdqu(vec1, Address(ary1, len, Address::times_1));
4127 vptest(vec1, vec2);
4128 jccb(Assembler::notZero, BREAK_LOOP);
4129 addptr(len, 32);
4130 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4131
4132 testl(result, 0x0000001f); // any bytes remaining?
4133 jcc(Assembler::zero, DONE);
4134
4135 // Quick test using the already prepared vector mask
4136 movl(len, result);
4137 andl(len, 0x0000001f);
4138 vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
4139 vptest(vec1, vec2);
4140 jcc(Assembler::zero, DONE);
4141 // There are zeros, jump to the tail to determine exactly where
4142 jmpb(TAIL_START);
4143
4144 bind(BREAK_LOOP);
4145 // At least one byte in the last 32-byte vector is negative.
4146 // Set up to look at the last 32 bytes as if they were a tail
4147 lea(ary1, Address(ary1, len, Address::times_1));
4148 addptr(result, len);
4149 // Ignore the very last byte: if all others are positive,
4150 // it must be negative, so we can skip right to the 2+1 byte
4151 // end comparison at this point
4152 orl(result, 31);
4153 movl(len, 31);
4154 // Fallthru to tail compare
4155 } else if (UseSSE42Intrinsics) {
4156 // With SSE4.2, use double quad vector compare
4157 Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4158
4159 // Compare 16-byte vectors
4160 testl(len, 0xfffffff0); // vector count (in bytes)
4161 jcc(Assembler::zero, TAIL_START);
4162
4163 andl(len, 0xfffffff0);
4164 lea(ary1, Address(ary1, len, Address::times_1));
4165 negptr(len);
4166
4167 movl(tmp1, 0x80808080);
4168 movdl(vec2, tmp1);
4169 pshufd(vec2, vec2, 0);
4170
4171 bind(COMPARE_WIDE_VECTORS);
4172 movdqu(vec1, Address(ary1, len, Address::times_1));
4173 ptest(vec1, vec2);
4174 jccb(Assembler::notZero, BREAK_LOOP);
4175 addptr(len, 16);
4176 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4177
4178 testl(result, 0x0000000f); // len is zero, any bytes remaining?
4179 jcc(Assembler::zero, DONE);
4180
4181 // Quick test using the already prepared vector mask
4182 movl(len, result);
4183 andl(len, 0x0000000f); // tail count (in bytes)
4184 movdqu(vec1, Address(ary1, len, Address::times_1, -16));
4185 ptest(vec1, vec2);
4186 jcc(Assembler::zero, DONE);
4187 jmpb(TAIL_START);
4188
4189 bind(BREAK_LOOP);
4190 // At least one byte in the last 16-byte vector is negative.
4191 // Set up and look at the last 16 bytes as if they were a tail
4192 lea(ary1, Address(ary1, len, Address::times_1));
4193 addptr(result, len);
4194 // Ignore the very last byte: if all others are positive,
4195 // it must be negative, so we can skip right to the 2+1 byte
4196 // end comparison at this point
4197 orl(result, 15);
4198 movl(len, 15);
4199 // Fallthru to tail compare
4200 }
4201 }
4202
4203 bind(TAIL_START);
4204 // Compare 4-byte vectors
4205 andl(len, 0xfffffffc); // vector count (in bytes)
4206 jccb(Assembler::zero, COMPARE_CHAR);
4207
4208 lea(ary1, Address(ary1, len, Address::times_1));
4209 negptr(len);
4210
4211 bind(COMPARE_VECTORS);
4212 movl(tmp1, Address(ary1, len, Address::times_1));
4213 andl(tmp1, 0x80808080);
4214 jccb(Assembler::notZero, TAIL_ADJUST);
4215 addptr(len, 4);
4216 jccb(Assembler::notZero, COMPARE_VECTORS);
4217
4218 // Compare trailing char (final 2-3 bytes), if any
4219 bind(COMPARE_CHAR);
4220
4221 testl(result, 0x2); // tail char
4222 jccb(Assembler::zero, COMPARE_BYTE);
4223 load_unsigned_short(tmp1, Address(ary1, 0));
4224 andl(tmp1, 0x00008080);
4225 jccb(Assembler::notZero, CHAR_ADJUST);
4226 lea(ary1, Address(ary1, 2));
4227
4228 bind(COMPARE_BYTE);
4229 testl(result, 0x1); // tail byte
4230 jccb(Assembler::zero, DONE);
4231 load_unsigned_byte(tmp1, Address(ary1, 0));
4232 testl(tmp1, 0x00000080);
4233 jccb(Assembler::zero, DONE);
4234 subptr(result, 1);
4235 jmpb(DONE);
4236
4237 bind(TAIL_ADJUST);
4238 // there are negative bits in the last 4 byte block.
4239 // Adjust result and check the next three bytes
4240 addptr(result, len);
4241 orl(result, 3);
4242 lea(ary1, Address(ary1, len, Address::times_1));
4243 jmpb(COMPARE_CHAR);
4244
4245 bind(CHAR_ADJUST);
4246 // We are looking at a char + optional byte tail, and found that one
4247 // of the bytes in the char is negative. Adjust the result, check the
4248 // first byte and readjust if needed.
4249 andl(result, 0xfffffffc);
4250 testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
4251 jccb(Assembler::notZero, DONE);
4252 addptr(result, 1);
4253
4254 // That's it
4255 bind(DONE);
4256 if (UseAVX >= 2) {
4257 // clean upper bits of YMM registers
4258 vpxor(vec1, vec1);
4259 vpxor(vec2, vec2);
4260 }
4261 }
4262
4263 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
4264 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
4265 Register limit, Register result, Register chr,
4266 XMMRegister vec1, XMMRegister vec2, bool is_char,
4267 KRegister mask, bool expand_ary2) {
4268 // for expand_ary2, limit is the (smaller) size of the second array.
4269 ShortBranchVerifier sbv(this);
4270 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
4271
4272 assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)),
4273 "Expansion only implemented for AVX2");
4274
4275 int length_offset = arrayOopDesc::length_offset_in_bytes();
4276 int base_offset = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
4277
4278 Address::ScaleFactor scaleFactor = expand_ary2 ? Address::times_2 : Address::times_1;
4279 int scaleIncr = expand_ary2 ? 8 : 16;
4280
4281 if (is_array_equ) {
4282 // Check the input args
4283 cmpoop(ary1, ary2);
4284 jcc(Assembler::equal, TRUE_LABEL);
4285
4286 // Need additional checks for arrays_equals.
4287 testptr(ary1, ary1);
4288 jcc(Assembler::zero, FALSE_LABEL);
4289 testptr(ary2, ary2);
4290 jcc(Assembler::zero, FALSE_LABEL);
4291
4292 // Check the lengths
4293 movl(limit, Address(ary1, length_offset));
4294 cmpl(limit, Address(ary2, length_offset));
4295 jcc(Assembler::notEqual, FALSE_LABEL);
4296 }
4297
4298 // count == 0
4299 testl(limit, limit);
4300 jcc(Assembler::zero, TRUE_LABEL);
4301
4302 if (is_array_equ) {
4303 // Load array address
4304 lea(ary1, Address(ary1, base_offset));
4305 lea(ary2, Address(ary2, base_offset));
4306 }
4307
4308 if (is_array_equ && is_char) {
4309 // arrays_equals when used for char[].
4310 shll(limit, 1); // byte count != 0
4311 }
4312 movl(result, limit); // copy
4313
4314 if (UseAVX >= 2) {
4315 // With AVX2, use 32-byte vector compare
4316 Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16;
4317
4318 // Compare 32-byte vectors
4319 if (expand_ary2) {
4320 andl(result, 0x0000000f); // tail count (in bytes)
4321 andl(limit, 0xfffffff0); // vector count (in bytes)
4322 jcc(Assembler::zero, COMPARE_TAIL);
4323 } else {
4324 andl(result, 0x0000001f); // tail count (in bytes)
4325 andl(limit, 0xffffffe0); // vector count (in bytes)
4326 jcc(Assembler::zero, COMPARE_TAIL_16);
4327 }
4328
4329 lea(ary1, Address(ary1, limit, scaleFactor));
4330 lea(ary2, Address(ary2, limit, Address::times_1));
4331 negptr(limit);
4332
4333 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
4334 Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
4335
4336 cmpl(limit, -64);
4337 jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
4338
4339 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
4340
4341 evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
4342 evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
4343 kortestql(mask, mask);
4344 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare
4345 addptr(limit, 64); // update since we already compared at this addr
4346 cmpl(limit, -64);
4347 jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
4348
4349 // At this point we may still need to compare -limit+result bytes.
4350 // We could execute the next two instruction and just continue via non-wide path:
4351 // cmpl(limit, 0);
4352 // jcc(Assembler::equal, COMPARE_TAIL); // true
4353 // But since we stopped at the points ary{1,2}+limit which are
4354 // not farther than 64 bytes from the ends of arrays ary{1,2}+result
4355 // (|limit| <= 32 and result < 32),
4356 // we may just compare the last 64 bytes.
4357 //
4358 addptr(result, -64); // it is safe, bc we just came from this area
4359 evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
4360 evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
4361 kortestql(mask, mask);
4362 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare
4363
4364 jmp(TRUE_LABEL);
4365
4366 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
4367
4368 }//if (VM_Version::supports_avx512vlbw())
4369
4370 bind(COMPARE_WIDE_VECTORS);
4371 vmovdqu(vec1, Address(ary1, limit, scaleFactor));
4372 if (expand_ary2) {
4373 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit);
4374 } else {
4375 vmovdqu(vec2, Address(ary2, limit, Address::times_1));
4376 }
4377 vpxor(vec1, vec2);
4378
4379 vptest(vec1, vec1);
4380 jcc(Assembler::notZero, FALSE_LABEL);
4381 addptr(limit, scaleIncr * 2);
4382 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4383
4384 testl(result, result);
4385 jcc(Assembler::zero, TRUE_LABEL);
4386
4387 vmovdqu(vec1, Address(ary1, result, scaleFactor, -32));
4388 if (expand_ary2) {
4389 vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit);
4390 } else {
4391 vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
4392 }
4393 vpxor(vec1, vec2);
4394
4395 vptest(vec1, vec1);
4396 jcc(Assembler::notZero, FALSE_LABEL);
4397 jmp(TRUE_LABEL);
4398
4399 bind(COMPARE_TAIL_16); // limit is zero
4400 movl(limit, result);
4401
4402 // Compare 16-byte chunks
4403 andl(result, 0x0000000f); // tail count (in bytes)
4404 andl(limit, 0xfffffff0); // vector count (in bytes)
4405 jcc(Assembler::zero, COMPARE_TAIL);
4406
4407 lea(ary1, Address(ary1, limit, scaleFactor));
4408 lea(ary2, Address(ary2, limit, Address::times_1));
4409 negptr(limit);
4410
4411 bind(COMPARE_WIDE_VECTORS_16);
4412 movdqu(vec1, Address(ary1, limit, scaleFactor));
4413 if (expand_ary2) {
4414 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit);
4415 } else {
4416 movdqu(vec2, Address(ary2, limit, Address::times_1));
4417 }
4418 pxor(vec1, vec2);
4419
4420 ptest(vec1, vec1);
4421 jcc(Assembler::notZero, FALSE_LABEL);
4422 addptr(limit, scaleIncr);
4423 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16);
4424
4425 bind(COMPARE_TAIL); // limit is zero
4426 movl(limit, result);
4427 // Fallthru to tail compare
4428 } else if (UseSSE42Intrinsics) {
4429 // With SSE4.2, use double quad vector compare
4430 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4431
4432 // Compare 16-byte vectors
4433 andl(result, 0x0000000f); // tail count (in bytes)
4434 andl(limit, 0xfffffff0); // vector count (in bytes)
4435 jcc(Assembler::zero, COMPARE_TAIL);
4436
4437 lea(ary1, Address(ary1, limit, Address::times_1));
4438 lea(ary2, Address(ary2, limit, Address::times_1));
4439 negptr(limit);
4440
4441 bind(COMPARE_WIDE_VECTORS);
4442 movdqu(vec1, Address(ary1, limit, Address::times_1));
4443 movdqu(vec2, Address(ary2, limit, Address::times_1));
4444 pxor(vec1, vec2);
4445
4446 ptest(vec1, vec1);
4447 jcc(Assembler::notZero, FALSE_LABEL);
4448 addptr(limit, 16);
4449 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4450
4451 testl(result, result);
4452 jcc(Assembler::zero, TRUE_LABEL);
4453
4454 movdqu(vec1, Address(ary1, result, Address::times_1, -16));
4455 movdqu(vec2, Address(ary2, result, Address::times_1, -16));
4456 pxor(vec1, vec2);
4457
4458 ptest(vec1, vec1);
4459 jccb(Assembler::notZero, FALSE_LABEL);
4460 jmpb(TRUE_LABEL);
4461
4462 bind(COMPARE_TAIL); // limit is zero
4463 movl(limit, result);
4464 // Fallthru to tail compare
4465 }
4466
4467 // Compare 4-byte vectors
4468 if (expand_ary2) {
4469 testl(result, result);
4470 jccb(Assembler::zero, TRUE_LABEL);
4471 } else {
4472 andl(limit, 0xfffffffc); // vector count (in bytes)
4473 jccb(Assembler::zero, COMPARE_CHAR);
4474 }
4475
4476 lea(ary1, Address(ary1, limit, scaleFactor));
4477 lea(ary2, Address(ary2, limit, Address::times_1));
4478 negptr(limit);
4479
4480 bind(COMPARE_VECTORS);
4481 if (expand_ary2) {
4482 // There are no "vector" operations for bytes to shorts
4483 movzbl(chr, Address(ary2, limit, Address::times_1));
4484 cmpw(Address(ary1, limit, Address::times_2), chr);
4485 jccb(Assembler::notEqual, FALSE_LABEL);
4486 addptr(limit, 1);
4487 jcc(Assembler::notZero, COMPARE_VECTORS);
4488 jmp(TRUE_LABEL);
4489 } else {
4490 movl(chr, Address(ary1, limit, Address::times_1));
4491 cmpl(chr, Address(ary2, limit, Address::times_1));
4492 jccb(Assembler::notEqual, FALSE_LABEL);
4493 addptr(limit, 4);
4494 jcc(Assembler::notZero, COMPARE_VECTORS);
4495 }
4496
4497 // Compare trailing char (final 2 bytes), if any
4498 bind(COMPARE_CHAR);
4499 testl(result, 0x2); // tail char
4500 jccb(Assembler::zero, COMPARE_BYTE);
4501 load_unsigned_short(chr, Address(ary1, 0));
4502 load_unsigned_short(limit, Address(ary2, 0));
4503 cmpl(chr, limit);
4504 jccb(Assembler::notEqual, FALSE_LABEL);
4505
4506 if (is_array_equ && is_char) {
4507 bind(COMPARE_BYTE);
4508 } else {
4509 lea(ary1, Address(ary1, 2));
4510 lea(ary2, Address(ary2, 2));
4511
4512 bind(COMPARE_BYTE);
4513 testl(result, 0x1); // tail byte
4514 jccb(Assembler::zero, TRUE_LABEL);
4515 load_unsigned_byte(chr, Address(ary1, 0));
4516 load_unsigned_byte(limit, Address(ary2, 0));
4517 cmpl(chr, limit);
4518 jccb(Assembler::notEqual, FALSE_LABEL);
4519 }
4520 bind(TRUE_LABEL);
4521 movl(result, 1); // return true
4522 jmpb(DONE);
4523
4524 bind(FALSE_LABEL);
4525 xorl(result, result); // return false
4526
4527 // That's it
4528 bind(DONE);
4529 if (UseAVX >= 2) {
4530 // clean upper bits of YMM registers
4531 vpxor(vec1, vec1);
4532 vpxor(vec2, vec2);
4533 }
4534 }
4535
4536 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) {
4537 #define __ masm.
4538 Register dst = stub.data<0>();
4539 XMMRegister src = stub.data<1>();
4540 address target = stub.data<2>();
4541 __ bind(stub.entry());
4542 __ subptr(rsp, 8);
4543 __ movdbl(Address(rsp), src);
4544 __ call(RuntimeAddress(target));
4545 // APX REX2 encoding for pop(dst) increases the stub size by 1 byte.
4546 __ pop(dst);
4547 __ jmp(stub.continuation());
4548 #undef __
4549 }
4550
4551 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) {
4552 assert(dst_bt == T_INT || dst_bt == T_LONG, "");
4553 assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, "");
4554
4555 address slowpath_target;
4556 if (dst_bt == T_INT) {
4557 if (src_bt == T_FLOAT) {
4558 cvttss2sil(dst, src);
4559 cmpl(dst, 0x80000000);
4560 slowpath_target = StubRoutines::x86::f2i_fixup();
4561 } else {
4562 cvttsd2sil(dst, src);
4563 cmpl(dst, 0x80000000);
4564 slowpath_target = StubRoutines::x86::d2i_fixup();
4565 }
4566 } else {
4567 if (src_bt == T_FLOAT) {
4568 cvttss2siq(dst, src);
4569 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4570 slowpath_target = StubRoutines::x86::f2l_fixup();
4571 } else {
4572 cvttsd2siq(dst, src);
4573 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4574 slowpath_target = StubRoutines::x86::d2l_fixup();
4575 }
4576 }
4577
4578 // Using the APX extended general purpose registers increases the instruction encoding size by 1 byte.
4579 int max_size = 23 + (UseAPX ? 1 : 0);
4580 auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, max_size, convertF2I_slowpath);
4581 jcc(Assembler::equal, stub->entry());
4582 bind(stub->continuation());
4583 }
4584
4585 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4586 XMMRegister src1, int imm8, bool merge, int vlen_enc) {
4587 switch(ideal_opc) {
4588 case Op_LShiftVS:
4589 Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
4590 case Op_LShiftVI:
4591 Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
4592 case Op_LShiftVL:
4593 Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
4594 case Op_RShiftVS:
4595 Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
4596 case Op_RShiftVI:
4597 Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
4598 case Op_RShiftVL:
4599 Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
4600 case Op_URShiftVS:
4601 Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
4602 case Op_URShiftVI:
4603 Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
4604 case Op_URShiftVL:
4605 Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
4606 case Op_RotateRightV:
4607 evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4608 case Op_RotateLeftV:
4609 evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4610 default:
4611 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]);
4612 break;
4613 }
4614 }
4615
4616 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4617 XMMRegister src2, bool is_unsigned, bool merge, int vlen_enc) {
4618 if (is_unsigned) {
4619 evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4620 } else {
4621 evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4622 }
4623 }
4624
4625 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4626 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4627 switch (elem_bt) {
4628 case T_BYTE:
4629 if (ideal_opc == Op_SaturatingAddV) {
4630 evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4631 } else {
4632 assert(ideal_opc == Op_SaturatingSubV, "");
4633 evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4634 }
4635 break;
4636 case T_SHORT:
4637 if (ideal_opc == Op_SaturatingAddV) {
4638 evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4639 } else {
4640 assert(ideal_opc == Op_SaturatingSubV, "");
4641 evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4642 }
4643 break;
4644 default:
4645 fatal("Unsupported type %s", type2name(elem_bt));
4646 break;
4647 }
4648 }
4649
4650 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4651 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4652 switch (elem_bt) {
4653 case T_BYTE:
4654 if (ideal_opc == Op_SaturatingAddV) {
4655 evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4656 } else {
4657 assert(ideal_opc == Op_SaturatingSubV, "");
4658 evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4659 }
4660 break;
4661 case T_SHORT:
4662 if (ideal_opc == Op_SaturatingAddV) {
4663 evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4664 } else {
4665 assert(ideal_opc == Op_SaturatingSubV, "");
4666 evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4667 }
4668 break;
4669 default:
4670 fatal("Unsupported type %s", type2name(elem_bt));
4671 break;
4672 }
4673 }
4674
4675 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4676 Address src2, bool is_unsigned, bool merge, int vlen_enc) {
4677 if (is_unsigned) {
4678 evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4679 } else {
4680 evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4681 }
4682 }
4683
4684 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4685 XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4686 switch (elem_bt) {
4687 case T_BYTE:
4688 if (ideal_opc == Op_SaturatingAddV) {
4689 evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4690 } else {
4691 assert(ideal_opc == Op_SaturatingSubV, "");
4692 evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4693 }
4694 break;
4695 case T_SHORT:
4696 if (ideal_opc == Op_SaturatingAddV) {
4697 evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4698 } else {
4699 assert(ideal_opc == Op_SaturatingSubV, "");
4700 evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4701 }
4702 break;
4703 default:
4704 fatal("Unsupported type %s", type2name(elem_bt));
4705 break;
4706 }
4707 }
4708
4709 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4710 XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4711 switch (elem_bt) {
4712 case T_BYTE:
4713 if (ideal_opc == Op_SaturatingAddV) {
4714 evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4715 } else {
4716 assert(ideal_opc == Op_SaturatingSubV, "");
4717 evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4718 }
4719 break;
4720 case T_SHORT:
4721 if (ideal_opc == Op_SaturatingAddV) {
4722 evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4723 } else {
4724 assert(ideal_opc == Op_SaturatingSubV, "");
4725 evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4726 }
4727 break;
4728 default:
4729 fatal("Unsupported type %s", type2name(elem_bt));
4730 break;
4731 }
4732 }
4733
4734 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4735 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
4736 bool is_varshift) {
4737 switch (ideal_opc) {
4738 case Op_AddVB:
4739 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4740 case Op_AddVS:
4741 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4742 case Op_AddVI:
4743 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4744 case Op_AddVL:
4745 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4746 case Op_AddVF:
4747 evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4748 case Op_AddVD:
4749 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4750 case Op_SubVB:
4751 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4752 case Op_SubVS:
4753 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4754 case Op_SubVI:
4755 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4756 case Op_SubVL:
4757 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4758 case Op_SubVF:
4759 evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4760 case Op_SubVD:
4761 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4762 case Op_MulVS:
4763 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4764 case Op_MulVI:
4765 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4766 case Op_MulVL:
4767 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4768 case Op_MulVF:
4769 evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4770 case Op_MulVD:
4771 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4772 case Op_DivVF:
4773 evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4774 case Op_DivVD:
4775 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4776 case Op_SqrtVF:
4777 evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
4778 case Op_SqrtVD:
4779 evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
4780 case Op_AbsVB:
4781 evpabsb(dst, mask, src2, merge, vlen_enc); break;
4782 case Op_AbsVS:
4783 evpabsw(dst, mask, src2, merge, vlen_enc); break;
4784 case Op_AbsVI:
4785 evpabsd(dst, mask, src2, merge, vlen_enc); break;
4786 case Op_AbsVL:
4787 evpabsq(dst, mask, src2, merge, vlen_enc); break;
4788 case Op_FmaVF:
4789 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4790 case Op_FmaVD:
4791 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4792 case Op_VectorRearrange:
4793 evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
4794 case Op_LShiftVS:
4795 evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4796 case Op_LShiftVI:
4797 evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4798 case Op_LShiftVL:
4799 evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4800 case Op_RShiftVS:
4801 evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4802 case Op_RShiftVI:
4803 evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4804 case Op_RShiftVL:
4805 evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4806 case Op_URShiftVS:
4807 evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4808 case Op_URShiftVI:
4809 evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4810 case Op_URShiftVL:
4811 evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4812 case Op_RotateLeftV:
4813 evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4814 case Op_RotateRightV:
4815 evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4816 case Op_MaxV:
4817 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4818 case Op_MinV:
4819 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4820 case Op_UMinV:
4821 evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4822 case Op_UMaxV:
4823 evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4824 case Op_XorV:
4825 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4826 case Op_OrV:
4827 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4828 case Op_AndV:
4829 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4830 default:
4831 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]);
4832 break;
4833 }
4834 }
4835
4836 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4837 XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4838 switch (ideal_opc) {
4839 case Op_AddVB:
4840 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4841 case Op_AddVS:
4842 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4843 case Op_AddVI:
4844 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4845 case Op_AddVL:
4846 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4847 case Op_AddVF:
4848 evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4849 case Op_AddVD:
4850 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4851 case Op_SubVB:
4852 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4853 case Op_SubVS:
4854 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4855 case Op_SubVI:
4856 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4857 case Op_SubVL:
4858 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4859 case Op_SubVF:
4860 evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4861 case Op_SubVD:
4862 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4863 case Op_MulVS:
4864 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4865 case Op_MulVI:
4866 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4867 case Op_MulVL:
4868 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4869 case Op_MulVF:
4870 evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4871 case Op_MulVD:
4872 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4873 case Op_DivVF:
4874 evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4875 case Op_DivVD:
4876 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4877 case Op_FmaVF:
4878 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4879 case Op_FmaVD:
4880 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4881 case Op_MaxV:
4882 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4883 case Op_MinV:
4884 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4885 case Op_UMaxV:
4886 evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4887 case Op_UMinV:
4888 evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4889 case Op_XorV:
4890 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4891 case Op_OrV:
4892 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4893 case Op_AndV:
4894 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4895 default:
4896 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]);
4897 break;
4898 }
4899 }
4900
4901 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
4902 KRegister src1, KRegister src2) {
4903 BasicType etype = T_ILLEGAL;
4904 switch(mask_len) {
4905 case 2:
4906 case 4:
4907 case 8: etype = T_BYTE; break;
4908 case 16: etype = T_SHORT; break;
4909 case 32: etype = T_INT; break;
4910 case 64: etype = T_LONG; break;
4911 default: fatal("Unsupported type"); break;
4912 }
4913 assert(etype != T_ILLEGAL, "");
4914 switch(ideal_opc) {
4915 case Op_AndVMask:
4916 kand(etype, dst, src1, src2); break;
4917 case Op_OrVMask:
4918 kor(etype, dst, src1, src2); break;
4919 case Op_XorVMask:
4920 kxor(etype, dst, src1, src2); break;
4921 default:
4922 fatal("Unsupported masked operation"); break;
4923 }
4924 }
4925
4926 /*
4927 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation.
4928 * If src is NaN, the result is 0.
4929 * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE,
4930 * the result is equal to the value of Integer.MIN_VALUE.
4931 * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE,
4932 * the result is equal to the value of Integer.MAX_VALUE.
4933 */
4934 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4935 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
4936 Register rscratch, AddressLiteral float_sign_flip,
4937 int vec_enc) {
4938 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4939 Label done;
4940 vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
4941 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
4942 vptest(xtmp2, xtmp2, vec_enc);
4943 jccb(Assembler::equal, done);
4944
4945 vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
4946 vpxor(xtmp1, xtmp1, xtmp4, vec_enc);
4947
4948 vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
4949 vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
4950 vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);
4951
4952 // Recompute the mask for remaining special value.
4953 vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
4954 // Extract SRC values corresponding to TRUE mask lanes.
4955 vpand(xtmp4, xtmp2, src, vec_enc);
4956 // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special
4957 // values are set.
4958 vpxor(xtmp3, xtmp2, xtmp4, vec_enc);
4959
4960 vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
4961 bind(done);
4962 }
4963
4964 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4965 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4966 Register rscratch, AddressLiteral float_sign_flip,
4967 int vec_enc) {
4968 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4969 Label done;
4970 evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
4971 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4972 kortestwl(ktmp1, ktmp1);
4973 jccb(Assembler::equal, done);
4974
4975 vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4976 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4977 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4978
4979 kxorwl(ktmp1, ktmp1, ktmp2);
4980 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4981 vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4982 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4983 bind(done);
4984 }
4985
4986 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4987 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4988 Register rscratch, AddressLiteral double_sign_flip,
4989 int vec_enc) {
4990 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4991
4992 Label done;
4993 evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4994 Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
4995 kortestwl(ktmp1, ktmp1);
4996 jccb(Assembler::equal, done);
4997
4998 vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4999 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5000 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
5001
5002 kxorwl(ktmp1, ktmp1, ktmp2);
5003 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5004 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5005 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
5006 bind(done);
5007 }
5008
5009 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5010 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5011 Register rscratch, AddressLiteral float_sign_flip,
5012 int vec_enc) {
5013 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5014 Label done;
5015 evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
5016 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
5017 kortestwl(ktmp1, ktmp1);
5018 jccb(Assembler::equal, done);
5019
5020 vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5021 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5022 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
5023
5024 kxorwl(ktmp1, ktmp1, ktmp2);
5025 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5026 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5027 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
5028 bind(done);
5029 }
5030
5031 /*
5032 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation.
5033 * If src is NaN, the result is 0.
5034 * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
5035 * the result is equal to the value of Long.MIN_VALUE.
5036 * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
5037 * the result is equal to the value of Long.MAX_VALUE.
5038 */
5039 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5040 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5041 Register rscratch, AddressLiteral double_sign_flip,
5042 int vec_enc) {
5043 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
5044
5045 Label done;
5046 evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
5047 evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
5048 kortestwl(ktmp1, ktmp1);
5049 jccb(Assembler::equal, done);
5050
5051 vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5052 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5053 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
5054
5055 kxorwl(ktmp1, ktmp1, ktmp2);
5056 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5057 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5058 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
5059 bind(done);
5060 }
5061
5062 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
5063 XMMRegister xtmp, int index, int vec_enc) {
5064 assert(vec_enc < Assembler::AVX_512bit, "");
5065 if (vec_enc == Assembler::AVX_256bit) {
5066 vextractf128_high(xtmp, src);
5067 vshufps(dst, src, xtmp, index, vec_enc);
5068 } else {
5069 vshufps(dst, src, zero, index, vec_enc);
5070 }
5071 }
5072
5073 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
5074 XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
5075 AddressLiteral float_sign_flip, int src_vec_enc) {
5076 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5077
5078 Label done;
5079 // Compare the destination lanes with float_sign_flip
5080 // value to get mask for all special values.
5081 movdqu(xtmp1, float_sign_flip, rscratch);
5082 vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
5083 ptest(xtmp2, xtmp2);
5084 jccb(Assembler::equal, done);
5085
5086 // Flip float_sign_flip to get max integer value.
5087 vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
5088 pxor(xtmp1, xtmp4);
5089
5090 // Set detination lanes corresponding to unordered source lanes as zero.
5091 vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
5092 vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);
5093
5094 // Shuffle mask vector and pack lower doubles word from each quadword lane.
5095 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5096 vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);
5097
5098 // Recompute the mask for remaining special value.
5099 pxor(xtmp2, xtmp3);
5100 // Extract mask corresponding to non-negative source lanes.
5101 vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);
5102
5103 // Shuffle mask vector and pack lower doubles word from each quadword lane.
5104 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5105 pand(xtmp3, xtmp2);
5106
5107 // Replace destination lanes holding special value(0x80000000) with max int
5108 // if corresponding source lane holds a +ve value.
5109 vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
5110 bind(done);
5111 }
5112
5113
5114 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
5115 XMMRegister xtmp, Register rscratch, int vec_enc) {
5116 switch(to_elem_bt) {
5117 case T_SHORT:
5118 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
5119 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
5120 vpackusdw(dst, dst, zero, vec_enc);
5121 if (vec_enc == Assembler::AVX_256bit) {
5122 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5123 }
5124 break;
5125 case T_BYTE:
5126 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
5127 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
5128 vpackusdw(dst, dst, zero, vec_enc);
5129 if (vec_enc == Assembler::AVX_256bit) {
5130 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5131 }
5132 vpackuswb(dst, dst, zero, vec_enc);
5133 break;
5134 default: assert(false, "Unexpected basic type for target of vector cast int to subword: %s", type2name(to_elem_bt));
5135 }
5136 }
5137
5138 /*
5139 * Algorithm for vector D2L and F2I conversions (AVX 10.2 unsupported):-
5140 * a) Perform vector D2L/F2I cast.
5141 * b) Choose fast path if none of the result vector lane contains 0x80000000 value.
5142 * It signifies that source value could be any of the special floating point
5143 * values(NaN,-Inf,Inf,Max,-Min).
5144 * c) Set destination to zero if source is NaN value.
5145 * d) Replace 0x80000000 with MaxInt if source lane contains a +ve value.
5146 */
5147
5148 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5149 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
5150 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5151 int to_elem_sz = type2aelembytes(to_elem_bt);
5152 assert(to_elem_sz <= 4, "");
5153 vcvttps2dq(dst, src, vec_enc);
5154 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc);
5155 if (to_elem_sz < 4) {
5156 vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
5157 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc);
5158 }
5159 }
5160
5161 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5162 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
5163 Register rscratch, int vec_enc) {
5164 int to_elem_sz = type2aelembytes(to_elem_bt);
5165 assert(to_elem_sz <= 4, "");
5166 vcvttps2dq(dst, src, vec_enc);
5167 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc);
5168 switch(to_elem_bt) {
5169 case T_INT:
5170 break;
5171 case T_SHORT:
5172 evpmovdw(dst, dst, vec_enc);
5173 break;
5174 case T_BYTE:
5175 evpmovdb(dst, dst, vec_enc);
5176 break;
5177 default: assert(false, "Unexpected basic type for target of vector castF2X EVEX: %s", type2name(to_elem_bt));
5178 }
5179 }
5180
5181 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
5182 KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
5183 Register rscratch, int vec_enc) {
5184 evcvttps2qq(dst, src, vec_enc);
5185 vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc);
5186 }
5187
5188 // Handling for downcasting from double to integer or sub-word types on AVX2.
5189 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5190 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5,
5191 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5192 int to_elem_sz = type2aelembytes(to_elem_bt);
5193 assert(to_elem_sz < 8, "");
5194 vcvttpd2dq(dst, src, vec_enc);
5195 vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch,
5196 float_sign_flip, vec_enc);
5197 if (to_elem_sz < 4) {
5198 // xtmp4 holds all zero lanes.
5199 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit);
5200 }
5201 }
5202
5203 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src,
5204 XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1,
5205 KRegister ktmp2, AddressLiteral sign_flip,
5206 Register rscratch, int vec_enc) {
5207 if (VM_Version::supports_avx512dq()) {
5208 evcvttpd2qq(dst, src, vec_enc);
5209 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5210 switch(to_elem_bt) {
5211 case T_LONG:
5212 break;
5213 case T_INT:
5214 evpmovsqd(dst, dst, vec_enc);
5215 break;
5216 case T_SHORT:
5217 evpmovsqd(dst, dst, vec_enc);
5218 evpmovdw(dst, dst, vec_enc);
5219 break;
5220 case T_BYTE:
5221 evpmovsqd(dst, dst, vec_enc);
5222 evpmovdb(dst, dst, vec_enc);
5223 break;
5224 default: assert(false, "Unexpected basic type for target of vector castD2X AVX512DQ EVEX: %s", type2name(to_elem_bt));
5225 }
5226 } else {
5227 assert(type2aelembytes(to_elem_bt) <= 4, "");
5228 vcvttpd2dq(dst, src, vec_enc);
5229 vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5230 switch(to_elem_bt) {
5231 case T_INT:
5232 break;
5233 case T_SHORT:
5234 evpmovdw(dst, dst, vec_enc);
5235 break;
5236 case T_BYTE:
5237 evpmovdb(dst, dst, vec_enc);
5238 break;
5239 default: assert(false, "Unexpected basic type for target of vector castD2X EVEX: %s", type2name(to_elem_bt));
5240 }
5241 }
5242 }
5243
5244 void C2_MacroAssembler::vector_castF2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5245 switch(to_elem_bt) {
5246 case T_LONG:
5247 evcvttps2qqs(dst, src, vec_enc);
5248 break;
5249 case T_INT:
5250 evcvttps2dqs(dst, src, vec_enc);
5251 break;
5252 case T_SHORT:
5253 evcvttps2dqs(dst, src, vec_enc);
5254 evpmovdw(dst, dst, vec_enc);
5255 break;
5256 case T_BYTE:
5257 evcvttps2dqs(dst, src, vec_enc);
5258 evpmovdb(dst, dst, vec_enc);
5259 break;
5260 default: assert(false, "Unexpected basic type for target of vector castF2X AVX10 (reg src): %s", type2name(to_elem_bt));
5261 }
5262 }
5263
5264 void C2_MacroAssembler::vector_castF2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, Address src, int vec_enc) {
5265 switch(to_elem_bt) {
5266 case T_LONG:
5267 evcvttps2qqs(dst, src, vec_enc);
5268 break;
5269 case T_INT:
5270 evcvttps2dqs(dst, src, vec_enc);
5271 break;
5272 case T_SHORT:
5273 evcvttps2dqs(dst, src, vec_enc);
5274 evpmovdw(dst, dst, vec_enc);
5275 break;
5276 case T_BYTE:
5277 evcvttps2dqs(dst, src, vec_enc);
5278 evpmovdb(dst, dst, vec_enc);
5279 break;
5280 default: assert(false, "Unexpected basic type for target of vector castF2X AVX10 (mem src): %s", type2name(to_elem_bt));
5281 }
5282 }
5283
5284 void C2_MacroAssembler::vector_castD2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5285 switch(to_elem_bt) {
5286 case T_LONG:
5287 evcvttpd2qqs(dst, src, vec_enc);
5288 break;
5289 case T_INT:
5290 evcvttpd2dqs(dst, src, vec_enc);
5291 break;
5292 case T_SHORT:
5293 evcvttpd2dqs(dst, src, vec_enc);
5294 evpmovdw(dst, dst, vec_enc);
5295 break;
5296 case T_BYTE:
5297 evcvttpd2dqs(dst, src, vec_enc);
5298 evpmovdb(dst, dst, vec_enc);
5299 break;
5300 default: assert(false, "Unexpected basic type for target of vector castD2X AVX10 (reg src): %s", type2name(to_elem_bt));
5301 }
5302 }
5303
5304 void C2_MacroAssembler::vector_castD2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, Address src, int vec_enc) {
5305 switch(to_elem_bt) {
5306 case T_LONG:
5307 evcvttpd2qqs(dst, src, vec_enc);
5308 break;
5309 case T_INT:
5310 evcvttpd2dqs(dst, src, vec_enc);
5311 break;
5312 case T_SHORT:
5313 evcvttpd2dqs(dst, src, vec_enc);
5314 evpmovdw(dst, dst, vec_enc);
5315 break;
5316 case T_BYTE:
5317 evcvttpd2dqs(dst, src, vec_enc);
5318 evpmovdb(dst, dst, vec_enc);
5319 break;
5320 default: assert(false, "Unexpected basic type for target of vector castD2X AVX10 (mem src): %s", type2name(to_elem_bt));
5321 }
5322 }
5323
5324 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
5325 AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5326 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
5327 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf.
5328 // and re-instantiate original MXCSR.RC mode after that.
5329 ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5330
5331 mov64(tmp, julong_cast(0.5L));
5332 evpbroadcastq(xtmp1, tmp, vec_enc);
5333 vaddpd(xtmp1, src , xtmp1, vec_enc);
5334 evcvtpd2qq(dst, xtmp1, vec_enc);
5335 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5336 double_sign_flip, vec_enc);;
5337
5338 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5339 }
5340
5341 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
5342 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5343 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
5344 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf.
5345 // and re-instantiate original MXCSR.RC mode after that.
5346 ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5347
5348 movl(tmp, jint_cast(0.5));
5349 movq(xtmp1, tmp);
5350 vbroadcastss(xtmp1, xtmp1, vec_enc);
5351 vaddps(xtmp1, src , xtmp1, vec_enc);
5352 vcvtps2dq(dst, xtmp1, vec_enc);
5353 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5354 float_sign_flip, vec_enc);
5355
5356 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5357 }
5358
5359 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
5360 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5361 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
5362 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf.
5363 // and re-instantiate original MXCSR.RC mode after that.
5364 ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5365
5366 movl(tmp, jint_cast(0.5));
5367 movq(xtmp1, tmp);
5368 vbroadcastss(xtmp1, xtmp1, vec_enc);
5369 vaddps(xtmp1, src , xtmp1, vec_enc);
5370 vcvtps2dq(dst, xtmp1, vec_enc);
5371 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc);
5372
5373 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5374 }
5375
5376 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5377 BasicType from_elem_bt, BasicType to_elem_bt) {
5378 switch (from_elem_bt) {
5379 case T_BYTE:
5380 switch (to_elem_bt) {
5381 case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
5382 case T_INT: vpmovzxbd(dst, src, vlen_enc); break;
5383 case T_LONG: vpmovzxbq(dst, src, vlen_enc); break;
5384 default: ShouldNotReachHere();
5385 }
5386 break;
5387 case T_SHORT:
5388 switch (to_elem_bt) {
5389 case T_INT: vpmovzxwd(dst, src, vlen_enc); break;
5390 case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
5391 default: ShouldNotReachHere();
5392 }
5393 break;
5394 case T_INT:
5395 assert(to_elem_bt == T_LONG, "");
5396 vpmovzxdq(dst, src, vlen_enc);
5397 break;
5398 default:
5399 ShouldNotReachHere();
5400 }
5401 }
5402
5403 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5404 BasicType from_elem_bt, BasicType to_elem_bt) {
5405 switch (from_elem_bt) {
5406 case T_BYTE:
5407 switch (to_elem_bt) {
5408 case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break;
5409 case T_INT: vpmovsxbd(dst, src, vlen_enc); break;
5410 case T_LONG: vpmovsxbq(dst, src, vlen_enc); break;
5411 default: ShouldNotReachHere();
5412 }
5413 break;
5414 case T_SHORT:
5415 switch (to_elem_bt) {
5416 case T_INT: vpmovsxwd(dst, src, vlen_enc); break;
5417 case T_LONG: vpmovsxwq(dst, src, vlen_enc); break;
5418 default: ShouldNotReachHere();
5419 }
5420 break;
5421 case T_INT:
5422 assert(to_elem_bt == T_LONG, "");
5423 vpmovsxdq(dst, src, vlen_enc);
5424 break;
5425 default:
5426 ShouldNotReachHere();
5427 }
5428 }
5429
5430 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src,
5431 BasicType dst_bt, BasicType src_bt, int vlen) {
5432 int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen);
5433 assert(vlen_enc != AVX_512bit, "");
5434
5435 int dst_bt_size = type2aelembytes(dst_bt);
5436 int src_bt_size = type2aelembytes(src_bt);
5437 if (dst_bt_size > src_bt_size) {
5438 switch (dst_bt_size / src_bt_size) {
5439 case 2: vpmovsxbw(dst, src, vlen_enc); break;
5440 case 4: vpmovsxbd(dst, src, vlen_enc); break;
5441 case 8: vpmovsxbq(dst, src, vlen_enc); break;
5442 default: ShouldNotReachHere();
5443 }
5444 } else {
5445 assert(dst_bt_size < src_bt_size, "");
5446 switch (src_bt_size / dst_bt_size) {
5447 case 2: {
5448 if (vlen_enc == AVX_128bit) {
5449 vpacksswb(dst, src, src, vlen_enc);
5450 } else {
5451 vpacksswb(dst, src, src, vlen_enc);
5452 vpermq(dst, dst, 0x08, vlen_enc);
5453 }
5454 break;
5455 }
5456 case 4: {
5457 if (vlen_enc == AVX_128bit) {
5458 vpackssdw(dst, src, src, vlen_enc);
5459 vpacksswb(dst, dst, dst, vlen_enc);
5460 } else {
5461 vpackssdw(dst, src, src, vlen_enc);
5462 vpermq(dst, dst, 0x08, vlen_enc);
5463 vpacksswb(dst, dst, dst, AVX_128bit);
5464 }
5465 break;
5466 }
5467 case 8: {
5468 if (vlen_enc == AVX_128bit) {
5469 vpshufd(dst, src, 0x08, vlen_enc);
5470 vpackssdw(dst, dst, dst, vlen_enc);
5471 vpacksswb(dst, dst, dst, vlen_enc);
5472 } else {
5473 vpshufd(dst, src, 0x08, vlen_enc);
5474 vpermq(dst, dst, 0x08, vlen_enc);
5475 vpackssdw(dst, dst, dst, AVX_128bit);
5476 vpacksswb(dst, dst, dst, AVX_128bit);
5477 }
5478 break;
5479 }
5480 default: ShouldNotReachHere();
5481 }
5482 }
5483 }
5484
5485 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
5486 bool merge, BasicType bt, int vlen_enc) {
5487 if (bt == T_INT) {
5488 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5489 } else {
5490 assert(bt == T_LONG, "");
5491 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5492 }
5493 }
5494
5495 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
5496 bool merge, BasicType bt, int vlen_enc) {
5497 if (bt == T_INT) {
5498 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5499 } else {
5500 assert(bt == T_LONG, "");
5501 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5502 }
5503 }
5504
5505 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
5506 Register rtmp2, XMMRegister xtmp, int mask_len,
5507 int vec_enc) {
5508 int index = 0;
5509 int vindex = 0;
5510 mov64(rtmp1, 0x0101010101010101L);
5511 pdepq(rtmp1, src, rtmp1);
5512 if (mask_len > 8) {
5513 movq(rtmp2, src);
5514 vpxor(xtmp, xtmp, xtmp, vec_enc);
5515 movq(xtmp, rtmp1);
5516 }
5517 movq(dst, rtmp1);
5518
5519 mask_len -= 8;
5520 while (mask_len > 0) {
5521 assert ((mask_len & 0x7) == 0, "mask must be multiple of 8");
5522 index++;
5523 if ((index % 2) == 0) {
5524 pxor(xtmp, xtmp);
5525 }
5526 mov64(rtmp1, 0x0101010101010101L);
5527 shrq(rtmp2, 8);
5528 pdepq(rtmp1, rtmp2, rtmp1);
5529 pinsrq(xtmp, rtmp1, index % 2);
5530 vindex = index / 2;
5531 if (vindex) {
5532 // Write entire 16 byte vector when both 64 bit
5533 // lanes are update to save redundant instructions.
5534 if (index % 2) {
5535 vinsertf128(dst, dst, xtmp, vindex);
5536 }
5537 } else {
5538 vmovdqu(dst, xtmp);
5539 }
5540 mask_len -= 8;
5541 }
5542 }
5543
5544 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
5545 switch(opc) {
5546 case Op_VectorMaskTrueCount:
5547 popcntq(dst, tmp);
5548 break;
5549 case Op_VectorMaskLastTrue:
5550 if (VM_Version::supports_lzcnt()) {
5551 lzcntq(tmp, tmp);
5552 movl(dst, 63);
5553 subl(dst, tmp);
5554 } else {
5555 movl(dst, -1);
5556 bsrq(tmp, tmp);
5557 cmov32(Assembler::notZero, dst, tmp);
5558 }
5559 break;
5560 case Op_VectorMaskFirstTrue:
5561 if (VM_Version::supports_bmi1()) {
5562 if (masklen < 32) {
5563 orl(tmp, 1 << masklen);
5564 tzcntl(dst, tmp);
5565 } else if (masklen == 32) {
5566 tzcntl(dst, tmp);
5567 } else {
5568 assert(masklen == 64, "");
5569 tzcntq(dst, tmp);
5570 }
5571 } else {
5572 if (masklen < 32) {
5573 orl(tmp, 1 << masklen);
5574 bsfl(dst, tmp);
5575 } else {
5576 assert(masklen == 32 || masklen == 64, "");
5577 movl(dst, masklen);
5578 if (masklen == 32) {
5579 bsfl(tmp, tmp);
5580 } else {
5581 bsfq(tmp, tmp);
5582 }
5583 cmov32(Assembler::notZero, dst, tmp);
5584 }
5585 }
5586 break;
5587 case Op_VectorMaskToLong:
5588 assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
5589 break;
5590 default: assert(false, "Unhandled mask operation");
5591 }
5592 }
5593
5594 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
5595 int masklen, int masksize, int vec_enc) {
5596 assert(VM_Version::supports_popcnt(), "");
5597
5598 if(VM_Version::supports_avx512bw()) {
5599 kmovql(tmp, mask);
5600 } else {
5601 assert(masklen <= 16, "");
5602 kmovwl(tmp, mask);
5603 }
5604
5605 // Mask generated out of partial vector comparisons/replicate/mask manipulation
5606 // operations needs to be clipped.
5607 if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
5608 andq(tmp, (1 << masklen) - 1);
5609 }
5610
5611 vector_mask_operation_helper(opc, dst, tmp, masklen);
5612 }
5613
5614 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
5615 Register tmp, int masklen, BasicType bt, int vec_enc) {
5616 assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) ||
5617 (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), "");
5618 assert(VM_Version::supports_popcnt(), "");
5619
5620 bool need_clip = false;
5621 switch(bt) {
5622 case T_BOOLEAN:
5623 // While masks of other types contain 0, -1; boolean masks contain lane values of 0, 1
5624 vpxor(xtmp, xtmp, xtmp, vec_enc);
5625 vpsubb(xtmp, xtmp, mask, vec_enc);
5626 vpmovmskb(tmp, xtmp, vec_enc);
5627 need_clip = masklen < 16;
5628 break;
5629 case T_BYTE:
5630 vpmovmskb(tmp, mask, vec_enc);
5631 need_clip = masklen < 16;
5632 break;
5633 case T_SHORT:
5634 vpacksswb(xtmp, mask, mask, vec_enc);
5635 if (masklen >= 16) {
5636 vpermpd(xtmp, xtmp, 8, vec_enc);
5637 }
5638 vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
5639 need_clip = masklen < 16;
5640 break;
5641 case T_INT:
5642 case T_FLOAT:
5643 vmovmskps(tmp, mask, vec_enc);
5644 need_clip = masklen < 4;
5645 break;
5646 case T_LONG:
5647 case T_DOUBLE:
5648 vmovmskpd(tmp, mask, vec_enc);
5649 need_clip = masklen < 2;
5650 break;
5651 default: assert(false, "Unhandled type, %s", type2name(bt));
5652 }
5653
5654 // Mask generated out of partial vector comparisons/replicate/mask manipulation
5655 // operations needs to be clipped.
5656 if (need_clip && opc != Op_VectorMaskFirstTrue) {
5657 // need_clip implies masklen < 32
5658 andq(tmp, (1 << masklen) - 1);
5659 }
5660
5661 vector_mask_operation_helper(opc, dst, tmp, masklen);
5662 }
5663
5664 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
5665 Register rtmp2, int mask_len) {
5666 kmov(rtmp1, src);
5667 andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
5668 mov64(rtmp2, -1L);
5669 pextq(rtmp2, rtmp2, rtmp1);
5670 kmov(dst, rtmp2);
5671 }
5672
5673 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src,
5674 XMMRegister mask, Register rtmp, Register rscratch,
5675 XMMRegister permv, XMMRegister xtmp, BasicType bt,
5676 int vec_enc) {
5677 assert(type2aelembytes(bt) >= 4, "");
5678 assert(opcode == Op_CompressV || opcode == Op_ExpandV, "");
5679 address compress_perm_table = nullptr;
5680 address expand_perm_table = nullptr;
5681 if (type2aelembytes(bt) == 8) {
5682 compress_perm_table = StubRoutines::x86::compress_perm_table64();
5683 expand_perm_table = StubRoutines::x86::expand_perm_table64();
5684 vmovmskpd(rtmp, mask, vec_enc);
5685 } else {
5686 compress_perm_table = StubRoutines::x86::compress_perm_table32();
5687 expand_perm_table = StubRoutines::x86::expand_perm_table32();
5688 vmovmskps(rtmp, mask, vec_enc);
5689 }
5690 shlq(rtmp, 5); // for 32 byte permute row.
5691 if (opcode == Op_CompressV) {
5692 lea(rscratch, ExternalAddress(compress_perm_table));
5693 } else {
5694 lea(rscratch, ExternalAddress(expand_perm_table));
5695 }
5696 addptr(rtmp, rscratch);
5697 vmovdqu(permv, Address(rtmp));
5698 vpermps(dst, permv, src, Assembler::AVX_256bit);
5699 vpxor(xtmp, xtmp, xtmp, vec_enc);
5700 // Blend the result with zero vector using permute mask, each column entry
5701 // in a permute table row contains either a valid permute index or a -1 (default)
5702 // value, this can potentially be used as a blending mask after
5703 // compressing/expanding the source vector lanes.
5704 vblendvps(dst, dst, xtmp, permv, vec_enc, true, permv);
5705 }
5706
5707 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
5708 bool merge, BasicType bt, int vec_enc) {
5709 if (opcode == Op_CompressV) {
5710 switch(bt) {
5711 case T_BYTE:
5712 evpcompressb(dst, mask, src, merge, vec_enc);
5713 break;
5714 case T_CHAR:
5715 case T_SHORT:
5716 evpcompressw(dst, mask, src, merge, vec_enc);
5717 break;
5718 case T_INT:
5719 evpcompressd(dst, mask, src, merge, vec_enc);
5720 break;
5721 case T_FLOAT:
5722 evcompressps(dst, mask, src, merge, vec_enc);
5723 break;
5724 case T_LONG:
5725 evpcompressq(dst, mask, src, merge, vec_enc);
5726 break;
5727 case T_DOUBLE:
5728 evcompresspd(dst, mask, src, merge, vec_enc);
5729 break;
5730 default:
5731 fatal("Unsupported type %s", type2name(bt));
5732 break;
5733 }
5734 } else {
5735 assert(opcode == Op_ExpandV, "");
5736 switch(bt) {
5737 case T_BYTE:
5738 evpexpandb(dst, mask, src, merge, vec_enc);
5739 break;
5740 case T_CHAR:
5741 case T_SHORT:
5742 evpexpandw(dst, mask, src, merge, vec_enc);
5743 break;
5744 case T_INT:
5745 evpexpandd(dst, mask, src, merge, vec_enc);
5746 break;
5747 case T_FLOAT:
5748 evexpandps(dst, mask, src, merge, vec_enc);
5749 break;
5750 case T_LONG:
5751 evpexpandq(dst, mask, src, merge, vec_enc);
5752 break;
5753 case T_DOUBLE:
5754 evexpandpd(dst, mask, src, merge, vec_enc);
5755 break;
5756 default:
5757 fatal("Unsupported type %s", type2name(bt));
5758 break;
5759 }
5760 }
5761 }
5762
5763 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5764 KRegister ktmp1, int vec_enc) {
5765 if (opcode == Op_SignumVD) {
5766 vsubpd(dst, zero, one, vec_enc);
5767 // if src < 0 ? -1 : 1
5768 evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5769 evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
5770 // if src == NaN, -0.0 or 0.0 return src.
5771 evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5772 evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
5773 } else {
5774 assert(opcode == Op_SignumVF, "");
5775 vsubps(dst, zero, one, vec_enc);
5776 // if src < 0 ? -1 : 1
5777 evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5778 evblendmps(dst, ktmp1, one, dst, true, vec_enc);
5779 // if src == NaN, -0.0 or 0.0 return src.
5780 evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5781 evblendmps(dst, ktmp1, dst, src, true, vec_enc);
5782 }
5783 }
5784
5785 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5786 XMMRegister xtmp1, int vec_enc) {
5787 if (opcode == Op_SignumVD) {
5788 vsubpd(dst, zero, one, vec_enc);
5789 // if src < 0 ? -1 : 1
5790 vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1);
5791 // if src == NaN, -0.0 or 0.0 return src.
5792 vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5793 vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5794 } else {
5795 assert(opcode == Op_SignumVF, "");
5796 vsubps(dst, zero, one, vec_enc);
5797 // if src < 0 ? -1 : 1
5798 vblendvps(dst, one, dst, src, vec_enc, true, xtmp1);
5799 // if src == NaN, -0.0 or 0.0 return src.
5800 vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5801 vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5802 }
5803 }
5804
5805 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
5806 if (VM_Version::supports_avx512bw()) {
5807 if (mask_len > 32) {
5808 kmovql(dst, src);
5809 } else {
5810 kmovdl(dst, src);
5811 if (mask_len != 32) {
5812 kshiftrdl(dst, dst, 32 - mask_len);
5813 }
5814 }
5815 } else {
5816 assert(mask_len <= 16, "");
5817 kmovwl(dst, src);
5818 if (mask_len != 16) {
5819 kshiftrwl(dst, dst, 16 - mask_len);
5820 }
5821 }
5822 }
5823
5824 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5825 int lane_size = type2aelembytes(bt);
5826 if ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5827 (is_subword_type(bt) && VM_Version::supports_avx512vlbw())) {
5828 movptr(rtmp, imm32);
5829 switch(lane_size) {
5830 case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5831 case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5832 case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5833 case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
5834 fatal("Unsupported lane size %d", lane_size);
5835 break;
5836 }
5837 } else {
5838 movptr(rtmp, imm32);
5839 movq(dst, rtmp);
5840 switch(lane_size) {
5841 case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5842 case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5843 case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5844 case 8 : vpbroadcastq(dst, dst, vec_enc); break;
5845 fatal("Unsupported lane size %d", lane_size);
5846 break;
5847 }
5848 }
5849 }
5850
5851 //
5852 // Following is lookup table based popcount computation algorithm:-
5853 // Index Bit set count
5854 // [ 0000 -> 0,
5855 // 0001 -> 1,
5856 // 0010 -> 1,
5857 // 0011 -> 2,
5858 // 0100 -> 1,
5859 // 0101 -> 2,
5860 // 0110 -> 2,
5861 // 0111 -> 3,
5862 // 1000 -> 1,
5863 // 1001 -> 2,
5864 // 1010 -> 3,
5865 // 1011 -> 3,
5866 // 1100 -> 2,
5867 // 1101 -> 3,
5868 // 1111 -> 4 ]
5869 // a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
5870 // shuffle indices for lookup table access.
5871 // b. Right shift each byte of vector lane by 4 positions.
5872 // c. Count the number of 1s in 4 MSB bits each byte. These bits are used as
5873 // shuffle indices for lookup table access.
5874 // d. Add the bitset count of upper and lower 4 bits of each byte.
5875 // e. Unpack double words to quad words and compute sum of absolute difference of bitset
5876 // count of all the bytes of a quadword.
5877 // f. Perform step e. for upper 128bit vector lane.
5878 // g. Pack the bitset count of quadwords back to double word.
5879 // h. Unpacking and packing operations are not needed for 64bit vector lane.
5880
5881 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5882 XMMRegister xtmp2, Register rtmp, int vec_enc) {
5883 assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
5884 vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
5885 vpsrlw(dst, src, 4, vec_enc);
5886 vpand(dst, dst, xtmp1, vec_enc);
5887 vpand(xtmp1, src, xtmp1, vec_enc);
5888 vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
5889 vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);
5890 vpshufb(dst, xtmp2, dst, vec_enc);
5891 vpaddb(dst, dst, xtmp1, vec_enc);
5892 }
5893
5894 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5895 XMMRegister xtmp2, Register rtmp, int vec_enc) {
5896 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5897 // Following code is as per steps e,f,g and h of above algorithm.
5898 vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5899 vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
5900 vpsadbw(dst, dst, xtmp2, vec_enc);
5901 vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
5902 vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
5903 vpackuswb(dst, xtmp1, dst, vec_enc);
5904 }
5905
5906 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5907 XMMRegister xtmp2, Register rtmp, int vec_enc) {
5908 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5909 // Add the popcount of upper and lower bytes of word.
5910 vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
5911 vpsrlw(dst, xtmp1, 8, vec_enc);
5912 vpand(xtmp1, xtmp1, xtmp2, vec_enc);
5913 vpaddw(dst, dst, xtmp1, vec_enc);
5914 }
5915
5916 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5917 XMMRegister xtmp2, Register rtmp, int vec_enc) {
5918 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5919 vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5920 vpsadbw(dst, xtmp1, xtmp2, vec_enc);
5921 }
5922
5923 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5924 XMMRegister xtmp2, Register rtmp, int vec_enc) {
5925 switch(bt) {
5926 case T_LONG:
5927 vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5928 break;
5929 case T_INT:
5930 vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5931 break;
5932 case T_CHAR:
5933 case T_SHORT:
5934 vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5935 break;
5936 case T_BYTE:
5937 case T_BOOLEAN:
5938 vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5939 break;
5940 default:
5941 fatal("Unsupported type %s", type2name(bt));
5942 break;
5943 }
5944 }
5945
5946 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5947 KRegister mask, bool merge, int vec_enc) {
5948 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5949 switch(bt) {
5950 case T_LONG:
5951 assert(VM_Version::supports_avx512_vpopcntdq(), "");
5952 evpopcntq(dst, mask, src, merge, vec_enc);
5953 break;
5954 case T_INT:
5955 assert(VM_Version::supports_avx512_vpopcntdq(), "");
5956 evpopcntd(dst, mask, src, merge, vec_enc);
5957 break;
5958 case T_CHAR:
5959 case T_SHORT:
5960 assert(VM_Version::supports_avx512_bitalg(), "");
5961 evpopcntw(dst, mask, src, merge, vec_enc);
5962 break;
5963 case T_BYTE:
5964 case T_BOOLEAN:
5965 assert(VM_Version::supports_avx512_bitalg(), "");
5966 evpopcntb(dst, mask, src, merge, vec_enc);
5967 break;
5968 default:
5969 fatal("Unsupported type %s", type2name(bt));
5970 break;
5971 }
5972 }
5973
5974 // Bit reversal algorithm first reverses the bits of each byte followed by
5975 // a byte level reversal for multi-byte primitive types (short/int/long).
5976 // Algorithm performs a lookup table access to get reverse bit sequence
5977 // corresponding to a 4 bit value. Thus a reverse bit sequence for a byte
5978 // is obtained by swapping the reverse bit sequences of upper and lower
5979 // nibble of a byte.
5980 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5981 XMMRegister xtmp2, Register rtmp, int vec_enc) {
5982 if (VM_Version::supports_avx512vlbw()) {
5983
5984 // Get the reverse bit sequence of lower nibble of each byte.
5985 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
5986 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5987 evpandq(dst, xtmp2, src, vec_enc);
5988 vpshufb(dst, xtmp1, dst, vec_enc);
5989 vpsllq(dst, dst, 4, vec_enc);
5990
5991 // Get the reverse bit sequence of upper nibble of each byte.
5992 vpandn(xtmp2, xtmp2, src, vec_enc);
5993 vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5994 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5995
5996 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
5997 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
5998 evporq(xtmp2, dst, xtmp2, vec_enc);
5999 vector_reverse_byte(bt, dst, xtmp2, vec_enc);
6000
6001 } else if(vec_enc == Assembler::AVX_512bit) {
6002 // Shift based bit reversal.
6003 assert(bt == T_LONG || bt == T_INT, "");
6004
6005 // Swap lower and upper nibble of each byte.
6006 vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);
6007
6008 // Swap two least and most significant bits of each nibble.
6009 vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);
6010
6011 // Swap adjacent pair of bits.
6012 evmovdqul(xtmp1, k0, dst, true, vec_enc);
6013 vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);
6014
6015 evmovdqul(xtmp1, k0, dst, true, vec_enc);
6016 vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
6017 } else {
6018 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
6019 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
6020
6021 // Get the reverse bit sequence of lower nibble of each byte.
6022 vpand(dst, xtmp2, src, vec_enc);
6023 vpshufb(dst, xtmp1, dst, vec_enc);
6024 vpsllq(dst, dst, 4, vec_enc);
6025
6026 // Get the reverse bit sequence of upper nibble of each byte.
6027 vpandn(xtmp2, xtmp2, src, vec_enc);
6028 vpsrlq(xtmp2, xtmp2, 4, vec_enc);
6029 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
6030
6031 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
6032 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
6033 vpor(xtmp2, dst, xtmp2, vec_enc);
6034 vector_reverse_byte(bt, dst, xtmp2, vec_enc);
6035 }
6036 }
6037
6038 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
6039 XMMRegister xtmp, Register rscratch) {
6040 assert(VM_Version::supports_gfni(), "");
6041 assert(rscratch != noreg || always_reachable(mask), "missing");
6042
6043 // Galois field instruction based bit reversal based on following algorithm.
6044 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6045 vpbroadcastq(xtmp, mask, vec_enc, rscratch);
6046 vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
6047 vector_reverse_byte(bt, dst, xtmp, vec_enc);
6048 }
6049
6050 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
6051 XMMRegister xtmp1, Register rtmp, int vec_enc) {
6052 vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
6053 evpandq(dst, xtmp1, src, vec_enc);
6054 vpsllq(dst, dst, nbits, vec_enc);
6055 vpandn(xtmp1, xtmp1, src, vec_enc);
6056 vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
6057 evporq(dst, dst, xtmp1, vec_enc);
6058 }
6059
6060 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6061 XMMRegister xtmp2, Register rtmp, int vec_enc) {
6062 // Shift based bit reversal.
6063 assert(VM_Version::supports_evex(), "");
6064 switch(bt) {
6065 case T_LONG:
6066 // Swap upper and lower double word of each quad word.
6067 evprorq(xtmp1, k0, src, 32, true, vec_enc);
6068 evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
6069 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
6070 break;
6071 case T_INT:
6072 // Swap upper and lower word of each double word.
6073 evprord(xtmp1, k0, src, 16, true, vec_enc);
6074 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
6075 break;
6076 case T_CHAR:
6077 case T_SHORT:
6078 // Swap upper and lower byte of each word.
6079 vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
6080 break;
6081 case T_BYTE:
6082 evmovdquq(dst, k0, src, true, vec_enc);
6083 break;
6084 default:
6085 fatal("Unsupported type %s", type2name(bt));
6086 break;
6087 }
6088 }
6089
6090 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
6091 if (bt == T_BYTE) {
6092 if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
6093 evmovdquq(dst, k0, src, true, vec_enc);
6094 } else {
6095 vmovdqu(dst, src);
6096 }
6097 return;
6098 }
6099 // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
6100 // pre-computed shuffle indices.
6101 switch(bt) {
6102 case T_LONG:
6103 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
6104 break;
6105 case T_INT:
6106 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
6107 break;
6108 case T_CHAR:
6109 case T_SHORT:
6110 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
6111 break;
6112 default:
6113 fatal("Unsupported type %s", type2name(bt));
6114 break;
6115 }
6116 vpshufb(dst, src, dst, vec_enc);
6117 }
6118
6119 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6120 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6121 KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
6122 assert(is_integral_type(bt), "");
6123 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
6124 assert(VM_Version::supports_avx512cd(), "");
6125 switch(bt) {
6126 case T_LONG:
6127 evplzcntq(dst, ktmp, src, merge, vec_enc);
6128 break;
6129 case T_INT:
6130 evplzcntd(dst, ktmp, src, merge, vec_enc);
6131 break;
6132 case T_SHORT:
6133 vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
6134 vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
6135 evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
6136 vpunpckhwd(dst, xtmp1, src, vec_enc);
6137 evplzcntd(dst, ktmp, dst, merge, vec_enc);
6138 vpackusdw(dst, xtmp2, dst, vec_enc);
6139 break;
6140 case T_BYTE:
6141 // T1 = Compute leading zero counts of 4 LSB bits of each byte by
6142 // accessing the lookup table.
6143 // T2 = Compute leading zero counts of 4 MSB bits of each byte by
6144 // accessing the lookup table.
6145 // Add T1 to T2 if 4 MSB bits of byte are all zeros.
6146 assert(VM_Version::supports_avx512bw(), "");
6147 evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
6148 vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
6149 vpand(xtmp2, dst, src, vec_enc);
6150 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
6151 vpsrlw(xtmp3, src, 4, vec_enc);
6152 vpand(xtmp3, dst, xtmp3, vec_enc);
6153 vpshufb(dst, xtmp1, xtmp3, vec_enc);
6154 vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6155 evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
6156 evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
6157 break;
6158 default:
6159 fatal("Unsupported type %s", type2name(bt));
6160 break;
6161 }
6162 }
6163
6164 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6165 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6166 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
6167 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
6168 // T1 = Compute leading zero counts of 4 LSB bits of each byte by
6169 // accessing the lookup table.
6170 vpand(dst, xtmp2, src, vec_enc);
6171 vpshufb(dst, xtmp1, dst, vec_enc);
6172 // T2 = Compute leading zero counts of 4 MSB bits of each byte by
6173 // accessing the lookup table.
6174 vpsrlw(xtmp3, src, 4, vec_enc);
6175 vpand(xtmp3, xtmp2, xtmp3, vec_enc);
6176 vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
6177 // Add T1 to T2 if 4 MSB bits of byte are all zeros.
6178 vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6179 vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
6180 vpaddb(dst, dst, xtmp2, vec_enc);
6181 vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
6182 }
6183
6184 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6185 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6186 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6187 // Add zero counts of lower byte and upper byte of a word if
6188 // upper byte holds a zero value.
6189 vpsrlw(xtmp3, src, 8, vec_enc);
6190 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
6191 vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
6192 vpsllw(xtmp2, dst, 8, vec_enc);
6193 vpaddw(xtmp2, xtmp2, dst, vec_enc);
6194 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6195 vpsrlw(dst, dst, 8, vec_enc);
6196 }
6197
6198 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6199 XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
6200 // By converting the integer to a float, we can obtain the number of leading zeros based on the exponent of the float.
6201 // As the float exponent contains a bias of 127 for nonzero values, the bias must be removed before interpreting the
6202 // exponent as the leading zero count.
6203
6204 // Remove the bit to the right of the highest set bit ensuring that the conversion to float cannot round up to a higher
6205 // power of 2, which has a higher exponent than the input. This transformation is valid as only the highest set bit
6206 // contributes to the leading number of zeros.
6207 vpsrld(dst, src, 1, vec_enc);
6208 vpandn(dst, dst, src, vec_enc);
6209
6210 vcvtdq2ps(dst, dst, vec_enc);
6211
6212 // By comparing the register to itself, all the bits in the destination are set.
6213 vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);
6214
6215 // Move the biased exponent to the low end of the lane and mask with 0xFF to discard the sign bit.
6216 vpsrld(xtmp2, xtmp1, 24, vec_enc);
6217 vpsrld(dst, dst, 23, vec_enc);
6218 vpand(dst, xtmp2, dst, vec_enc);
6219
6220 // Subtract 127 from the exponent, which removes the bias from the exponent.
6221 vpsrld(xtmp2, xtmp1, 25, vec_enc);
6222 vpsubd(dst, dst, xtmp2, vec_enc);
6223
6224 vpsrld(xtmp2, xtmp1, 27, vec_enc);
6225
6226 // If the original value is 0 the exponent would not have bias, so the subtraction creates a negative number. If this
6227 // is found in any of the lanes, replace the lane with -1 from xtmp1.
6228 vblendvps(dst, dst, xtmp1, dst, vec_enc, true, xtmp3);
6229
6230 // If the original value is negative, replace the lane with 31.
6231 vblendvps(dst, dst, xtmp2, src, vec_enc, true, xtmp3);
6232
6233 // Subtract the exponent from 31, giving the final result. For 0, the result is 32 as the exponent was replaced with -1,
6234 // and for negative numbers the result is 0 as the exponent was replaced with 31.
6235 vpsubd(dst, xtmp2, dst, vec_enc);
6236 }
6237
6238 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6239 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6240 // Find the leading zeros of the top and bottom halves of the long individually.
6241 vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
6242
6243 // Move the top half result to the bottom half of xtmp1, setting the top half to 0.
6244 vpsrlq(xtmp1, dst, 32, vec_enc);
6245 // By moving the top half result to the right by 6 bits, if the top half was empty (i.e. 32 is returned) the result bit will
6246 // be in the most significant position of the bottom half.
6247 vpsrlq(xtmp2, dst, 6, vec_enc);
6248
6249 // In the bottom half, add the top half and bottom half results.
6250 vpaddq(dst, xtmp1, dst, vec_enc);
6251
6252 // For the bottom half, choose between the values using the most significant bit of xtmp2.
6253 // If the MSB is set, then bottom+top in dst is the resulting value. If the top half is less than 32 xtmp1 is chosen,
6254 // which contains only the top half result.
6255 // In the top half the MSB is always zero, so the value in xtmp1 is always chosen. This value is always 0, which clears
6256 // the lane as required.
6257 vblendvps(dst, xtmp1, dst, xtmp2, vec_enc, true, xtmp3);
6258 }
6259
6260 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
6261 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6262 Register rtmp, int vec_enc) {
6263 assert(is_integral_type(bt), "unexpected type");
6264 assert(vec_enc < Assembler::AVX_512bit, "");
6265 switch(bt) {
6266 case T_LONG:
6267 vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6268 break;
6269 case T_INT:
6270 vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
6271 break;
6272 case T_SHORT:
6273 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6274 break;
6275 case T_BYTE:
6276 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6277 break;
6278 default:
6279 fatal("Unsupported type %s", type2name(bt));
6280 break;
6281 }
6282 }
6283
6284 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
6285 switch(bt) {
6286 case T_BYTE:
6287 vpsubb(dst, src1, src2, vec_enc);
6288 break;
6289 case T_SHORT:
6290 vpsubw(dst, src1, src2, vec_enc);
6291 break;
6292 case T_INT:
6293 vpsubd(dst, src1, src2, vec_enc);
6294 break;
6295 case T_LONG:
6296 vpsubq(dst, src1, src2, vec_enc);
6297 break;
6298 default:
6299 fatal("Unsupported type %s", type2name(bt));
6300 break;
6301 }
6302 }
6303
6304 // Trailing zero count computation is based on leading zero count operation as per
6305 // following equation. All AVX3 targets support AVX512CD feature which offers
6306 // direct vector instruction to compute leading zero count.
6307 // CTZ = PRIM_TYPE_WIDHT - CLZ((x - 1) & ~x)
6308 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6309 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6310 XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
6311 assert(is_integral_type(bt), "");
6312 // xtmp = -1
6313 vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
6314 // xtmp = xtmp + src
6315 vpadd(bt, xtmp4, xtmp4, src, vec_enc);
6316 // xtmp = xtmp & ~src
6317 vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
6318 vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
6319 vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
6320 vpsub(bt, dst, xtmp4, dst, vec_enc);
6321 }
6322
6323 // Trailing zero count computation for AVX2 targets is based on popcount operation as per following equation
6324 // CTZ = PRIM_TYPE_WIDHT - POPC(x | -x)
6325 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6326 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6327 assert(is_integral_type(bt), "");
6328 // xtmp = 0
6329 vpxor(xtmp3 , xtmp3, xtmp3, vec_enc);
6330 // xtmp = 0 - src
6331 vpsub(bt, xtmp3, xtmp3, src, vec_enc);
6332 // xtmp = xtmp | src
6333 vpor(xtmp3, xtmp3, src, vec_enc);
6334 vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
6335 vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
6336 vpsub(bt, dst, xtmp1, dst, vec_enc);
6337 }
6338
6339 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
6340 Label done;
6341 Label neg_divisor_fastpath;
6342 cmpl(divisor, 0);
6343 jccb(Assembler::less, neg_divisor_fastpath);
6344 xorl(rdx, rdx);
6345 divl(divisor);
6346 jmpb(done);
6347 bind(neg_divisor_fastpath);
6348 // Fastpath for divisor < 0:
6349 // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6350 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
6351 movl(rdx, rax);
6352 subl(rdx, divisor);
6353 if (VM_Version::supports_bmi1()) {
6354 andnl(rax, rdx, rax);
6355 } else {
6356 notl(rdx);
6357 andl(rax, rdx);
6358 }
6359 shrl(rax, 31);
6360 bind(done);
6361 }
6362
6363 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
6364 Label done;
6365 Label neg_divisor_fastpath;
6366 cmpl(divisor, 0);
6367 jccb(Assembler::less, neg_divisor_fastpath);
6368 xorl(rdx, rdx);
6369 divl(divisor);
6370 jmpb(done);
6371 bind(neg_divisor_fastpath);
6372 // Fastpath when divisor < 0:
6373 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6374 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6375 movl(rdx, rax);
6376 subl(rax, divisor);
6377 if (VM_Version::supports_bmi1()) {
6378 andnl(rax, rax, rdx);
6379 } else {
6380 notl(rax);
6381 andl(rax, rdx);
6382 }
6383 sarl(rax, 31);
6384 andl(rax, divisor);
6385 subl(rdx, rax);
6386 bind(done);
6387 }
6388
6389 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
6390 Label done;
6391 Label neg_divisor_fastpath;
6392
6393 cmpl(divisor, 0);
6394 jccb(Assembler::less, neg_divisor_fastpath);
6395 xorl(rdx, rdx);
6396 divl(divisor);
6397 jmpb(done);
6398 bind(neg_divisor_fastpath);
6399 // Fastpath for divisor < 0:
6400 // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6401 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6402 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6403 // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6404 movl(rdx, rax);
6405 subl(rax, divisor);
6406 if (VM_Version::supports_bmi1()) {
6407 andnl(rax, rax, rdx);
6408 } else {
6409 notl(rax);
6410 andl(rax, rdx);
6411 }
6412 movl(tmp, rax);
6413 shrl(rax, 31); // quotient
6414 sarl(tmp, 31);
6415 andl(tmp, divisor);
6416 subl(rdx, tmp); // remainder
6417 bind(done);
6418 }
6419
6420 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
6421 XMMRegister xtmp2, Register rtmp) {
6422 if(VM_Version::supports_gfni()) {
6423 // Galois field instruction based bit reversal based on following algorithm.
6424 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6425 mov64(rtmp, 0x8040201008040201L);
6426 movq(xtmp1, src);
6427 movq(xtmp2, rtmp);
6428 gf2p8affineqb(xtmp1, xtmp2, 0);
6429 movq(dst, xtmp1);
6430 } else {
6431 // Swap even and odd numbered bits.
6432 movl(rtmp, src);
6433 andl(rtmp, 0x55555555);
6434 shll(rtmp, 1);
6435 movl(dst, src);
6436 andl(dst, 0xAAAAAAAA);
6437 shrl(dst, 1);
6438 orl(dst, rtmp);
6439
6440 // Swap LSB and MSB 2 bits of each nibble.
6441 movl(rtmp, dst);
6442 andl(rtmp, 0x33333333);
6443 shll(rtmp, 2);
6444 andl(dst, 0xCCCCCCCC);
6445 shrl(dst, 2);
6446 orl(dst, rtmp);
6447
6448 // Swap LSB and MSB 4 bits of each byte.
6449 movl(rtmp, dst);
6450 andl(rtmp, 0x0F0F0F0F);
6451 shll(rtmp, 4);
6452 andl(dst, 0xF0F0F0F0);
6453 shrl(dst, 4);
6454 orl(dst, rtmp);
6455 }
6456 bswapl(dst);
6457 }
6458
6459 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
6460 XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
6461 if(VM_Version::supports_gfni()) {
6462 // Galois field instruction based bit reversal based on following algorithm.
6463 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6464 mov64(rtmp1, 0x8040201008040201L);
6465 movq(xtmp1, src);
6466 movq(xtmp2, rtmp1);
6467 gf2p8affineqb(xtmp1, xtmp2, 0);
6468 movq(dst, xtmp1);
6469 } else {
6470 // Swap even and odd numbered bits.
6471 movq(rtmp1, src);
6472 mov64(rtmp2, 0x5555555555555555L);
6473 andq(rtmp1, rtmp2);
6474 shlq(rtmp1, 1);
6475 movq(dst, src);
6476 notq(rtmp2);
6477 andq(dst, rtmp2);
6478 shrq(dst, 1);
6479 orq(dst, rtmp1);
6480
6481 // Swap LSB and MSB 2 bits of each nibble.
6482 movq(rtmp1, dst);
6483 mov64(rtmp2, 0x3333333333333333L);
6484 andq(rtmp1, rtmp2);
6485 shlq(rtmp1, 2);
6486 notq(rtmp2);
6487 andq(dst, rtmp2);
6488 shrq(dst, 2);
6489 orq(dst, rtmp1);
6490
6491 // Swap LSB and MSB 4 bits of each byte.
6492 movq(rtmp1, dst);
6493 mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
6494 andq(rtmp1, rtmp2);
6495 shlq(rtmp1, 4);
6496 notq(rtmp2);
6497 andq(dst, rtmp2);
6498 shrq(dst, 4);
6499 orq(dst, rtmp1);
6500 }
6501 bswapq(dst);
6502 }
6503
6504 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
6505 Label done;
6506 Label neg_divisor_fastpath;
6507 cmpq(divisor, 0);
6508 jccb(Assembler::less, neg_divisor_fastpath);
6509 xorl(rdx, rdx);
6510 divq(divisor);
6511 jmpb(done);
6512 bind(neg_divisor_fastpath);
6513 // Fastpath for divisor < 0:
6514 // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6515 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
6516 movq(rdx, rax);
6517 subq(rdx, divisor);
6518 if (VM_Version::supports_bmi1()) {
6519 andnq(rax, rdx, rax);
6520 } else {
6521 notq(rdx);
6522 andq(rax, rdx);
6523 }
6524 shrq(rax, 63);
6525 bind(done);
6526 }
6527
6528 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
6529 Label done;
6530 Label neg_divisor_fastpath;
6531 cmpq(divisor, 0);
6532 jccb(Assembler::less, neg_divisor_fastpath);
6533 xorq(rdx, rdx);
6534 divq(divisor);
6535 jmp(done);
6536 bind(neg_divisor_fastpath);
6537 // Fastpath when divisor < 0:
6538 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6539 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6540 movq(rdx, rax);
6541 subq(rax, divisor);
6542 if (VM_Version::supports_bmi1()) {
6543 andnq(rax, rax, rdx);
6544 } else {
6545 notq(rax);
6546 andq(rax, rdx);
6547 }
6548 sarq(rax, 63);
6549 andq(rax, divisor);
6550 subq(rdx, rax);
6551 bind(done);
6552 }
6553
6554 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
6555 Label done;
6556 Label neg_divisor_fastpath;
6557 cmpq(divisor, 0);
6558 jccb(Assembler::less, neg_divisor_fastpath);
6559 xorq(rdx, rdx);
6560 divq(divisor);
6561 jmp(done);
6562 bind(neg_divisor_fastpath);
6563 // Fastpath for divisor < 0:
6564 // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6565 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6566 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6567 // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6568 movq(rdx, rax);
6569 subq(rax, divisor);
6570 if (VM_Version::supports_bmi1()) {
6571 andnq(rax, rax, rdx);
6572 } else {
6573 notq(rax);
6574 andq(rax, rdx);
6575 }
6576 movq(tmp, rax);
6577 shrq(rax, 63); // quotient
6578 sarq(tmp, 63);
6579 andq(tmp, divisor);
6580 subq(rdx, tmp); // remainder
6581 bind(done);
6582 }
6583
6584 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
6585 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
6586 int vlen_enc) {
6587 assert(VM_Version::supports_avx512bw(), "");
6588 // Byte shuffles are inlane operations and indices are determined using
6589 // lower 4 bit of each shuffle lane, thus all shuffle indices are
6590 // normalized to index range 0-15. This makes sure that all the multiples
6591 // of an index value are placed at same relative position in 128 bit
6592 // lane i.e. elements corresponding to shuffle indices 16, 32 and 64
6593 // will be 16th element in their respective 128 bit lanes.
6594 movl(rtmp, 16);
6595 evpbroadcastb(xtmp1, rtmp, vlen_enc);
6596
6597 // Compute a mask for shuffle vector by comparing indices with expression INDEX < 16,
6598 // Broadcast first 128 bit lane across entire vector, shuffle the vector lanes using
6599 // original shuffle indices and move the shuffled lanes corresponding to true
6600 // mask to destination vector.
6601 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
6602 evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
6603 evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
6604
6605 // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
6606 // and broadcasting second 128 bit lane.
6607 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
6608 vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
6609 evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
6610 evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
6611 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6612
6613 // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
6614 // and broadcasting third 128 bit lane.
6615 evpcmpb(ktmp, k0, shuffle, xtmp2, Assembler::nlt, true, vlen_enc);
6616 vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
6617 evpcmpb(ktmp, ktmp, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
6618 evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
6619 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6620
6621 // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
6622 // and broadcasting third 128 bit lane.
6623 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
6624 vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
6625 evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
6626 evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
6627 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6628 }
6629
6630 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
6631 XMMRegister shuffle, XMMRegister src, int vlen_enc) {
6632 if (vlen_enc == AVX_128bit) {
6633 vpermilps(dst, src, shuffle, vlen_enc);
6634 } else if (bt == T_INT) {
6635 vpermd(dst, shuffle, src, vlen_enc);
6636 } else {
6637 assert(bt == T_FLOAT, "");
6638 vpermps(dst, shuffle, src, vlen_enc);
6639 }
6640 }
6641
6642 void C2_MacroAssembler::efp16sh(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
6643 switch(opcode) {
6644 case Op_AddHF: vaddsh(dst, src1, src2); break;
6645 case Op_SubHF: vsubsh(dst, src1, src2); break;
6646 case Op_MulHF: vmulsh(dst, src1, src2); break;
6647 case Op_DivHF: vdivsh(dst, src1, src2); break;
6648 default: assert(false, "%s", NodeClassNames[opcode]); break;
6649 }
6650 }
6651
6652 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6653 switch(elem_bt) {
6654 case T_BYTE:
6655 if (ideal_opc == Op_SaturatingAddV) {
6656 vpaddsb(dst, src1, src2, vlen_enc);
6657 } else {
6658 assert(ideal_opc == Op_SaturatingSubV, "");
6659 vpsubsb(dst, src1, src2, vlen_enc);
6660 }
6661 break;
6662 case T_SHORT:
6663 if (ideal_opc == Op_SaturatingAddV) {
6664 vpaddsw(dst, src1, src2, vlen_enc);
6665 } else {
6666 assert(ideal_opc == Op_SaturatingSubV, "");
6667 vpsubsw(dst, src1, src2, vlen_enc);
6668 }
6669 break;
6670 default:
6671 fatal("Unsupported type %s", type2name(elem_bt));
6672 break;
6673 }
6674 }
6675
6676 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6677 switch(elem_bt) {
6678 case T_BYTE:
6679 if (ideal_opc == Op_SaturatingAddV) {
6680 vpaddusb(dst, src1, src2, vlen_enc);
6681 } else {
6682 assert(ideal_opc == Op_SaturatingSubV, "");
6683 vpsubusb(dst, src1, src2, vlen_enc);
6684 }
6685 break;
6686 case T_SHORT:
6687 if (ideal_opc == Op_SaturatingAddV) {
6688 vpaddusw(dst, src1, src2, vlen_enc);
6689 } else {
6690 assert(ideal_opc == Op_SaturatingSubV, "");
6691 vpsubusw(dst, src1, src2, vlen_enc);
6692 }
6693 break;
6694 default:
6695 fatal("Unsupported type %s", type2name(elem_bt));
6696 break;
6697 }
6698 }
6699
6700 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6701 XMMRegister src2, KRegister ktmp, int vlen_enc) {
6702 // For unsigned subtraction, overflow happens when magnitude of second input is greater than first input.
6703 // overflow_mask = Inp1 <u Inp2
6704 evpcmpu(elem_bt, ktmp, src2, src1, Assembler::lt, vlen_enc);
6705 // Res = overflow_mask ? Zero : INP1 - INP2 (non-commutative and non-associative)
6706 evmasked_op(elem_bt == T_INT ? Op_SubVI : Op_SubVL, elem_bt, ktmp, dst, src1, src2, false, vlen_enc, false);
6707 }
6708
6709 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6710 XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
6711 // Emulate unsigned comparison using signed comparison
6712 // Mask = Inp1 <u Inp2 => Inp1 + MIN_VALUE < Inp2 + MIN_VALUE
6713 vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc, true);
6714 vpadd(elem_bt, xtmp2, src1, xtmp1, vlen_enc);
6715 vpadd(elem_bt, xtmp1, src2, xtmp1, vlen_enc);
6716
6717 vpcmpgt(elem_bt, xtmp2, xtmp1, xtmp2, vlen_enc);
6718
6719 // Res = INP1 - INP2 (non-commutative and non-associative)
6720 vpsub(elem_bt, dst, src1, src2, vlen_enc);
6721 // Res = Mask ? Zero : Res
6722 vpxor(xtmp1, xtmp1, xtmp1, vlen_enc);
6723 vpblendvb(dst, dst, xtmp1, xtmp2, vlen_enc);
6724 }
6725
6726 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6727 XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp, int vlen_enc) {
6728 // Unsigned values ranges comprise of only +ve numbers, thus there exist only an upper bound saturation.
6729 // overflow_mask = (SRC1 + SRC2) <u (SRC1 | SRC2)
6730 // Res = Signed Add INP1, INP2
6731 vpadd(elem_bt, dst, src1, src2, vlen_enc);
6732 // T1 = SRC1 | SRC2
6733 vpor(xtmp1, src1, src2, vlen_enc);
6734 // Max_Unsigned = -1
6735 vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6736 // Unsigned compare: Mask = Res <u T1
6737 evpcmpu(elem_bt, ktmp, dst, xtmp1, Assembler::lt, vlen_enc);
6738 // res = Mask ? Max_Unsigned : Res
6739 evpblend(elem_bt, dst, ktmp, dst, xtmp2, true, vlen_enc);
6740 }
6741
6742 //
6743 // Section 2-13 Hacker's Delight list following overflow detection check for saturating
6744 // unsigned addition operation.
6745 // overflow_mask = ((a & b) | ((a | b) & ~( a + b))) >>> 31 == 1
6746 //
6747 // We empirically determined its semantic equivalence to following reduced expression
6748 // overflow_mask = (a + b) <u (a | b)
6749 //
6750 // and also verified it though Alive2 solver.
6751 // (https://alive2.llvm.org/ce/z/XDQ7dY)
6752 //
6753
6754 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6755 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, int vlen_enc) {
6756 // Res = Signed Add INP1, INP2
6757 vpadd(elem_bt, dst, src1, src2, vlen_enc);
6758 // Compute T1 = INP1 | INP2
6759 vpor(xtmp3, src1, src2, vlen_enc);
6760 // T1 = Minimum signed value.
6761 vpgenmin_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
6762 // Convert T1 to signed value, T1 = T1 + MIN_VALUE
6763 vpadd(elem_bt, xtmp3, xtmp3, xtmp2, vlen_enc);
6764 // Convert Res to signed value, Res<s> = Res + MIN_VALUE
6765 vpadd(elem_bt, xtmp2, xtmp2, dst, vlen_enc);
6766 // Compute overflow detection mask = Res<1> <s T1
6767 if (elem_bt == T_INT) {
6768 vpcmpgtd(xtmp3, xtmp3, xtmp2, vlen_enc);
6769 } else {
6770 assert(elem_bt == T_LONG, "");
6771 vpcmpgtq(xtmp3, xtmp3, xtmp2, vlen_enc);
6772 }
6773 vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
6774 }
6775
6776 void C2_MacroAssembler::evpmovq2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
6777 int vlen_enc, bool xtmp2_hold_M1) {
6778 if (VM_Version::supports_avx512dq()) {
6779 evpmovq2m(ktmp, src, vlen_enc);
6780 } else {
6781 assert(VM_Version::supports_evex(), "");
6782 if (!xtmp2_hold_M1) {
6783 vpternlogq(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6784 }
6785 evpsraq(xtmp1, src, 63, vlen_enc);
6786 evpcmpeqq(ktmp, k0, xtmp1, xtmp2, vlen_enc);
6787 }
6788 }
6789
6790 void C2_MacroAssembler::evpmovd2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
6791 int vlen_enc, bool xtmp2_hold_M1) {
6792 if (VM_Version::supports_avx512dq()) {
6793 evpmovd2m(ktmp, src, vlen_enc);
6794 } else {
6795 assert(VM_Version::supports_evex(), "");
6796 if (!xtmp2_hold_M1) {
6797 vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6798 }
6799 vpsrad(xtmp1, src, 31, vlen_enc);
6800 Assembler::evpcmpeqd(ktmp, k0, xtmp1, xtmp2, vlen_enc);
6801 }
6802 }
6803
6804
6805 void C2_MacroAssembler::vpsign_extend_dq(BasicType elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
6806 if (elem_bt == T_LONG) {
6807 if (VM_Version::supports_evex()) {
6808 evpsraq(dst, src, 63, vlen_enc);
6809 } else {
6810 vpsrad(dst, src, 31, vlen_enc);
6811 vpshufd(dst, dst, 0xF5, vlen_enc);
6812 }
6813 } else {
6814 assert(elem_bt == T_INT, "");
6815 vpsrad(dst, src, 31, vlen_enc);
6816 }
6817 }
6818
6819 void C2_MacroAssembler::vpgenmax_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
6820 if (compute_allones) {
6821 if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
6822 vpternlogd(allones, 0xff, allones, allones, vlen_enc);
6823 } else {
6824 vpcmpeqq(allones, allones, allones, vlen_enc);
6825 }
6826 }
6827 if (elem_bt == T_LONG) {
6828 vpsrlq(dst, allones, 1, vlen_enc);
6829 } else {
6830 assert(elem_bt == T_INT, "");
6831 vpsrld(dst, allones, 1, vlen_enc);
6832 }
6833 }
6834
6835 void C2_MacroAssembler::vpgenmin_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
6836 if (compute_allones) {
6837 if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
6838 vpternlogd(allones, 0xff, allones, allones, vlen_enc);
6839 } else {
6840 vpcmpeqq(allones, allones, allones, vlen_enc);
6841 }
6842 }
6843 if (elem_bt == T_LONG) {
6844 vpsllq(dst, allones, 63, vlen_enc);
6845 } else {
6846 assert(elem_bt == T_INT, "");
6847 vpslld(dst, allones, 31, vlen_enc);
6848 }
6849 }
6850
6851 void C2_MacroAssembler::evpcmpu(BasicType elem_bt, KRegister kmask, XMMRegister src1, XMMRegister src2,
6852 Assembler::ComparisonPredicate cond, int vlen_enc) {
6853 switch(elem_bt) {
6854 case T_LONG: evpcmpuq(kmask, src1, src2, cond, vlen_enc); break;
6855 case T_INT: evpcmpud(kmask, src1, src2, cond, vlen_enc); break;
6856 case T_SHORT: evpcmpuw(kmask, src1, src2, cond, vlen_enc); break;
6857 case T_BYTE: evpcmpub(kmask, src1, src2, cond, vlen_enc); break;
6858 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6859 }
6860 }
6861
6862 void C2_MacroAssembler::vpcmpgt(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6863 switch(elem_bt) {
6864 case T_LONG: vpcmpgtq(dst, src1, src2, vlen_enc); break;
6865 case T_INT: vpcmpgtd(dst, src1, src2, vlen_enc); break;
6866 case T_SHORT: vpcmpgtw(dst, src1, src2, vlen_enc); break;
6867 case T_BYTE: vpcmpgtb(dst, src1, src2, vlen_enc); break;
6868 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6869 }
6870 }
6871
6872 void C2_MacroAssembler::evpmov_vec_to_mask(BasicType elem_bt, KRegister ktmp, XMMRegister src, XMMRegister xtmp1,
6873 XMMRegister xtmp2, int vlen_enc, bool xtmp2_hold_M1) {
6874 if (elem_bt == T_LONG) {
6875 evpmovq2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6876 } else {
6877 assert(elem_bt == T_INT, "");
6878 evpmovd2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6879 }
6880 }
6881
6882 void C2_MacroAssembler::vector_addsub_dq_saturating_evex(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6883 XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
6884 KRegister ktmp1, KRegister ktmp2, int vlen_enc) {
6885 assert(elem_bt == T_INT || elem_bt == T_LONG, "");
6886 // Addition/Subtraction happens over two's compliment representation of numbers and is agnostic to signed'ness.
6887 // Overflow detection based on Hacker's delight section 2-13.
6888 if (ideal_opc == Op_SaturatingAddV) {
6889 // res = src1 + src2
6890 vpadd(elem_bt, dst, src1, src2, vlen_enc);
6891 // Overflow occurs if result polarity does not comply with equivalent polarity inputs.
6892 // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
6893 vpxor(xtmp1, dst, src1, vlen_enc);
6894 vpxor(xtmp2, dst, src2, vlen_enc);
6895 vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6896 } else {
6897 assert(ideal_opc == Op_SaturatingSubV, "");
6898 // res = src1 - src2
6899 vpsub(elem_bt, dst, src1, src2, vlen_enc);
6900 // Overflow occurs when both inputs have opposite polarity and
6901 // result polarity does not comply with first input polarity.
6902 // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
6903 vpxor(xtmp1, src1, src2, vlen_enc);
6904 vpxor(xtmp2, dst, src1, vlen_enc);
6905 vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6906 }
6907
6908 // Compute overflow detection mask.
6909 evpmov_vec_to_mask(elem_bt, ktmp1, xtmp2, xtmp2, xtmp1, vlen_enc);
6910 // Note: xtmp1 hold -1 in all its lanes after above call.
6911
6912 // Compute mask based on first input polarity.
6913 evpmov_vec_to_mask(elem_bt, ktmp2, src1, xtmp2, xtmp1, vlen_enc, true);
6914
6915 vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
6916 vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);
6917
6918 // Compose a vector of saturating (MAX/MIN) values, where lanes corresponding to
6919 // set bits in first input polarity mask holds a min value.
6920 evpblend(elem_bt, xtmp2, ktmp2, xtmp2, xtmp1, true, vlen_enc);
6921 // Blend destination lanes with saturated values using overflow detection mask.
6922 evpblend(elem_bt, dst, ktmp1, dst, xtmp2, true, vlen_enc);
6923 }
6924
6925
6926 void C2_MacroAssembler::vector_addsub_dq_saturating_avx(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6927 XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
6928 XMMRegister xtmp3, XMMRegister xtmp4, int vlen_enc) {
6929 assert(elem_bt == T_INT || elem_bt == T_LONG, "");
6930 // Addition/Subtraction happens over two's compliment representation of numbers and is agnostic to signed'ness.
6931 // Overflow detection based on Hacker's delight section 2-13.
6932 if (ideal_opc == Op_SaturatingAddV) {
6933 // res = src1 + src2
6934 vpadd(elem_bt, dst, src1, src2, vlen_enc);
6935 // Overflow occurs if result polarity does not comply with equivalent polarity inputs.
6936 // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
6937 vpxor(xtmp1, dst, src1, vlen_enc);
6938 vpxor(xtmp2, dst, src2, vlen_enc);
6939 vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6940 } else {
6941 assert(ideal_opc == Op_SaturatingSubV, "");
6942 // res = src1 - src2
6943 vpsub(elem_bt, dst, src1, src2, vlen_enc);
6944 // Overflow occurs when both inputs have opposite polarity and
6945 // result polarity does not comply with first input polarity.
6946 // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
6947 vpxor(xtmp1, src1, src2, vlen_enc);
6948 vpxor(xtmp2, dst, src1, vlen_enc);
6949 vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6950 }
6951
6952 // Sign-extend to compute overflow detection mask.
6953 vpsign_extend_dq(elem_bt, xtmp3, xtmp2, vlen_enc);
6954
6955 vpcmpeqd(xtmp1, xtmp1, xtmp1, vlen_enc);
6956 vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc);
6957 vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);
6958
6959 // Compose saturating min/max vector using first input polarity mask.
6960 vpsign_extend_dq(elem_bt, xtmp4, src1, vlen_enc);
6961 vpblendvb(xtmp1, xtmp2, xtmp1, xtmp4, vlen_enc);
6962
6963 // Blend result with saturating vector using overflow detection mask.
6964 vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
6965 }
6966
6967 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
6968 switch(elem_bt) {
6969 case T_BYTE:
6970 if (ideal_opc == Op_SaturatingAddV) {
6971 vpaddsb(dst, src1, src2, vlen_enc);
6972 } else {
6973 assert(ideal_opc == Op_SaturatingSubV, "");
6974 vpsubsb(dst, src1, src2, vlen_enc);
6975 }
6976 break;
6977 case T_SHORT:
6978 if (ideal_opc == Op_SaturatingAddV) {
6979 vpaddsw(dst, src1, src2, vlen_enc);
6980 } else {
6981 assert(ideal_opc == Op_SaturatingSubV, "");
6982 vpsubsw(dst, src1, src2, vlen_enc);
6983 }
6984 break;
6985 default:
6986 fatal("Unsupported type %s", type2name(elem_bt));
6987 break;
6988 }
6989 }
6990
6991 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
6992 switch(elem_bt) {
6993 case T_BYTE:
6994 if (ideal_opc == Op_SaturatingAddV) {
6995 vpaddusb(dst, src1, src2, vlen_enc);
6996 } else {
6997 assert(ideal_opc == Op_SaturatingSubV, "");
6998 vpsubusb(dst, src1, src2, vlen_enc);
6999 }
7000 break;
7001 case T_SHORT:
7002 if (ideal_opc == Op_SaturatingAddV) {
7003 vpaddusw(dst, src1, src2, vlen_enc);
7004 } else {
7005 assert(ideal_opc == Op_SaturatingSubV, "");
7006 vpsubusw(dst, src1, src2, vlen_enc);
7007 }
7008 break;
7009 default:
7010 fatal("Unsupported type %s", type2name(elem_bt));
7011 break;
7012 }
7013 }
7014
7015 void C2_MacroAssembler::select_from_two_vectors_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
7016 XMMRegister src2, int vlen_enc) {
7017 switch(elem_bt) {
7018 case T_BYTE:
7019 evpermi2b(dst, src1, src2, vlen_enc);
7020 break;
7021 case T_SHORT:
7022 evpermi2w(dst, src1, src2, vlen_enc);
7023 break;
7024 case T_INT:
7025 evpermi2d(dst, src1, src2, vlen_enc);
7026 break;
7027 case T_LONG:
7028 evpermi2q(dst, src1, src2, vlen_enc);
7029 break;
7030 case T_FLOAT:
7031 evpermi2ps(dst, src1, src2, vlen_enc);
7032 break;
7033 case T_DOUBLE:
7034 evpermi2pd(dst, src1, src2, vlen_enc);
7035 break;
7036 default:
7037 fatal("Unsupported type %s", type2name(elem_bt));
7038 break;
7039 }
7040 }
7041
7042 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, bool is_unsigned, int vlen_enc) {
7043 if (is_unsigned) {
7044 vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7045 } else {
7046 vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7047 }
7048 }
7049
7050 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, bool is_unsigned, int vlen_enc) {
7051 if (is_unsigned) {
7052 vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7053 } else {
7054 vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7055 }
7056 }
7057
7058 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
7059 switch(opcode) {
7060 case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
7061 case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
7062 case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
7063 case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
7064 default: assert(false, "%s", NodeClassNames[opcode]); break;
7065 }
7066 }
7067
7068 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
7069 switch(opcode) {
7070 case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
7071 case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
7072 case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
7073 case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
7074 default: assert(false, "%s", NodeClassNames[opcode]); break;
7075 }
7076 }
7077
7078 void C2_MacroAssembler::sminmax_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
7079 KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2) {
7080 vminmax_fp16(opcode, dst, src1, src2, ktmp, xtmp1, xtmp2, Assembler::AVX_128bit);
7081 }
7082
7083 void C2_MacroAssembler::sminmax_fp16_avx10_2(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
7084 KRegister ktmp) {
7085 if (opcode == Op_MaxHF) {
7086 // dst = max(src1, src2)
7087 evminmaxsh(dst, ktmp, src1, src2, true, AVX10_2_MINMAX_MAX_COMPARE_SIGN);
7088 } else {
7089 assert(opcode == Op_MinHF, "");
7090 // dst = min(src1, src2)
7091 evminmaxsh(dst, ktmp, src1, src2, true, AVX10_2_MINMAX_MIN_COMPARE_SIGN);
7092 }
7093 }
7094
7095 void C2_MacroAssembler::vminmax_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
7096 KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
7097 if (opcode == Op_MaxVHF || opcode == Op_MaxHF) {
7098 // Move sign bits of src2 to mask register.
7099 evpmovw2m(ktmp, src2, vlen_enc);
7100 // xtmp1 = src2 < 0 ? src2 : src1
7101 evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
7102 // xtmp2 = src2 < 0 ? ? src1 : src2
7103 evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
7104 // Idea behind above swapping is to make seconds source operand a +ve value.
7105 // As per instruction semantic, if the values being compared are both 0.0s (of either sign), the value in
7106 // the second source operand is returned. If only one value is a NaN (SNaN or QNaN) for this instruction,
7107 // the second source operand, either a NaN or a valid floating-point value, is returned
7108 // dst = max(xtmp1, xtmp2)
7109 evmaxph(dst, xtmp1, xtmp2, vlen_enc);
7110 // isNaN = is_unordered_quiet(xtmp1)
7111 evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
7112 // Final result is same as first source if its a NaN value,
7113 // in case second operand holds a NaN value then as per above semantics
7114 // result is same as second operand.
7115 Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
7116 } else {
7117 assert(opcode == Op_MinVHF || opcode == Op_MinHF, "");
7118 // Move sign bits of src1 to mask register.
7119 evpmovw2m(ktmp, src1, vlen_enc);
7120 // xtmp1 = src1 < 0 ? src2 : src1
7121 evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
7122 // xtmp2 = src1 < 0 ? src1 : src2
7123 evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
7124 // Idea behind above swapping is to make seconds source operand a -ve value.
7125 // As per instruction semantics, if the values being compared are both 0.0s (of either sign), the value in
7126 // the second source operand is returned.
7127 // If only one value is a NaN (SNaN or QNaN) for this instruction, the second source operand, either a NaN
7128 // or a valid floating-point value, is written to the result.
7129 // dst = min(xtmp1, xtmp2)
7130 evminph(dst, xtmp1, xtmp2, vlen_enc);
7131 // isNaN = is_unordered_quiet(xtmp1)
7132 evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
7133 // Final result is same as first source if its a NaN value,
7134 // in case second operand holds a NaN value then as per above semantics
7135 // result is same as second operand.
7136 Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
7137 }
7138 }
7139
7140 void C2_MacroAssembler::vminmax_fp16_avx10_2(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
7141 KRegister ktmp, int vlen_enc) {
7142 if (opcode == Op_MaxVHF) {
7143 // dst = max(src1, src2)
7144 evminmaxph(dst, ktmp, src1, src2, true, AVX10_2_MINMAX_MAX_COMPARE_SIGN, vlen_enc);
7145 } else {
7146 assert(opcode == Op_MinVHF, "");
7147 // dst = min(src1, src2)
7148 evminmaxph(dst, ktmp, src1, src2, true, AVX10_2_MINMAX_MIN_COMPARE_SIGN, vlen_enc);
7149 }
7150 }
7151
7152 void C2_MacroAssembler::vminmax_fp16_avx10_2(int opcode, XMMRegister dst, XMMRegister src1, Address src2,
7153 KRegister ktmp, int vlen_enc) {
7154 if (opcode == Op_MaxVHF) {
7155 // dst = max(src1, src2)
7156 evminmaxph(dst, ktmp, src1, src2, true, AVX10_2_MINMAX_MAX_COMPARE_SIGN, vlen_enc);
7157 } else {
7158 assert(opcode == Op_MinVHF, "");
7159 // dst = min(src1, src2)
7160 evminmaxph(dst, ktmp, src1, src2, true, AVX10_2_MINMAX_MIN_COMPARE_SIGN, vlen_enc);
7161 }
7162 }
7163
7164 int C2_MacroAssembler::vector_iota_entry_index(BasicType bt) {
7165 // The vector iota entries array is ordered by type B/S/I/L/F/D, and
7166 // the offset between two types is 16.
7167 switch(bt) {
7168 case T_BYTE:
7169 return 0;
7170 case T_SHORT:
7171 return 1;
7172 case T_INT:
7173 return 2;
7174 case T_LONG:
7175 return 3;
7176 case T_FLOAT:
7177 return 4;
7178 case T_DOUBLE:
7179 return 5;
7180 default:
7181 ShouldNotReachHere();
7182 }
7183 }