1 /*
2 * Copyright (c) 2020, 2026, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25 #include "asm/assembler.hpp"
26 #include "asm/assembler.inline.hpp"
27 #include "gc/shared/barrierSet.hpp"
28 #include "gc/shared/barrierSetAssembler.hpp"
29 #include "oops/methodData.hpp"
30 #include "opto/c2_MacroAssembler.hpp"
31 #include "opto/intrinsicnode.hpp"
32 #include "opto/output.hpp"
33 #include "opto/opcodes.hpp"
34 #include "opto/subnode.hpp"
35 #include "runtime/globals.hpp"
36 #include "runtime/objectMonitor.hpp"
37 #include "runtime/objectMonitorTable.hpp"
38 #include "runtime/stubRoutines.hpp"
39 #include "runtime/synchronizer.hpp"
40 #include "utilities/checkedCast.hpp"
41 #include "utilities/globalDefinitions.hpp"
42 #include "utilities/powerOfTwo.hpp"
43 #include "utilities/sizes.hpp"
44
// In product builds BLOCK_COMMENT compiles away; in debug builds it records
// the string in the generated code's comment stream for disassembly.
#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
// NOTE(review): STOP expands to two statements; callers must brace-guard it
// when it is the sole body of an if/else.
#define STOP(error) block_comment(error); stop(error)
#endif
52
53 // C2 compiled method's prolog code.
void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {
  // Emit the C2 method prolog: stack-bang overflow check (when requested),
  // save of rbp, allocation of the fixed frame, optional stack-depth
  // verification, and the nmethod entry barrier for non-stub methods.
  // On entry, framesize includes the return-address and saved-rbp slots.
  // NOTE(review): fp_mode_24b is not referenced in this body.
  assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");

  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  // Remove word for return addr
  framesize -= wordSize;
  stack_bang_size -= wordSize;

  // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers must bang for them. But be careful, because
  // some VM calls (such as call site linkage) can use several kilobytes of
  // stack. But the stack safety zone should account for that.
  // See bugs 4446381, 4468289, 4497237.
  if (stack_bang_size > 0) {
    generate_stack_overflow_check(stack_bang_size);

    // We always push rbp, so that on return to interpreter rbp, will be
    // restored correctly and we can correct the stack.
    push(rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      mov(rbp, rsp);
    }
    // Remove word for ebp
    framesize -= wordSize;

    // Create frame
    if (framesize) {
      subptr(rsp, framesize);
    }
  } else {
    // No stack bang requested: allocate the whole frame first, then store
    // rbp into its slot at the top of the new frame.
    subptr(rsp, framesize);

    // Save RBP register now.
    framesize -= wordSize;
    movptr(Address(rsp, framesize), rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      movptr(rbp, rsp);
      if (framesize > 0) {
        addptr(rbp, framesize);
      }
    }
  }

  if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
    framesize -= wordSize;
    movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
  }

#ifdef ASSERT
  if (VerifyStackAtCalls) {
    // Verify rsp % StackAlignmentInBytes == StackAlignmentInBytes - wordSize
    // (i.e. aligned modulo the pushed scratch rax).
    Label L;
    push(rax);
    mov(rax, rsp);
    andptr(rax, StackAlignmentInBytes-1);
    cmpptr(rax, StackAlignmentInBytes-wordSize);
    pop(rax);
    jcc(Assembler::equal, L);
    STOP("Stack is not properly aligned!");
    bind(L);
  }
#endif

  if (!is_stub) {
    BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
    // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
    Label dummy_slow_path;
    Label dummy_continuation;
    Label* slow_path = &dummy_slow_path;
    Label* continuation = &dummy_continuation;
    if (!Compile::current()->output()->in_scratch_emit_size()) {
      // Use real labels from actual stub when not emitting code for the purpose of measuring its size
      C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
      Compile::current()->output()->add_stub(stub);
      slow_path = &stub->entry();
      continuation = &stub->continuation();
    }
    bs->nmethod_entry_barrier(this, slow_path, continuation);
  }
}
135
136 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
137 switch (vlen_in_bytes) {
138 case 4: // fall-through
139 case 8: // fall-through
140 case 16: return Assembler::AVX_128bit;
141 case 32: return Assembler::AVX_256bit;
142 case 64: return Assembler::AVX_512bit;
143
144 default: {
145 ShouldNotReachHere();
146 return Assembler::AVX_NoVec;
147 }
148 }
149 }
150
151 // fast_lock and fast_unlock used by C2
152
153 // Because the transitions from emitted code to the runtime
154 // monitorenter/exit helper stubs are so slow it's critical that
155 // we inline both the stack-locking fast path and the inflated fast path.
156 //
157 // See also: cmpFastLock and cmpFastUnlock.
158 //
159 // What follows is a specialized inline transliteration of the code
160 // in enter() and exit(). If we're concerned about I$ bloat another
161 // option would be to emit TrySlowEnter and TrySlowExit methods
162 // at startup-time. These methods would accept arguments as
163 // (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
164 // indications in the icc.ZFlag. fast_lock and fast_unlock would simply
165 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
166 // In practice, however, the # of lock sites is bounded and is usually small.
167 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
168 // if the processor uses simple bimodal branch predictors keyed by EIP
169 // Since the helper routines would be called from multiple synchronization
170 // sites.
171 //
172 // An even better approach would be write "MonitorEnter()" and "MonitorExit()"
173 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
174 // to those specialized methods. That'd give us a mostly platform-independent
175 // implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
177 // to park() or unpark() threads. We'd also need a few more unsafe operators
178 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
179 // (b) explicit barriers or fence operations.
180 //
181 // TODO:
182 //
183 // * Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
184 // This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
185 // Given TLAB allocation, Self is usually manifested in a register, so passing it into
186 // the lock operators would typically be faster than reifying Self.
187 //
188 // * Ideally I'd define the primitives as:
189 // fast_lock (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
190 // fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
191 // Unfortunately ADLC bugs prevent us from expressing the ideal form.
// Instead, we're stuck with rather awkward and brittle register assignments below.
193 // Furthermore the register assignments are overconstrained, possibly resulting in
194 // sub-optimal code near the synchronization site.
195 //
196 // * Eliminate the sp-proximity tests and just use "== Self" tests instead.
197 // Alternately, use a better sp-proximity test.
198 //
199 // * Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
200 // Either one is sufficient to uniquely identify a thread.
201 // TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
202 //
203 // * Intrinsify notify() and notifyAll() for the common cases where the
204 // object is locked by the calling thread but the waitlist is empty.
//   Avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
206 //
207 // * use jccb and jmpb instead of jcc and jmp to improve code density.
208 // But beware of excessive branch density on AMD Opterons.
209 //
210 // * Both fast_lock and fast_unlock set the ICC.ZF to indicate success
211 // or failure of the fast path. If the fast path fails then we pass
212 // control to the slow path, typically in C. In fast_lock and
213 // fast_unlock we often branch to DONE_LABEL, just to find that C2
214 // will emit a conditional branch immediately after the node.
215 // So we have branches to branches and lots of ICC.ZF games.
216 // Instead, it might be better to have C2 pass a "FailureLabel"
217 // into fast_lock and fast_unlock. In the case of success, control
218 // will drop through the node. ICC.ZF is undefined at exit.
219 // In the case of failure, the node will branch directly to the
220 // FailureLabel
221
222 // obj: object to lock
223 // box: on-stack box address -- KILLED
224 // rax: tmp -- KILLED
225 // t : tmp -- KILLED
void C2_MacroAssembler::fast_lock(Register obj, Register box, Register rax_reg,
                                  Register t, Register thread) {
  // Lightweight-locking fast path. On exit ZF == 1 means the lock was
  // acquired (either pushed on the lock-stack or the inflated monitor's
  // owner was claimed); ZF == 0 means the caller must take the slow path.
  // box, rax_reg and t are clobbered.
  assert(rax_reg == rax, "Used for CAS");
  assert_different_registers(obj, box, rax_reg, t, thread);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. ZF value is irrelevant.
  Label locked;
  // Finish fast lock unsuccessfully. MUST jump with ZF == 0
  Label slow_path;

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds or we need to take the slow-path.
    movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0);
  }

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    // Value-based classes must never be synchronized on; divert to the slow
    // path, which reports the diagnostic.
    load_klass(rax_reg, obj, t);
    testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
    jcc(Assembler::notZero, slow_path);
  }

  const Register mark = t;

  { // Fast Lock

    Label push;

    const Register top = UseObjectMonitorTable ? rax_reg : box;

    // Load the mark.
    movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Prefetch top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    // Check for monitor (0b10).
    testptr(mark, markWord::monitor_value);
    jcc(Assembler::notZero, inflated);

    // Check if lock-stack is full.
    cmpl(top, LockStack::end_offset() - 1);
    jcc(Assembler::greater, slow_path);

    // Check if recursive.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    jccb(Assembler::equal, push);

    // Try to lock. Transition lock bits 0b01 => 0b00
    // rax_reg = expected mark (unlocked bit set), mark = desired (bit cleared).
    movptr(rax_reg, mark);
    orptr(rax_reg, markWord::unlocked_value);
    andptr(mark, ~(int32_t)markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    jcc(Assembler::notEqual, slow_path);

    if (UseObjectMonitorTable) {
      // Need to reload top, clobbered by CAS.
      movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
    }
    bind(push);
    // After successful lock, push object on lock-stack.
    movptr(Address(thread, top), obj);
    addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
    jmp(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    const Register monitor = t;

    if (!UseObjectMonitorTable) {
      // With a tagged mark word the monitor pointer is the mark itself.
      assert(mark == monitor, "should be the same here");
    } else {
      const Register hash = t;
      Label monitor_found;

      // Look for the monitor in the om_cache.

      ByteSize cache_offset = JavaThread::om_cache_oops_offset();
      ByteSize monitor_offset = OMCache::oop_to_monitor_difference();
      const int num_unrolled = OMCache::CAPACITY;
      for (int i = 0; i < num_unrolled; i++) {
        movptr(monitor, Address(thread, cache_offset + monitor_offset));
        cmpptr(obj, Address(thread, cache_offset));
        jccb(Assembler::equal, monitor_found);
        cache_offset = cache_offset + OMCache::oop_to_oop_difference();
      }

      // Look for the monitor in the table.

      // Get the hash code.
      movptr(hash, Address(obj, oopDesc::mark_offset_in_bytes()));
      shrq(hash, markWord::hash_shift);
      andq(hash, markWord::hash_mask);

      // Get the table and calculate the bucket's address.
      lea(rax_reg, ExternalAddress(ObjectMonitorTable::current_table_address()));
      movptr(rax_reg, Address(rax_reg));
      andq(hash, Address(rax_reg, ObjectMonitorTable::table_capacity_mask_offset()));
      movptr(rax_reg, Address(rax_reg, ObjectMonitorTable::table_buckets_offset()));

      // Read the monitor from the bucket.
      movptr(monitor, Address(rax_reg, hash, Address::times_ptr));

      // Check if the monitor in the bucket is special (empty, tombstone or removed)
      cmpptr(monitor, ObjectMonitorTable::SpecialPointerValues::below_is_special);
      jcc(Assembler::below, slow_path);

      // Check if object matches.
      movptr(rax_reg, Address(monitor, ObjectMonitor::object_offset()));
      BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
      bs_asm->try_resolve_weak_handle_in_c2(this, rax_reg, slow_path);
      cmpptr(rax_reg, obj);
      jcc(Assembler::notEqual, slow_path);

      bind(monitor_found);
    }
    // Monitor fields are addressed relative to the tagged pointer when the
    // mark word carries the monitor, untagged when it came from the table.
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag);
    const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag);

    Label monitor_locked;
    // Lock the monitor.

    if (UseObjectMonitorTable) {
      // Cache the monitor for unlock before trashing box. On failure to acquire
      // the lock, the slow path will reset the entry accordingly (see CacheSetter).
      movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor);
    }

    // Try to CAS owner (no owner => current thread's _monitor_owner_id).
    xorptr(rax_reg, rax_reg);
    movptr(box, Address(thread, JavaThread::monitor_owner_id_offset()));
    lock(); cmpxchgptr(box, owner_address);
    jccb(Assembler::equal, monitor_locked);

    // Check if recursive.
    cmpptr(box, rax_reg);
    jccb(Assembler::notEqual, slow_path);

    // Recursive.
    increment(recursions_address);

    bind(monitor_locked);
  }

  bind(locked);
  // Set ZF = 1
  xorl(rax_reg, rax_reg);

#ifdef ASSERT
  // Check that locked label is reached with ZF set.
  Label zf_correct;
  Label zf_bad_zero;
  jcc(Assembler::zero, zf_correct);
  jmp(zf_bad_zero);
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with ZF not set.
  jcc(Assembler::notZero, zf_correct);
  stop("Fast Lock ZF != 0");
  bind(zf_bad_zero);
  stop("Fast Lock ZF != 1");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}
397
398 // obj: object to lock
399 // rax: tmp -- KILLED
400 // t : tmp - cannot be obj nor rax -- KILLED
401 //
402 // Some commentary on balanced locking:
403 //
404 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
405 // Methods that don't have provably balanced locking are forced to run in the
406 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
407 // The interpreter provides two properties:
408 // I1: At return-time the interpreter automatically and quietly unlocks any
409 // objects acquired in the current activation (frame). Recall that the
410 // interpreter maintains an on-stack list of locks currently held by
411 // a frame.
412 // I2: If a method attempts to unlock an object that is not held by the
413 // frame the interpreter throws IMSX.
414 //
415 // Lets say A(), which has provably balanced locking, acquires O and then calls B().
416 // B() doesn't have provably balanced locking so it runs in the interpreter.
417 // Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
418 // is still locked by A().
419 //
420 // The only other source of unbalanced locking would be JNI. The "Java Native Interface
421 // Specification" states that an object locked by JNI's MonitorEnter should not be
422 // unlocked by "normal" java-level locking and vice-versa. The specification doesn't
423 // specify what will occur if a program engages in such mixed-mode locking, however.
424 // Arguably given that the spec legislates the JNI case as undefined our implementation
425 // could reasonably *avoid* checking owner in fast_unlock().
426 // In the interest of performance we elide m->Owner==Self check in unlock.
427 // A perfectly viable alternative is to elide the owner check except when
428 // Xcheck:jni is enabled.
429
void C2_MacroAssembler::fast_unlock(Register obj, Register reg_rax, Register t, Register thread) {
  // Lightweight-unlocking fast path. On exit ZF == 1 means the lock was
  // released; ZF == 0 means the caller must take the slow path (also reached
  // via the out-of-line C2FastUnlockStub's continuation). reg_rax and t are
  // clobbered.
  assert(reg_rax == rax, "Used for CAS");
  assert_different_registers(obj, reg_rax, t);

  // Handle inflated monitor.
  Label inflated, inflated_check_lock_stack;
  // Finish fast unlock successfully. MUST jump with ZF == 1
  Label unlocked, slow_path;

  const Register mark = t;
  const Register monitor = t;
  const Register top = UseObjectMonitorTable ? t : reg_rax;
  const Register box = reg_rax;

  Label dummy;
  C2FastUnlockStub* stub = nullptr;

  if (!Compile::current()->output()->in_scratch_emit_size()) {
    // Real stub labels only when actually emitting code (not measuring size).
    stub = new (Compile::current()->comp_arena()) C2FastUnlockStub(obj, mark, reg_rax, thread);
    Compile::current()->output()->add_stub(stub);
  }

  Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();

  { // Fast Unlock

    // Load top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    if (!UseObjectMonitorTable) {
      // Prefetch mark.
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }

    // Check if obj is top of lock-stack.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    // Top of lock stack was not obj. Must be monitor.
    jcc(Assembler::notEqual, inflated_check_lock_stack);

    // Pop lock-stack.
    DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
    subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);

    // Check if recursive.
    cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
    jcc(Assembler::equal, unlocked);

    // We elide the monitor check, let the CAS fail instead.

    if (UseObjectMonitorTable) {
      // Load mark.
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }

    // Try to unlock. Transition lock bits 0b00 => 0b01
    // reg_rax = expected mark (lock bits cleared), mark = desired (unlocked).
    movptr(reg_rax, mark);
    andptr(reg_rax, ~(int32_t)markWord::lock_mask);
    orptr(mark, markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    // On CAS failure the stub re-pushes obj on the lock-stack and goes slow.
    jcc(Assembler::notEqual, push_and_slow_path);
    jmp(unlocked);
  }


  { // Handle inflated monitor.
    bind(inflated_check_lock_stack);
#ifdef ASSERT
    // Debug-only: verify obj is nowhere on the lock-stack and the mark word
    // really carries a monitor.
    Label check_done;
    subl(top, oopSize);
    cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
    jcc(Assembler::below, check_done);
    cmpptr(obj, Address(thread, top));
    jcc(Assembler::notEqual, inflated_check_lock_stack);
    stop("Fast Unlock lock on stack");
    bind(check_done);
    if (UseObjectMonitorTable) {
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }
    testptr(mark, markWord::monitor_value);
    jcc(Assembler::notZero, inflated);
    stop("Fast Unlock not monitor");
#endif

    bind(inflated);

    if (!UseObjectMonitorTable) {
      assert(mark == monitor, "should be the same here");
    } else {
      // Uses ObjectMonitorTable. Look for the monitor in our BasicLock on the stack.
      movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
      // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*)
      cmpptr(monitor, alignof(ObjectMonitor*));
      jcc(Assembler::below, slow_path);
    }
    // Monitor fields are addressed relative to the tagged pointer when the
    // mark word carries the monitor, untagged when it came from the cache.
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag};
    const Address succ_address{monitor, ObjectMonitor::succ_offset() - monitor_tag};
    const Address entry_list_address{monitor, ObjectMonitor::entry_list_offset() - monitor_tag};
    const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag};

    Label recursive;

    // Check if recursive.
    cmpptr(recursions_address, 0);
    jcc(Assembler::notZero, recursive);

    // Set owner to null.
    // Release to satisfy the JMM
    movptr(owner_address, NULL_WORD);
    // We need a full fence after clearing owner to avoid stranding.
    // StoreLoad achieves this.
    membar(StoreLoad);

    // Check if the entry_list is empty.
    cmpptr(entry_list_address, NULL_WORD);
    jcc(Assembler::zero, unlocked); // If so we are done.

    // Check if there is a successor.
    cmpptr(succ_address, NULL_WORD);
    jcc(Assembler::notZero, unlocked); // If so we are done.

    // Save the monitor pointer in the current thread, so we can try to
    // reacquire the lock in SharedRuntime::monitor_exit_helper().
    if (!UseObjectMonitorTable) {
      andptr(monitor, ~(int32_t)markWord::monitor_value);
    }
    movptr(Address(thread, JavaThread::unlocked_inflated_monitor_offset()), monitor);

    orl(t, 1); // Fast Unlock ZF = 0
    jmpb(slow_path);

    // Recursive unlock.
    bind(recursive);
    decrement(recursions_address);
  }

  bind(unlocked);
  xorl(t, t); // Fast Unlock ZF = 1

#ifdef ASSERT
  // Check that unlocked label is reached with ZF set.
  Label zf_correct;
  Label zf_bad_zero;
  jcc(Assembler::zero, zf_correct);
  jmp(zf_bad_zero);
#endif

  bind(slow_path);
  if (stub != nullptr) {
    bind(stub->slow_path_continuation());
  }
#ifdef ASSERT
  // Check that stub->continuation() label is reached with ZF not set.
  jcc(Assembler::notZero, zf_correct);
  stop("Fast Unlock ZF != 0");
  bind(zf_bad_zero);
  stop("Fast Unlock ZF != 1");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}
591
// Runtime target of the CastII debug check: report the violating node index,
// value and range, then abort the VM.
static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) {
  fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi);
}
595
596 static void reconstruct_frame_pointer_helper(MacroAssembler* masm, Register dst) {
597 const int framesize = Compile::current()->output()->frame_size_in_bytes();
598 masm->movptr(dst, rsp);
599 if (framesize > 2 * wordSize) {
600 masm->addptr(dst, framesize - 2 * wordSize);
601 }
602 }
603
604 void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) {
605 if (PreserveFramePointer) {
606 // frame pointer is valid
607 #ifdef ASSERT
608 // Verify frame pointer value in rbp.
609 reconstruct_frame_pointer_helper(this, rtmp);
610 Label L_success;
611 cmpq(rbp, rtmp);
612 jccb(Assembler::equal, L_success);
613 STOP("frame pointer mismatch");
614 bind(L_success);
615 #endif // ASSERT
616 } else {
617 reconstruct_frame_pointer_helper(this, rbp);
618 }
619 }
620
void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register val) {
  // Debug check emitted for CastII: verify at runtime that val lies within
  // [t->_lo, t->_hi]; on violation, call abort_verify_int_in_range with the
  // node index and bounds, then halt. Emits nothing for the full int range.
  jint lo = t->_lo;
  jint hi = t->_hi;
  assert(lo < hi, "type should not be empty or constant, idx: %u, lo: %d, hi: %d", idx, lo, hi);
  if (t == TypeInt::INT) {
    return;
  }

  BLOCK_COMMENT("CastII {");
  Label fail;
  Label succeed;

  // Only emit the compares for bounds that actually constrain the value.
  if (lo != min_jint) {
    cmpl(val, lo);
    jccb(Assembler::less, fail);
  }
  if (hi != max_jint) {
    cmpl(val, hi);
    jccb(Assembler::greater, fail);
  }
  jmpb(succeed);

  bind(fail);
  // NOTE(review): writing idx into c_rarg0 before copying val assumes val is
  // not allocated to c_rarg0 at this point -- confirm against the node's
  // register constraints in the .ad file.
  movl(c_rarg0, idx);
  movl(c_rarg1, val);
  movl(c_rarg2, lo);
  movl(c_rarg3, hi);
  reconstruct_frame_pointer(rscratch1);
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_int_in_range)));
  hlt();
  bind(succeed);
  BLOCK_COMMENT("} // CastII");
}
654
// Runtime target of the CastLL debug check: report the violating node index,
// value and range, then abort the VM.
static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) {
  fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi);
}
658
void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register val, Register tmp) {
  // Debug check emitted for CastLL: verify at runtime that val lies within
  // [t->_lo, t->_hi]; on violation, call abort_verify_long_in_range with the
  // node index and bounds, then halt. Emits nothing for the full long range.
  // tmp is only used to materialize bounds that do not fit a 32-bit immediate.
  jlong lo = t->_lo;
  jlong hi = t->_hi;
  assert(lo < hi, "type should not be empty or constant, idx: %u, lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, lo, hi);
  if (t == TypeLong::LONG) {
    return;
  }

  BLOCK_COMMENT("CastLL {");
  Label fail;
  Label succeed;

  // Compare val against a 64-bit bound, going through tmp when the bound is
  // not representable as a sign-extended 32-bit immediate.
  auto cmp_val = [&](jlong bound) {
    if (is_simm32(bound)) {
      cmpq(val, checked_cast<int>(bound));
    } else {
      mov64(tmp, bound);
      cmpq(val, tmp);
    }
  };

  // Only emit the compares for bounds that actually constrain the value.
  if (lo != min_jlong) {
    cmp_val(lo);
    jccb(Assembler::less, fail);
  }
  if (hi != max_jlong) {
    cmp_val(hi);
    jccb(Assembler::greater, fail);
  }
  jmpb(succeed);

  bind(fail);
  // NOTE(review): loading idx into c_rarg0 before copying val assumes val is
  // not allocated to c_rarg0 here -- confirm against the node's register
  // constraints in the .ad file.
  movl(c_rarg0, idx);
  movq(c_rarg1, val);
  mov64(c_rarg2, lo);
  mov64(c_rarg3, hi);
  reconstruct_frame_pointer(rscratch1);
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_long_in_range)));
  hlt();
  bind(succeed);
  BLOCK_COMMENT("} // CastLL");
}
701
702 //-------------------------------------------------------------------------------------------
703 // Generic instructions support for use in .ad files C2 code generation
704
705 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
706 if (dst != src) {
707 movdqu(dst, src);
708 }
709 if (opcode == Op_AbsVD) {
710 andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
711 } else {
712 assert((opcode == Op_NegVD),"opcode should be Op_NegD");
713 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
714 }
715 }
716
717 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
718 if (opcode == Op_AbsVD) {
719 vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
720 } else {
721 assert((opcode == Op_NegVD),"opcode should be Op_NegD");
722 vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
723 }
724 }
725
726 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
727 if (dst != src) {
728 movdqu(dst, src);
729 }
730 if (opcode == Op_AbsVF) {
731 andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
732 } else {
733 assert((opcode == Op_NegVF),"opcode should be Op_NegF");
734 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
735 }
736 }
737
738 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
739 if (opcode == Op_AbsVF) {
740 vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
741 } else {
742 assert((opcode == Op_NegVF),"opcode should be Op_NegF");
743 vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
744 }
745 }
746
747 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
748 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
749 assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
750
751 if (opcode == Op_MinV) {
752 if (elem_bt == T_BYTE) {
753 pminsb(dst, src);
754 } else if (elem_bt == T_SHORT) {
755 pminsw(dst, src);
756 } else if (elem_bt == T_INT) {
757 pminsd(dst, src);
758 } else {
759 assert(elem_bt == T_LONG, "required");
760 assert(tmp == xmm0, "required");
761 assert_different_registers(dst, src, tmp);
762 movdqu(xmm0, dst);
763 pcmpgtq(xmm0, src);
764 blendvpd(dst, src); // xmm0 as mask
765 }
766 } else { // opcode == Op_MaxV
767 if (elem_bt == T_BYTE) {
768 pmaxsb(dst, src);
769 } else if (elem_bt == T_SHORT) {
770 pmaxsw(dst, src);
771 } else if (elem_bt == T_INT) {
772 pmaxsd(dst, src);
773 } else {
774 assert(elem_bt == T_LONG, "required");
775 assert(tmp == xmm0, "required");
776 assert_different_registers(dst, src, tmp);
777 movdqu(xmm0, src);
778 pcmpgtq(xmm0, dst);
779 blendvpd(dst, src); // xmm0 as mask
780 }
781 }
782 }
783
784 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
785 XMMRegister src1, Address src2, int vlen_enc) {
786 assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
787 if (opcode == Op_UMinV) {
788 switch(elem_bt) {
789 case T_BYTE: vpminub(dst, src1, src2, vlen_enc); break;
790 case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
791 case T_INT: vpminud(dst, src1, src2, vlen_enc); break;
792 case T_LONG: evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
793 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
794 }
795 } else {
796 assert(opcode == Op_UMaxV, "required");
797 switch(elem_bt) {
798 case T_BYTE: vpmaxub(dst, src1, src2, vlen_enc); break;
799 case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
800 case T_INT: vpmaxud(dst, src1, src2, vlen_enc); break;
801 case T_LONG: evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
802 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
803 }
804 }
805 }
806
void C2_MacroAssembler::vpuminmaxq(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  // Unsigned 64-bit element min/max. With EVEX but no AVX512VL, use the
  // native evpminuq/evpmaxuq at 512-bit width. Otherwise emulate the
  // unsigned compare by biasing both operands with 1 << 63 (mapping the
  // unsigned order onto the signed order) and blending on the signed compare.
  // For optimality, leverage a full vector width of 512 bits
  // for operations over smaller vector sizes on AVX512 targets.
  if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
    if (opcode == Op_UMaxV) {
      evpmaxuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
    } else {
      assert(opcode == Op_UMinV, "required");
      evpminuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
    }
  } else {
    // T1 = -1
    vpcmpeqq(xtmp1, xtmp1, xtmp1, vlen_enc);
    // T1 = -1 << 63
    vpsllq(xtmp1, xtmp1, 63, vlen_enc);
    // Convert SRC2 to signed value i.e. T2 = T1 + SRC2
    vpaddq(xtmp2, xtmp1, src2, vlen_enc);
    // Convert SRC1 to signed value i.e. T1 = T1 + SRC1
    vpaddq(xtmp1, xtmp1, src1, vlen_enc);
    // Mask = T2 > T1
    vpcmpgtq(xtmp1, xtmp2, xtmp1, vlen_enc);
    if (opcode == Op_UMaxV) {
      // Res = Mask ? Src2 : Src1
      vpblendvb(dst, src1, src2, xtmp1, vlen_enc);
    } else {
      // Res = Mask ? Src1 : Src2
      vpblendvb(dst, src2, src1, xtmp1, vlen_enc);
    }
  }
}
837
838 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
839 XMMRegister src1, XMMRegister src2, int vlen_enc) {
840 assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
841 if (opcode == Op_UMinV) {
842 switch(elem_bt) {
843 case T_BYTE: vpminub(dst, src1, src2, vlen_enc); break;
844 case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
845 case T_INT: vpminud(dst, src1, src2, vlen_enc); break;
846 case T_LONG: evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
847 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
848 }
849 } else {
850 assert(opcode == Op_UMaxV, "required");
851 switch(elem_bt) {
852 case T_BYTE: vpmaxub(dst, src1, src2, vlen_enc); break;
853 case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
854 case T_INT: vpmaxud(dst, src1, src2, vlen_enc); break;
855 case T_LONG: evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
856 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
857 }
858 }
859 }
860
861 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
862 XMMRegister dst, XMMRegister src1, XMMRegister src2,
863 int vlen_enc) {
864 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
865
866 if (opcode == Op_MinV) {
867 if (elem_bt == T_BYTE) {
868 vpminsb(dst, src1, src2, vlen_enc);
869 } else if (elem_bt == T_SHORT) {
870 vpminsw(dst, src1, src2, vlen_enc);
871 } else if (elem_bt == T_INT) {
872 vpminsd(dst, src1, src2, vlen_enc);
873 } else {
874 assert(elem_bt == T_LONG, "required");
875 if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
876 vpminsq(dst, src1, src2, vlen_enc);
877 } else {
878 assert_different_registers(dst, src1, src2);
879 vpcmpgtq(dst, src1, src2, vlen_enc);
880 vblendvpd(dst, src1, src2, dst, vlen_enc);
881 }
882 }
883 } else { // opcode == Op_MaxV
884 if (elem_bt == T_BYTE) {
885 vpmaxsb(dst, src1, src2, vlen_enc);
886 } else if (elem_bt == T_SHORT) {
887 vpmaxsw(dst, src1, src2, vlen_enc);
888 } else if (elem_bt == T_INT) {
889 vpmaxsd(dst, src1, src2, vlen_enc);
890 } else {
891 assert(elem_bt == T_LONG, "required");
892 if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
893 vpmaxsq(dst, src1, src2, vlen_enc);
894 } else {
895 assert_different_registers(dst, src1, src2);
896 vpcmpgtq(dst, src1, src2, vlen_enc);
897 vblendvpd(dst, src2, src1, dst, vlen_enc);
898 }
899 }
900 }
901 }
902
903 // Float/Double min max
904
// Java-compliant vector float/double min/max: dst = min/max(a, b).
// tmp, atmp and btmp are fully clobbered. See the in-body comment for why
// plain vminps/vmaxps is insufficient (-0.0/+0.0 ordering, NaN propagation).
void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
                                   XMMRegister dst, XMMRegister a, XMMRegister b,
                                   XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                   int vlen_enc) {
  assert(UseAVX > 0, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(a, tmp, atmp, btmp);
  assert_different_registers(b, tmp, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);

  /* Note on 'non-obvious' assembly sequence:
   *
   * While there are vminps/vmaxps instructions, there are two important differences between hardware
   * and Java on how they handle floats:
   *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
   *  b. NaN is not necesarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
   *
   * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
   *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
   *                (only useful when signs differ, noop otherwise)
   *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])

   * Following pseudo code describes the algorithm for max[FD] (Min algorithm is on similar lines):
   *   btmp = (b < +0.0) ? a : b
   *   atmp = (b < +0.0) ? b : a
   *   Tmp  = Max_Float(atmp , btmp)
   *   Res  = (atmp == NaN) ? atmp : Tmp
   */

  // Select instruction family for the four (element width x min/max) cases.
  // 'mask' is the operand whose sign drives the pre-blend: a for min, b for max.
  void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
  void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
  void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
  XMMRegister mask;

  if (!is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vminps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (!is_double_word && !is_min) {
    mask = b;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vmaxps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vminpd;
    vcmp = &MacroAssembler::vcmppd;
  } else {
    assert(is_double_word && !is_min, "sanity");
    mask = b;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vmaxpd;
    vcmp = &MacroAssembler::vcmppd;
  }

  // Make sure EnableX86ECoreOpts isn't disabled on register overlaps
  XMMRegister maxmin, scratch;
  if (dst == btmp) {
    maxmin = btmp;
    scratch = tmp;
  } else {
    maxmin = tmp;
    scratch = btmp;
  }

  // On E-core-optimized builds, replicate the sign bit across each lane up
  // front (arithmetic shift for floats, signed compare against zero for
  // doubles) so the blends can take a fully materialized mask.
  bool precompute_mask = EnableX86ECoreOpts && UseAVX>1;
  if (precompute_mask && !is_double_word) {
    vpsrad(tmp, mask, 32, vlen_enc);
    mask = tmp;
  } else if (precompute_mask && is_double_word) {
    vpxor(tmp, tmp, tmp, vlen_enc);
    vpcmpgtq(tmp, tmp, mask, vlen_enc);
    mask = tmp;
  }

  // Pre-blend operands by sign, do the min/max, then re-select the first
  // blended operand wherever it compared unordered (NaN propagation).
  (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
  (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
  (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
  (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
  (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
}
992
// Java-compliant vector float/double min/max using AVX-512 opmask registers:
//   dst = min/max(a, b); ktmp, atmp and btmp are clobbered.
// Mirrors vminmax_fp() above:
//   1. evpmov[d|q]2m captures the per-lane sign bit of the biasing operand
//      (a for min, b for max) into ktmp.
//   2. Two masked blends order the operands so vmin/vmax gives the
//      Java-correct answer for the -0.0 vs +0.0 case.
//   3. An UNORD compare plus a masked move re-selects atmp where it was NaN,
//      so NaN propagates to the result.
void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
                                    XMMRegister dst, XMMRegister a, XMMRegister b,
                                    KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
                                    int vlen_enc) {
  assert(UseAVX > 2, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(dst, a, atmp, btmp);
  assert_different_registers(dst, b, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);
  bool merge = true;

  if (!is_double_word && is_min) {
    evpmovd2m(ktmp, a, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vminps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (!is_double_word && !is_min) {
    evpmovd2m(ktmp, b, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (is_double_word && is_min) {
    evpmovq2m(ktmp, a, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vminpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    evpmovq2m(ktmp, b, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  }
}
1039
1040 void C2_MacroAssembler::vminmax_fp(int opc, BasicType elem_bt, XMMRegister dst, KRegister mask,
1041 XMMRegister src1, XMMRegister src2, int vlen_enc) {
1042 assert(opc == Op_MinV || opc == Op_MinReductionV ||
1043 opc == Op_MaxV || opc == Op_MaxReductionV, "sanity");
1044
1045 int imm8 = (opc == Op_MinV || opc == Op_MinReductionV) ? AVX10_2_MINMAX_MIN_COMPARE_SIGN
1046 : AVX10_2_MINMAX_MAX_COMPARE_SIGN;
1047 if (elem_bt == T_FLOAT) {
1048 evminmaxps(dst, mask, src1, src2, true, imm8, vlen_enc);
1049 } else {
1050 assert(elem_bt == T_DOUBLE, "");
1051 evminmaxpd(dst, mask, src1, src2, true, imm8, vlen_enc);
1052 }
1053 }
1054
1055 // Float/Double signum
// Scalar Math.signum for float/double. dst holds the argument on entry and
// the result on exit; 'zero' and 'one' hold the FP constants 0.0 and 1.0.
// Result: the argument itself for +0.0/-0.0/NaN, otherwise +/-1.0.
void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
  assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");

  Label DONE_LABEL;

  // Handle special cases +0.0/-0.0 and NaN, if argument is +0.0/-0.0 or NaN, return argument
  // If AVX10.2 (or newer) floating point comparison instructions used, SF=1 for equal and unordered cases
  // If other floating point comparison instructions used, ZF=1 for equal and unordered cases
  if (opcode == Op_SignumF) {
    if (VM_Version::supports_avx10_2()) {
      vucomxss(dst, zero);
      jcc(Assembler::negative, DONE_LABEL);
    } else {
      ucomiss(dst, zero);
      jcc(Assembler::equal, DONE_LABEL);
    }
    // Flags from the compare above are still live (mov does not touch flags):
    // 'above' means argument > 0.0, so the result stays +1.0.
    movflt(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    // Argument < 0.0: flip the sign bit of 1.0 to produce -1.0.
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
  } else if (opcode == Op_SignumD) {
    if (VM_Version::supports_avx10_2()) {
      vucomxsd(dst, zero);
      jcc(Assembler::negative, DONE_LABEL);
    } else {
      ucomisd(dst, zero);
      jcc(Assembler::equal, DONE_LABEL);
    }
    // Same pattern as the float case, on the double constants/tables.
    movdbl(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
  }

  bind(DONE_LABEL);
}
1090
1091 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1092 if (sign) {
1093 pmovsxbw(dst, src);
1094 } else {
1095 pmovzxbw(dst, src);
1096 }
1097 }
1098
1099 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1100 if (sign) {
1101 vpmovsxbw(dst, src, vector_len);
1102 } else {
1103 vpmovzxbw(dst, src, vector_len);
1104 }
1105 }
1106
1107 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1108 if (sign) {
1109 vpmovsxbd(dst, src, vector_len);
1110 } else {
1111 vpmovzxbd(dst, src, vector_len);
1112 }
1113 }
1114
1115 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1116 if (sign) {
1117 vpmovsxwd(dst, src, vector_len);
1118 } else {
1119 vpmovzxwd(dst, src, vector_len);
1120 }
1121 }
1122
1123 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1124 int shift, int vector_len) {
1125 if (opcode == Op_RotateLeftV) {
1126 if (etype == T_INT) {
1127 evprold(dst, src, shift, vector_len);
1128 } else {
1129 assert(etype == T_LONG, "expected type T_LONG");
1130 evprolq(dst, src, shift, vector_len);
1131 }
1132 } else {
1133 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1134 if (etype == T_INT) {
1135 evprord(dst, src, shift, vector_len);
1136 } else {
1137 assert(etype == T_LONG, "expected type T_LONG");
1138 evprorq(dst, src, shift, vector_len);
1139 }
1140 }
1141 }
1142
1143 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1144 XMMRegister shift, int vector_len) {
1145 if (opcode == Op_RotateLeftV) {
1146 if (etype == T_INT) {
1147 evprolvd(dst, src, shift, vector_len);
1148 } else {
1149 assert(etype == T_LONG, "expected type T_LONG");
1150 evprolvq(dst, src, shift, vector_len);
1151 }
1152 } else {
1153 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1154 if (etype == T_INT) {
1155 evprorvd(dst, src, shift, vector_len);
1156 } else {
1157 assert(etype == T_LONG, "expected type T_LONG");
1158 evprorvq(dst, src, shift, vector_len);
1159 }
1160 }
1161 }
1162
1163 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1164 if (opcode == Op_RShiftVI) {
1165 psrad(dst, shift);
1166 } else if (opcode == Op_LShiftVI) {
1167 pslld(dst, shift);
1168 } else {
1169 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1170 psrld(dst, shift);
1171 }
1172 }
1173
1174 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1175 switch (opcode) {
1176 case Op_RShiftVI: psrad(dst, shift); break;
1177 case Op_LShiftVI: pslld(dst, shift); break;
1178 case Op_URShiftVI: psrld(dst, shift); break;
1179
1180 default: assert(false, "%s", NodeClassNames[opcode]);
1181 }
1182 }
1183
1184 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1185 if (opcode == Op_RShiftVI) {
1186 vpsrad(dst, nds, shift, vector_len);
1187 } else if (opcode == Op_LShiftVI) {
1188 vpslld(dst, nds, shift, vector_len);
1189 } else {
1190 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1191 vpsrld(dst, nds, shift, vector_len);
1192 }
1193 }
1194
1195 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1196 switch (opcode) {
1197 case Op_RShiftVI: vpsrad(dst, src, shift, vlen_enc); break;
1198 case Op_LShiftVI: vpslld(dst, src, shift, vlen_enc); break;
1199 case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1200
1201 default: assert(false, "%s", NodeClassNames[opcode]);
1202 }
1203 }
1204
1205 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1206 switch (opcode) {
1207 case Op_RShiftVB: // fall-through
1208 case Op_RShiftVS: psraw(dst, shift); break;
1209
1210 case Op_LShiftVB: // fall-through
1211 case Op_LShiftVS: psllw(dst, shift); break;
1212
1213 case Op_URShiftVS: // fall-through
1214 case Op_URShiftVB: psrlw(dst, shift); break;
1215
1216 default: assert(false, "%s", NodeClassNames[opcode]);
1217 }
1218 }
1219
1220 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1221 switch (opcode) {
1222 case Op_RShiftVB: // fall-through
1223 case Op_RShiftVS: vpsraw(dst, src, shift, vlen_enc); break;
1224
1225 case Op_LShiftVB: // fall-through
1226 case Op_LShiftVS: vpsllw(dst, src, shift, vlen_enc); break;
1227
1228 case Op_URShiftVS: // fall-through
1229 case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1230
1231 default: assert(false, "%s", NodeClassNames[opcode]);
1232 }
1233 }
1234
// Quadword shift, destructive SSE form. Note: pre-AVX-512 hardware has no
// packed 64-bit arithmetic right shift (psraq), so Op_RShiftVL emits a
// logical shift here — presumably selection rules ensure callers tolerate
// this; compare varshiftq() below, which does a full sign fixup.
void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
    case Op_LShiftVL:  psllq(dst, shift); break;
    case Op_URShiftVL: psrlq(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}
1244
1245 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1246 if (opcode == Op_RShiftVL) {
1247 psrlq(dst, shift); // using srl to implement sra on pre-avs512 systems
1248 } else if (opcode == Op_LShiftVL) {
1249 psllq(dst, shift);
1250 } else {
1251 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1252 psrlq(dst, shift);
1253 }
1254 }
1255
1256 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1257 switch (opcode) {
1258 case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1259 case Op_LShiftVL: vpsllq(dst, src, shift, vlen_enc); break;
1260 case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1261
1262 default: assert(false, "%s", NodeClassNames[opcode]);
1263 }
1264 }
1265
1266 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1267 if (opcode == Op_RShiftVL) {
1268 evpsraq(dst, nds, shift, vector_len);
1269 } else if (opcode == Op_LShiftVL) {
1270 vpsllq(dst, nds, shift, vector_len);
1271 } else {
1272 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1273 vpsrlq(dst, nds, shift, vector_len);
1274 }
1275 }
1276
1277 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1278 switch (opcode) {
1279 case Op_RShiftVB: // fall-through
1280 case Op_RShiftVS: // fall-through
1281 case Op_RShiftVI: vpsravd(dst, src, shift, vlen_enc); break;
1282
1283 case Op_LShiftVB: // fall-through
1284 case Op_LShiftVS: // fall-through
1285 case Op_LShiftVI: vpsllvd(dst, src, shift, vlen_enc); break;
1286
1287 case Op_URShiftVB: // fall-through
1288 case Op_URShiftVS: // fall-through
1289 case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1290
1291 default: assert(false, "%s", NodeClassNames[opcode]);
1292 }
1293 }
1294
1295 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1296 switch (opcode) {
1297 case Op_RShiftVB: // fall-through
1298 case Op_RShiftVS: evpsravw(dst, src, shift, vlen_enc); break;
1299
1300 case Op_LShiftVB: // fall-through
1301 case Op_LShiftVS: evpsllvw(dst, src, shift, vlen_enc); break;
1302
1303 case Op_URShiftVB: // fall-through
1304 case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1305
1306 default: assert(false, "%s", NodeClassNames[opcode]);
1307 }
1308 }
1309
// Variable per-lane quadword shifts. AVX2 has no vpsravq, so the arithmetic
// right shift is emulated via the identity: with m = (0x8000...0 >>> s),
// ((x >>> s) ^ m) - m sign-extends the logically-shifted value.
// 'tmp' is only used on that AVX2 emulation path.
void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
  assert(UseAVX >= 2, "required");
  switch (opcode) {
    case Op_RShiftVL: {
      if (UseAVX > 2) {
        assert(tmp == xnoreg, "not used");
        if (!VM_Version::supports_avx512vl()) {
          // Without AVX512VL the EVEX form only exists at 512-bit width.
          vlen_enc = Assembler::AVX_512bit;
        }
        evpsravq(dst, src, shift, vlen_enc);
      } else {
        vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
        vpsrlvq(dst, src, shift, vlen_enc);   // dst = src >>> shift (logical)
        vpsrlvq(tmp, tmp, shift, vlen_enc);   // tmp = sign mask >>> shift
        vpxor(dst, dst, tmp, vlen_enc);       // flip the shifted-in sign position
        vpsubq(dst, dst, tmp, vlen_enc);      // subtract to complete sign extension
      }
      break;
    }
    case Op_LShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsllvq(dst, src, shift, vlen_enc);
      break;
    }
    case Op_URShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsrlvq(dst, src, shift, vlen_enc);
      break;
    }
    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}
1342
// Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst.
// 128-bit byte-vector variable shift: widen bytes to dwords (sign-extended
// for sra, zero-extended otherwise), shift as dwords, mask the results back
// to byte range, then pack the dwords down. The packed result occupies word
// lanes in dst.
void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
  assert(opcode == Op_LShiftVB ||
         opcode == Op_RShiftVB ||
         opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
  // Only arithmetic right shift needs the sign of the byte preserved.
  bool sign = (opcode != Op_URShiftVB);
  assert(vector_len == 0, "required");
  vextendbd(sign, dst, src, 1);
  vpmovzxbd(vtmp, shift, 1);           // shift counts are unsigned
  varshiftd(opcode, dst, dst, vtmp, 1);
  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
  // Fold the high 128 bits onto the low half and pack dwords to words.
  vextracti128_high(vtmp, dst);
  vpackusdw(dst, dst, vtmp, 0);
}
1357
// Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst.
// EVEX byte-vector variable shift: widen bytes to words at twice the input
// width, shift as words, mask back to byte range, then pack words to bytes
// (with a lane-fixup vpermq for the 512-bit case).
void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
  assert(opcode == Op_LShiftVB ||
         opcode == Op_RShiftVB ||
         opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
  // Only arithmetic right shift needs the sign of the byte preserved.
  bool sign = (opcode != Op_URShiftVB);
  // Widened intermediate needs the next vector width up.
  int ext_vector_len = vector_len + 1;
  vextendbw(sign, dst, src, ext_vector_len);
  vpmovzxbw(vtmp, shift, ext_vector_len);  // shift counts are unsigned
  varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
  if (vector_len == 0) {
    // 128-bit result: fold the high half and pack.
    vextracti128_high(vtmp, dst);
    vpackuswb(dst, dst, vtmp, vector_len);
  } else {
    // 256/512-bit result: pack, then permute quadwords because vpackuswb
    // interleaves per 128-bit lane.
    vextracti64x4_high(vtmp, dst);
    vpackuswb(dst, dst, vtmp, vector_len);
    vpermq(dst, dst, 0xD8, vector_len);
  }
}
1378
1379 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1380 switch(typ) {
1381 case T_BYTE:
1382 pinsrb(dst, val, idx);
1383 break;
1384 case T_SHORT:
1385 pinsrw(dst, val, idx);
1386 break;
1387 case T_INT:
1388 pinsrd(dst, val, idx);
1389 break;
1390 case T_LONG:
1391 pinsrq(dst, val, idx);
1392 break;
1393 default:
1394 assert(false,"Should not reach here.");
1395 break;
1396 }
1397 }
1398
1399 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1400 switch(typ) {
1401 case T_BYTE:
1402 vpinsrb(dst, src, val, idx);
1403 break;
1404 case T_SHORT:
1405 vpinsrw(dst, src, val, idx);
1406 break;
1407 case T_INT:
1408 vpinsrd(dst, src, val, idx);
1409 break;
1410 case T_LONG:
1411 vpinsrq(dst, src, val, idx);
1412 break;
1413 default:
1414 assert(false,"Should not reach here.");
1415 break;
1416 }
1417 }
1418
// Masked scalar gather of one 64-bit slice (4 shorts or 8 bytes) into the low
// quadword of dst. For each lane, the bit of 'mask' at position 'mask_idx'
// decides whether the element is loaded; unselected lanes remain zero.
// 'mask_idx' is advanced by one per lane; 'rtmp' holds the loaded index.
// Gather indices at 'idx_base' are 32-bit values, 4 bytes apart.
void C2_MacroAssembler::vgather8b_masked(BasicType elem_bt, XMMRegister dst,
                                         Register base, Register idx_base,
                                         Register mask, Register mask_idx,
                                         Register rtmp, int vlen_enc) {
  vpxor(dst, dst, dst, vlen_enc);  // zero dst so skipped lanes read as 0
  if (elem_bt == T_SHORT) {
    for (int i = 0; i < 4; i++) {
      // dst[i] = mask[i] ? src[idx_base[i]] : 0
      Label skip_load;
      btq(mask, mask_idx);                      // CF = mask bit for this lane
      jccb(Assembler::carryClear, skip_load);
      movl(rtmp, Address(idx_base, i * 4));
      pinsrw(dst, Address(base, rtmp, Address::times_2), i);
      bind(skip_load);
      incq(mask_idx);
    }
  } else {
    assert(elem_bt == T_BYTE, "");
    for (int i = 0; i < 8; i++) {
      // dst[i] = mask[i] ? src[idx_base[i]] : 0
      Label skip_load;
      btq(mask, mask_idx);
      jccb(Assembler::carryClear, skip_load);
      movl(rtmp, Address(idx_base, i * 4));
      pinsrb(dst, Address(base, rtmp), i);
      bind(skip_load);
      incq(mask_idx);
    }
  }
}
1449
1450 void C2_MacroAssembler::vgather8b(BasicType elem_bt, XMMRegister dst,
1451 Register base, Register idx_base,
1452 Register rtmp, int vlen_enc) {
1453 vpxor(dst, dst, dst, vlen_enc);
1454 if (elem_bt == T_SHORT) {
1455 for (int i = 0; i < 4; i++) {
1456 // dst[i] = src[idx_base[i]]
1457 movl(rtmp, Address(idx_base, i * 4));
1458 pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1459 }
1460 } else {
1461 assert(elem_bt == T_BYTE, "");
1462 for (int i = 0; i < 8; i++) {
1463 // dst[i] = src[idx_base[i]]
1464 movl(rtmp, Address(idx_base, i * 4));
1465 pinsrb(dst, Address(base, rtmp), i);
1466 }
1467 }
1468 }
1469
1470 /*
1471 * Gather using hybrid algorithm, first partially unroll scalar loop
1472 * to accumulate values from gather indices into a quad-word(64bit) slice.
1473 * A slice may hold 8 bytes or 4 short values. This is followed by a vector
1474 * permutation to place the slice into appropriate vector lane
1475 * locations in destination vector. Following pseudo code describes the
1476 * algorithm in detail:
1477 *
1478 * DST_VEC = ZERO_VEC
1479 * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..}
1480 * TWO_VEC = {2, 2, 2, 2, 2, 2, 2, 2, 2..}
1481 * FOREACH_ITER:
1482 * TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES
1483 * TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX
1484 * DST_VEC = DST_VEC OR TEMP_PERM_VEC
1485 * PERM_INDEX = PERM_INDEX - TWO_VEC
1486 *
 * With each iteration, the doubleword permute indices (0,1) corresponding
 * to the gathered quadword get right-shifted by two lane positions.
1489 *
1490 */
// Hybrid subword gather; see the algorithm description in the block comment
// above. Gathers 'vector_len' byte/short elements from 'base' using dword
// indices at 'idx_base', optionally under 'mask' (noreg = unmasked).
// xtmp1/xtmp2/temp_dst/rtmp/mask_idx/length are all clobbered.
void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst,
                                        Register base, Register idx_base,
                                        Register mask, XMMRegister xtmp1,
                                        XMMRegister xtmp2, XMMRegister temp_dst,
                                        Register rtmp, Register mask_idx,
                                        Register length, int vector_len, int vlen_enc) {
  Label GATHER8_LOOP;
  assert(is_subword_type(elem_ty), "");
  movl(length, vector_len);                   // remaining-element counter
  vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...}
  vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...}
  // Build the constant {2,2,...} as 0 - (-1) shifted left by one.
  vallones(xtmp2, vlen_enc);
  vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc);
  vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...}
  load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...}

  bind(GATHER8_LOOP);
    // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES
    if (mask == noreg) {
      vgather8b(elem_ty, temp_dst, base, idx_base, rtmp, vlen_enc);
    } else {
      vgather8b_masked(elem_ty, temp_dst, base, idx_base, mask, mask_idx, rtmp, vlen_enc);
    }
    // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1)
    vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? vlen_enc : Assembler::AVX_256bit);
    // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2)
    vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc);
    // DST_VEC = DST_VEC OR TEMP_PERM_VEC
    vpor(dst, dst, temp_dst, vlen_enc);
    // Advance past the indices consumed this iteration (4 bytes per index):
    // 8 indices for bytes, 4 for shorts.
    addptr(idx_base,  32 >> (type2aelembytes(elem_ty) - 1));
    subl(length, 8 >> (type2aelembytes(elem_ty) - 1));
    jcc(Assembler::notEqual, GATHER8_LOOP);
}
1524
1525 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1526 switch(typ) {
1527 case T_INT:
1528 vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1529 break;
1530 case T_FLOAT:
1531 vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1532 break;
1533 case T_LONG:
1534 vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1535 break;
1536 case T_DOUBLE:
1537 vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1538 break;
1539 default:
1540 assert(false,"Should not reach here.");
1541 break;
1542 }
1543 }
1544
1545 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1546 switch(typ) {
1547 case T_INT:
1548 evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1549 break;
1550 case T_FLOAT:
1551 evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1552 break;
1553 case T_LONG:
1554 evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1555 break;
1556 case T_DOUBLE:
1557 evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1558 break;
1559 default:
1560 assert(false,"Should not reach here.");
1561 break;
1562 }
1563 }
1564
1565 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1566 switch(typ) {
1567 case T_INT:
1568 evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1569 break;
1570 case T_FLOAT:
1571 evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1572 break;
1573 case T_LONG:
1574 evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1575 break;
1576 case T_DOUBLE:
1577 evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1578 break;
1579 default:
1580 assert(false,"Should not reach here.");
1581 break;
1582 }
1583 }
1584
// Expand a boolean byte vector 'src' into a per-element vector mask in dst.
// src presumably holds 0/1 per byte lane (TODO confirm against callers):
// dst = 0 - src maps that to 0x00/0xFF per byte, which is then sign-extended
// to the element width of 'elem_bt'. 'is_legacy' limits the vpsubb to
// 256-bit VEX encoding for targets without full EVEX subword support.
void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
  if (vlen_in_bytes <= 16) {
    pxor (dst, dst);
    psubb(dst, src);     // dst = -src: 1 -> 0xFF, 0 -> 0x00
    switch (elem_bt) {
      case T_BYTE:   /* nothing to do */ break;
      case T_SHORT:  pmovsxbw(dst, dst); break;
      case T_INT:    pmovsxbd(dst, dst); break;
      case T_FLOAT:  pmovsxbd(dst, dst); break;
      case T_LONG:   pmovsxbq(dst, dst); break;
      case T_DOUBLE: pmovsxbq(dst, dst); break;

      default: assert(false, "%s", type2name(elem_bt));
    }
  } else {
    assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
    int vlen_enc = vector_length_encoding(vlen_in_bytes);

    vpxor (dst, dst, dst, vlen_enc);
    // Legacy encoding caps the byte subtract at 256 bits; the subsequent
    // sign extension widens from there.
    vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);

    switch (elem_bt) {
      case T_BYTE:   /* nothing to do */ break;
      case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
      case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
      case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
      case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
      case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;

      default: assert(false, "%s", type2name(elem_bt));
    }
  }
}
1618
// Materialize an opmask register from a boolean byte vector 'src'
// (presumably 0/1 per lane — see the XMM variant above).
// 'novlbwdq' selects the fallback for AVX-512 targets lacking VL/BW/DQ:
// widen bytes to dwords and compare against the canonical mask bit pattern
// instead of using the BW-only evpmovb2m.
void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
  if (novlbwdq) {
    vpmovsxbd(xtmp, src, vlen_enc);
    evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
            Assembler::eq, true, vlen_enc, noreg);
  } else {
    // xtmp = -src: 0/1 becomes 0x00/0xFF; evpmovb2m takes each byte's sign bit.
    vpxor(xtmp, xtmp, xtmp, vlen_enc);
    vpsubb(xtmp, xtmp, src, vlen_enc);
    evpmovb2m(dst, xtmp, vlen_enc);
  }
}
1630
1631 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, Address src, int vlen_in_bytes) {
1632 if (is_integral_type(bt)) {
1633 switch (vlen_in_bytes) {
1634 case 4: movdl(dst, src); break;
1635 case 8: movq(dst, src); break;
1636 case 16: movdqu(dst, src); break;
1637 case 32: vmovdqu(dst, src); break;
1638 case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
1639 default: ShouldNotReachHere();
1640 }
1641 } else {
1642 switch (vlen_in_bytes) {
1643 case 4: movflt(dst, src); break;
1644 case 8: movdbl(dst, src); break;
1645 case 16: movups(dst, src); break;
1646 case 32: vmovups(dst, src, Assembler::AVX_256bit); break;
1647 case 64: vmovups(dst, src, Assembler::AVX_512bit); break;
1648 default: ShouldNotReachHere();
1649 }
1650 }
1651 }
1652
1653 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
1654 assert(rscratch != noreg || always_reachable(src), "missing");
1655
1656 if (reachable(src)) {
1657 load_vector(bt, dst, as_Address(src), vlen_in_bytes);
1658 } else {
1659 lea(rscratch, src);
1660 load_vector(bt, dst, Address(rscratch, 0), vlen_in_bytes);
1661 }
1662 }
1663
// Broadcast the constant at 'src' across all lanes of dst, choosing the
// cheapest replication instruction available for the element type and the
// target's ISA level. Falls back to a plain vector load pre-SSE3.
void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
  int vlen_enc = vector_length_encoding(vlen);
  if (VM_Version::supports_avx()) {
    if (bt == T_LONG) {
      if (VM_Version::supports_avx2()) {
        vpbroadcastq(dst, src, vlen_enc);
      } else {
        // AVX1 has no integer quadword broadcast; duplicate via FP move.
        vmovddup(dst, src, vlen_enc);
      }
    } else if (bt == T_DOUBLE) {
      if (vlen_enc != Assembler::AVX_128bit) {
        vbroadcastsd(dst, src, vlen_enc, noreg);
      } else {
        // vbroadcastsd has no 128-bit form; vmovddup covers it.
        vmovddup(dst, src, vlen_enc);
      }
    } else {
      if (VM_Version::supports_avx2() && is_integral_type(bt)) {
        vpbroadcastd(dst, src, vlen_enc);
      } else {
        // 32-bit FP broadcast also serves sub-dword integral types on AVX1.
        vbroadcastss(dst, src, vlen_enc);
      }
    }
  } else if (VM_Version::supports_sse3()) {
    movddup(dst, src);
  } else {
    load_vector(bt, dst, src, vlen);
  }
}
1692
1693 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
1694 // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64.
1695 int offset = exact_log2(type2aelembytes(bt)) << 6;
1696 if (is_floating_point_type(bt)) {
1697 offset += 128;
1698 }
1699 ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset);
1700 load_vector(T_BYTE, dst, addr, vlen_in_bytes);
1701 }
1702
1703 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
1704
// One 128-bit combining step of a vector reduction: dst = dst <op> src.
// Integral and logical ops combine lane-wise; FP add/mul combine only the
// low (scalar) lane via addss/addsd/mulss/mulsd. T_LONG min/max and
// MulReductionVL require AVX-512 (see the asserts).
void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
  int vector_len = Assembler::AVX_128bit;

  switch (opcode) {
    case Op_AndReductionV:  pand(dst, src); break;
    case Op_OrReductionV:   por (dst, src); break;
    case Op_XorReductionV:  pxor(dst, src); break;
    case Op_MinReductionV:
      switch (typ) {
        case T_BYTE:        pminsb(dst, src); break;
        case T_SHORT:       pminsw(dst, src); break;
        case T_INT:         pminsd(dst, src); break;
        case T_LONG:        assert(UseAVX > 2, "required");
                            vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_MaxReductionV:
      switch (typ) {
        case T_BYTE:        pmaxsb(dst, src); break;
        case T_SHORT:       pmaxsw(dst, src); break;
        case T_INT:         pmaxsd(dst, src); break;
        case T_LONG:        assert(UseAVX > 2, "required");
                            vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVF: addss(dst, src); break;  // scalar: low lane only
    case Op_AddReductionVD: addsd(dst, src); break;  // scalar: low lane only
    case Op_AddReductionVI:
      switch (typ) {
        case T_BYTE:        paddb(dst, src); break;
        case T_SHORT:       paddw(dst, src); break;
        case T_INT:         paddd(dst, src); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVL: paddq(dst, src); break;
    case Op_MulReductionVF: mulss(dst, src); break;  // scalar: low lane only
    case Op_MulReductionVD: mulsd(dst, src); break;  // scalar: low lane only
    case Op_MulReductionVI:
      switch (typ) {
        case T_SHORT:       pmullw(dst, src); break;
        case T_INT:         pmulld(dst, src); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_MulReductionVL: assert(UseAVX > 2, "required");
                            evpmullq(dst, dst, src, vector_len); break;
    default:                assert(false, "wrong opcode");
  }
}
1757
1758 void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1759 switch (opcode) {
1760 case Op_AddReductionVF: addps(dst, src); break;
1761 case Op_AddReductionVD: addpd(dst, src); break;
1762 case Op_MulReductionVF: mulps(dst, src); break;
1763 case Op_MulReductionVD: mulpd(dst, src); break;
1764 default: assert(false, "%s", NodeClassNames[opcode]);
1765 }
1766 }
1767
// One 256-bit reduction combining step: dst = src1 OP src2, lane-wise, with the
// instruction chosen from the reduction opcode and element type. Only integral
// and bitwise reductions appear here; ordered FP reductions are done with the
// 128-bit scalar steps instead.
void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
  int vector_len = Assembler::AVX_256bit;

  switch (opcode) {
    case Op_AndReductionV: vpand(dst, src1, src2, vector_len); break;
    case Op_OrReductionV: vpor (dst, src1, src2, vector_len); break;
    case Op_XorReductionV: vpxor(dst, src1, src2, vector_len); break;
    case Op_MinReductionV:
      switch (typ) {
        case T_BYTE: vpminsb(dst, src1, src2, vector_len); break;
        case T_SHORT: vpminsw(dst, src1, src2, vector_len); break;
        case T_INT: vpminsd(dst, src1, src2, vector_len); break;
        // 64-bit lane min requires AVX-512 (VL).
        case T_LONG: assert(UseAVX > 2, "required");
        vpminsq(dst, src1, src2, vector_len); break;
        default: assert(false, "wrong type");
      }
      break;
    case Op_MaxReductionV:
      switch (typ) {
        case T_BYTE: vpmaxsb(dst, src1, src2, vector_len); break;
        case T_SHORT: vpmaxsw(dst, src1, src2, vector_len); break;
        case T_INT: vpmaxsd(dst, src1, src2, vector_len); break;
        // 64-bit lane max requires AVX-512 (VL).
        case T_LONG: assert(UseAVX > 2, "required");
        vpmaxsq(dst, src1, src2, vector_len); break;
        default: assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVI:
      switch (typ) {
        case T_BYTE: vpaddb(dst, src1, src2, vector_len); break;
        case T_SHORT: vpaddw(dst, src1, src2, vector_len); break;
        case T_INT: vpaddd(dst, src1, src2, vector_len); break;
        default: assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
    case Op_MulReductionVI:
      switch (typ) {
        case T_SHORT: vpmullw(dst, src1, src2, vector_len); break;
        case T_INT: vpmulld(dst, src1, src2, vector_len); break;
        default: assert(false, "wrong type");
      }
      break;
    case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break;
    default: assert(false, "wrong opcode");
  }
}
1815
1816 void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
1817 int vector_len = Assembler::AVX_256bit;
1818
1819 switch (opcode) {
1820 case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break;
1821 case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break;
1822 case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break;
1823 case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break;
1824 default: assert(false, "%s", NodeClassNames[opcode]);
1825 }
1826 }
1827
1828 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1829 XMMRegister dst, XMMRegister src,
1830 XMMRegister vtmp1, XMMRegister vtmp2) {
1831 switch (opcode) {
1832 case Op_AddReductionVF:
1833 case Op_MulReductionVF:
1834 reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1835 break;
1836
1837 case Op_AddReductionVD:
1838 case Op_MulReductionVD:
1839 reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1840 break;
1841
1842 default: assert(false, "wrong opcode");
1843 }
1844 }
1845
1846 void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen,
1847 XMMRegister dst, XMMRegister src,
1848 XMMRegister vtmp1, XMMRegister vtmp2) {
1849 switch (opcode) {
1850 case Op_AddReductionVF:
1851 case Op_MulReductionVF:
1852 unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1853 break;
1854
1855 case Op_AddReductionVD:
1856 case Op_MulReductionVD:
1857 unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1858 break;
1859
1860 default: assert(false, "%s", NodeClassNames[opcode]);
1861 }
1862 }
1863
1864 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1865 Register dst, Register src1, XMMRegister src2,
1866 XMMRegister vtmp1, XMMRegister vtmp2) {
1867 switch (vlen) {
1868 case 8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1869 case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1870 case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1871 case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1872
1873 default: assert(false, "wrong vector length");
1874 }
1875 }
1876
1877 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
1878 Register dst, Register src1, XMMRegister src2,
1879 XMMRegister vtmp1, XMMRegister vtmp2) {
1880 switch (vlen) {
1881 case 8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1882 case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1883 case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1884 case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1885
1886 default: assert(false, "wrong vector length");
1887 }
1888 }
1889
1890 void C2_MacroAssembler::reduceS(int opcode, int vlen,
1891 Register dst, Register src1, XMMRegister src2,
1892 XMMRegister vtmp1, XMMRegister vtmp2) {
1893 switch (vlen) {
1894 case 4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1895 case 8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1896 case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1897 case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1898
1899 default: assert(false, "wrong vector length");
1900 }
1901 }
1902
1903 void C2_MacroAssembler::reduceI(int opcode, int vlen,
1904 Register dst, Register src1, XMMRegister src2,
1905 XMMRegister vtmp1, XMMRegister vtmp2) {
1906 switch (vlen) {
1907 case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1908 case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1909 case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1910 case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1911
1912 default: assert(false, "wrong vector length");
1913 }
1914 }
1915
1916 void C2_MacroAssembler::reduceL(int opcode, int vlen,
1917 Register dst, Register src1, XMMRegister src2,
1918 XMMRegister vtmp1, XMMRegister vtmp2) {
1919 switch (vlen) {
1920 case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1921 case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1922 case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1923
1924 default: assert(false, "wrong vector length");
1925 }
1926 }
1927
1928 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1929 switch (vlen) {
1930 case 2:
1931 assert(vtmp2 == xnoreg, "");
1932 reduce2F(opcode, dst, src, vtmp1);
1933 break;
1934 case 4:
1935 assert(vtmp2 == xnoreg, "");
1936 reduce4F(opcode, dst, src, vtmp1);
1937 break;
1938 case 8:
1939 reduce8F(opcode, dst, src, vtmp1, vtmp2);
1940 break;
1941 case 16:
1942 reduce16F(opcode, dst, src, vtmp1, vtmp2);
1943 break;
1944 default: assert(false, "wrong vector length");
1945 }
1946 }
1947
1948 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1949 switch (vlen) {
1950 case 2:
1951 assert(vtmp2 == xnoreg, "");
1952 reduce2D(opcode, dst, src, vtmp1);
1953 break;
1954 case 4:
1955 reduce4D(opcode, dst, src, vtmp1, vtmp2);
1956 break;
1957 case 8:
1958 reduce8D(opcode, dst, src, vtmp1, vtmp2);
1959 break;
1960 default: assert(false, "wrong vector length");
1961 }
1962 }
1963
1964 void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1965 switch (vlen) {
1966 case 2:
1967 assert(vtmp1 == xnoreg, "");
1968 assert(vtmp2 == xnoreg, "");
1969 unorderedReduce2F(opcode, dst, src);
1970 break;
1971 case 4:
1972 assert(vtmp2 == xnoreg, "");
1973 unorderedReduce4F(opcode, dst, src, vtmp1);
1974 break;
1975 case 8:
1976 unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2);
1977 break;
1978 case 16:
1979 unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2);
1980 break;
1981 default: assert(false, "wrong vector length");
1982 }
1983 }
1984
1985 void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1986 switch (vlen) {
1987 case 2:
1988 assert(vtmp1 == xnoreg, "");
1989 assert(vtmp2 == xnoreg, "");
1990 unorderedReduce2D(opcode, dst, src);
1991 break;
1992 case 4:
1993 assert(vtmp2 == xnoreg, "");
1994 unorderedReduce4D(opcode, dst, src, vtmp1);
1995 break;
1996 case 8:
1997 unorderedReduce8D(opcode, dst, src, vtmp1, vtmp2);
1998 break;
1999 default: assert(false, "wrong vector length");
2000 }
2001 }
2002
// Reduce the two int lanes of src2 together with the scalar src1 into dst.
void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    phaddd(vtmp1, vtmp1);   // horizontal add folds lanes 0 and 1 into lane 0
  } else {
    pshufd(vtmp1, src2, 0x1);                   // bring lane 1 down to lane 0
    reduce_operation_128(T_INT, opcode, vtmp1, src2);
  }
  movdl(vtmp2, src1);                           // fold in the scalar input
  reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
  movdl(dst, vtmp1);
}
2017
// Reduce the four int lanes of src2 together with the scalar src1 into dst:
// fold 4 lanes to 2, then finish with the 2-lane reduction.
void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    phaddd(vtmp1, src2);    // lanes (0+1) and (2+3) land in lanes 0 and 1
    reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    pshufd(vtmp2, src2, 0xE);                   // bring lanes 2,3 down to 0,1
    reduce_operation_128(T_INT, opcode, vtmp2, src2);
    reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
  }
}
2031
// Reduce the eight int lanes of the 256-bit src2 together with scalar src1
// into dst: fold the upper 128 bits onto the lower, then reduce 4 (or 2) lanes.
void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);  // pairwise sums per 128-bit half
    vextracti128_high(vtmp2, vtmp1);
    vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit); // combine the two halves
    reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    vextracti128_high(vtmp1, src2);
    reduce_operation_128(T_INT, opcode, vtmp1, src2);   // high half OP low half
    reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  }
}
2044
// Reduce the sixteen int lanes of the 512-bit src2 together with scalar src1
// into dst: fold the upper 256 bits onto the lower, then reduce 8 lanes.
void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti64x4_high(vtmp2, src2);
  reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
  reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2050
// Reduce eight bytes (the low 64 bits of src2) together with scalar src1 into
// dst, halving the element count at each step: 8 -> 4 -> 2 -> 1.
void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  pshufd(vtmp2, src2, 0x1);                           // bytes 4..7 down to the low dword
  reduce_operation_128(T_BYTE, opcode, vtmp2, src2);  // 8 bytes -> 4
  movdqu(vtmp1, vtmp2);
  psrldq(vtmp1, 2);
  reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); // 4 bytes -> 2
  movdqu(vtmp2, vtmp1);
  psrldq(vtmp2, 1);
  reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); // 2 bytes -> 1
  movdl(vtmp2, src1);
  pmovsxbd(vtmp1, vtmp1);                             // widen so the scalar combines as int
  reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
  pextrb(dst, vtmp1, 0x0);
  movsbl(dst, dst);                                   // sign-extend the byte result
}
2066
// Reduce sixteen bytes (128-bit src2) and scalar src1 into dst: fold the high
// qword onto the low one, then reuse the 8-byte reduction.
void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  pshufd(vtmp1, src2, 0xE);                           // bytes 8..15 down to the low qword
  reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
  reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}
2072
// Reduce thirty-two bytes (256-bit src2) and scalar src1 into dst: fold the
// upper 128 bits onto the lower, then reuse the 16-byte reduction.
void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti128_high(vtmp2, src2);
  reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
  reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2078
// Reduce sixty-four bytes (512-bit src2) and scalar src1 into dst: fold the
// upper 256 bits onto the lower, then reuse the 32-byte reduction.
void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti64x4_high(vtmp1, src2);
  reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
  reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}
2084
// Multiply-reduce eight bytes of src2 (and scalar src1) into dst: sign-extend
// the bytes to shorts (x86 has no byte multiply) and reuse the short reduction.
void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  pmovsxbw(vtmp2, src2);
  reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2089
2090 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2091 if (UseAVX > 1) {
2092 int vector_len = Assembler::AVX_256bit;
2093 vpmovsxbw(vtmp1, src2, vector_len);
2094 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2095 } else {
2096 pmovsxbw(vtmp2, src2);
2097 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2098 pshufd(vtmp2, src2, 0x1);
2099 pmovsxbw(vtmp2, src2);
2100 reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2101 }
2102 }
2103
// Multiply-reduce thirty-two bytes of src2 (and scalar src1) into dst. With
// AVX512BW the bytes widen to 32 shorts in one instruction; otherwise the two
// 128-bit halves are reduced in sequence, chaining through dst.
void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
    int vector_len = Assembler::AVX_512bit;
    vpmovsxbw(vtmp1, src2, vector_len);
    reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    assert(UseAVX >= 2,"Should not reach here.");
    mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
    vextracti128_high(vtmp2, src2);
    mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);  // fold in the upper half
  }
}
2116
// Multiply-reduce sixty-four bytes of src2 (and scalar src1) into dst: reduce
// the lower 256 bits, then the upper 256 bits, chaining through dst.
void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
  vextracti64x4_high(vtmp2, src2);
  mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
}
2122
// Reduce four shorts (the low 64 bits of src2) together with scalar src1 into
// dst: fold 4 lanes to 1, then combine with the scalar as an int.
void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    phaddw(vtmp1, vtmp1);   // two horizontal adds collapse 4 shorts to 1
    phaddw(vtmp1, vtmp1);
  } else {
    pshufd(vtmp2, src2, 0x1);                           // shorts 2,3 down to 0,1
    reduce_operation_128(T_SHORT, opcode, vtmp2, src2); // 4 shorts -> 2
    movdqu(vtmp1, vtmp2);
    psrldq(vtmp1, 2);
    reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2); // 2 shorts -> 1
  }
  movdl(vtmp2, src1);
  pmovsxwd(vtmp1, vtmp1);                               // widen so the scalar combines as int
  reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
  pextrw(dst, vtmp1, 0x0);
  movswl(dst, dst);                                     // sign-extend the short result
}
2143
// Reduce eight shorts (128-bit src2) and scalar src1 into dst: fold 8 lanes to
// 4, then reuse the 4-short reduction.
void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    phaddw(vtmp1, src2);    // pairwise sums leave 4 partial sums in the low qword
  } else {
    pshufd(vtmp1, src2, 0xE);                           // shorts 4..7 down to 0..3
    reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
  }
  reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}
2156
// Reduce sixteen shorts (256-bit src2) and scalar src1 into dst: fold 16 lanes
// to 8, then reuse the 8-short reduction.
void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    int vector_len = Assembler::AVX_256bit;
    vphaddw(vtmp2, src2, src2, vector_len);  // pairwise sums within each 128-bit half
    vpermq(vtmp2, vtmp2, 0xD8, vector_len);  // gather the partial sums into the low 128 bits
  } else {
    vextracti128_high(vtmp2, src2);
    reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
  }
  reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2168
2169 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2170 int vector_len = Assembler::AVX_256bit;
2171 vextracti64x4_high(vtmp1, src2);
2172 reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2173 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2174 }
2175
// Reduce the two long lanes of src2 together with the scalar src1 into dst.
void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  pshufd(vtmp2, src2, 0xE);                          // high qword down to the low qword
  reduce_operation_128(T_LONG, opcode, vtmp2, src2); // lane 0 OP lane 1
  movdq(vtmp1, src1);                                // fold in the scalar input
  reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
  movdq(dst, vtmp1);
}
2183
// Reduce four longs (256-bit src2) and scalar src1 into dst: fold the upper
// 128 bits onto the lower, then reuse the 2-long reduction.
void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti128_high(vtmp1, src2);
  reduce_operation_128(T_LONG, opcode, vtmp1, src2);
  reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}
2189
// Reduce eight longs (512-bit src2) and scalar src1 into dst: fold the upper
// 256 bits onto the lower, then reuse the 4-long reduction.
void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti64x4_high(vtmp2, src2);
  reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
  reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2195
// Build an opmask with the low 'len' bits set: all-ones truncated to len bits
// (bzhi), then moved into the mask register.
void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
  mov64(temp, -1L);
  bzhiq(temp, temp, len);  // zero bits [63:len]
  kmovql(dst, temp);
}
2201
// Ordered 2-float reduction: dst (accumulator) OP= src[0], then OP= src[1],
// using scalar operations so evaluation order is preserved.
void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  reduce_operation_128(T_FLOAT, opcode, dst, src);
  pshufd(vtmp, src, 0x1);   // lane 1 down to lane 0
  reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
}
2207
// Ordered 4-float reduction: accumulate src lanes 0..3 into dst, one scalar
// operation per lane, preserving strict order.
void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  reduce2F(opcode, dst, src, vtmp);
  pshufd(vtmp, src, 0x2);   // lane 2 down to lane 0
  reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
  pshufd(vtmp, src, 0x3);   // lane 3 down to lane 0
  reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
}
2215
// Ordered 8-float reduction: accumulate the lower four lanes of src into dst,
// then the upper four (extracted to vtmp2), preserving lane order.
void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce4F(opcode, dst, src, vtmp2);
  vextractf128_high(vtmp2, src);
  reduce4F(opcode, dst, vtmp2, vtmp1);
}
2221
// Ordered 16-float reduction: accumulate the lower eight lanes of src into
// dst, then the upper eight (extracted to vtmp1), preserving lane order.
void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce8F(opcode, dst, src, vtmp1, vtmp2);
  vextracti64x4_high(vtmp1, src);
  reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
}
2227
// Unordered 2-float reduction: dst = src[1] OP src[0] (scalar combine after
// bringing lane 1 down; no accumulator input).
void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) {
  pshufd(dst, src, 0x1);    // lane 1 down to lane 0
  reduce_operation_128(T_FLOAT, opcode, dst, src);
}
2232
// Unordered 4-float reduction: combine the upper pair onto the lower pair
// packed, then finish with the 2-float reduction.
void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  pshufd(vtmp, src, 0xE);   // lanes 2,3 down to 0,1
  unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src);
  unorderedReduce2F(opcode, dst, vtmp);
}
2238
// Unordered 8-float reduction: fold the upper 128 bits onto the lower packed,
// then finish with the 4-float reduction.
void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextractf128_high(vtmp1, src);
  unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src);
  unorderedReduce4F(opcode, dst, vtmp1, vtmp2);
}
2244
// Unordered 16-float reduction: fold the upper 256 bits onto the lower packed,
// then finish with the 8-float reduction.
void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextractf64x4_high(vtmp2, src);
  unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src);
  unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2);
}
2250
// Ordered 2-double reduction: dst (accumulator) OP= src[0], then OP= src[1],
// using scalar operations so evaluation order is preserved.
void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  reduce_operation_128(T_DOUBLE, opcode, dst, src);
  pshufd(vtmp, src, 0xE);   // high qword down to the low qword
  reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
}
2256
// Ordered 4-double reduction: accumulate the lower two lanes of src into dst,
// then the upper two (extracted to vtmp2), preserving lane order.
void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce2D(opcode, dst, src, vtmp2);
  vextractf128_high(vtmp2, src);
  reduce2D(opcode, dst, vtmp2, vtmp1);
}
2262
// Ordered 8-double reduction: accumulate the lower four lanes of src into dst,
// then the upper four (extracted to vtmp1), preserving lane order.
void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce4D(opcode, dst, src, vtmp1, vtmp2);
  vextracti64x4_high(vtmp1, src);
  reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
}
2268
// Unordered 2-double reduction: dst = src[1] OP src[0] (scalar combine after
// bringing the high qword down; no accumulator input).
void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) {
  pshufd(dst, src, 0xE);    // high qword down to the low qword
  reduce_operation_128(T_DOUBLE, opcode, dst, src);
}
2273
// Unordered 4-double reduction: fold the upper 128 bits onto the lower packed,
// then finish with the 2-double reduction.
void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  vextractf128_high(vtmp, src);
  unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src);
  unorderedReduce2D(opcode, dst, vtmp);
}
2279
// Unordered 8-double reduction: fold the upper 256 bits onto the lower packed,
// then finish with the 4-double reduction.
void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextractf64x4_high(vtmp2, src);
  unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src);
  unorderedReduce4D(opcode, dst, vtmp2, vtmp1);
}
2285
// Masked vector load (memory -> register): forwards to the shared
// MacroAssembler implementation.
void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
  MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
}
2289
// Masked vector store (register -> memory): forwards to the shared
// MacroAssembler implementation.
void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
  MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
}
2293
// Masked register-to-register vector move: forwards to the shared
// MacroAssembler implementation.
void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len) {
  MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
}
2297
2298 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
2299 int vec_enc) {
2300 switch(elem_bt) {
2301 case T_INT:
2302 case T_FLOAT:
2303 vmaskmovps(dst, src, mask, vec_enc);
2304 break;
2305 case T_LONG:
2306 case T_DOUBLE:
2307 vmaskmovpd(dst, src, mask, vec_enc);
2308 break;
2309 default:
2310 fatal("Unsupported type %s", type2name(elem_bt));
2311 break;
2312 }
2313 }
2314
2315 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
2316 int vec_enc) {
2317 switch(elem_bt) {
2318 case T_INT:
2319 case T_FLOAT:
2320 vmaskmovps(dst, src, mask, vec_enc);
2321 break;
2322 case T_LONG:
2323 case T_DOUBLE:
2324 vmaskmovpd(dst, src, mask, vec_enc);
2325 break;
2326 default:
2327 fatal("Unsupported type %s", type2name(elem_bt));
2328 break;
2329 }
2330 }
2331
// Float min/max reduction over 2/4/8/16 lanes. Each iteration halves the live
// lane count by combining the working vector with a shuffled/extracted copy of
// itself; the working registers (wsrc/wdst/wtmp) are recycled between steps.
// If is_dst_valid, dst holds an initial accumulator that is folded in at the
// end; otherwise the last step writes straight into dst.
void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
                                          XMMRegister dst, XMMRegister src,
                                          XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                          XMMRegister xmm_0, XMMRegister xmm_1) {
  const int permconst[] = {1, 14};   // vpermilps imms for the final 2- and 4-lane folds
  XMMRegister wsrc = src;
  XMMRegister wdst = xmm_0;
  XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;

  int vlen_enc = Assembler::AVX_128bit;
  if (vlen == 16) {
    vlen_enc = Assembler::AVX_256bit;
  }

  // i counts remaining halving steps: i==3 folds 512->256 bits, i==2 folds
  // 256->128, i==1,0 fold within 128 bits.
  for (int i = log2(vlen) - 1; i >=0; i--) {
    if (i == 0 && !is_dst_valid) {
      wdst = dst;   // no accumulator: final step targets dst directly
    }
    if (i == 3) {
      vextracti64x4_high(wtmp, wsrc);
    } else if (i == 2) {
      vextracti128_high(wtmp, wsrc);
    } else { // i = [0,1]
      vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
    }

    // AVX10.2 provides an IEEE-correct min/max that needs no scratch registers.
    if (VM_Version::supports_avx10_2()) {
      vminmax_fp(opcode, T_FLOAT, wdst, k0, wtmp, wsrc, vlen_enc);
    } else {
      vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
    }
    wsrc = wdst;
    vlen_enc = Assembler::AVX_128bit;  // after the first fold, work stays 128-bit
  }
  if (is_dst_valid) {
    // Fold the reduced vector into the incoming accumulator in dst.
    if (VM_Version::supports_avx10_2()) {
      vminmax_fp(opcode, T_FLOAT, dst, k0, wdst, dst, Assembler::AVX_128bit);
    } else {
      vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
    }
  }
}
2374
// Double min/max reduction over 2/4/8 lanes. Mirrors reduceFloatMinMax: each
// iteration halves the live lane count by combining the working vector with a
// shuffled/extracted copy of itself; if is_dst_valid, the incoming accumulator
// in dst is folded in at the end.
void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                           XMMRegister xmm_0, XMMRegister xmm_1) {
  XMMRegister wsrc = src;
  XMMRegister wdst = xmm_0;
  XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
  int vlen_enc = Assembler::AVX_128bit;
  if (vlen == 8) {
    vlen_enc = Assembler::AVX_256bit;
  }
  // i counts remaining halving steps: i==2 folds 512->256 bits, i==1 folds
  // 256->128, i==0 folds the last two lanes.
  for (int i = log2(vlen) - 1; i >=0; i--) {
    if (i == 0 && !is_dst_valid) {
      wdst = dst;   // no accumulator: final step targets dst directly
    }
    if (i == 1) {
      vextracti128_high(wtmp, wsrc);
    } else if (i == 2) {
      vextracti64x4_high(wtmp, wsrc);
    } else {
      assert(i == 0, "%d", i);
      vpermilpd(wtmp, wsrc, 1, vlen_enc);  // swap the two qword lanes
    }

    // AVX10.2 provides an IEEE-correct min/max that needs no scratch registers.
    if (VM_Version::supports_avx10_2()) {
      vminmax_fp(opcode, T_DOUBLE, wdst, k0, wtmp, wsrc, vlen_enc);
    } else {
      vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
    }

    wsrc = wdst;
    vlen_enc = Assembler::AVX_128bit;  // after the first fold, work stays 128-bit
  }

  if (is_dst_valid) {
    // Fold the reduced vector into the incoming accumulator in dst.
    if (VM_Version::supports_avx10_2()) {
      vminmax_fp(opcode, T_DOUBLE, dst, k0, wdst, dst, Assembler::AVX_128bit);
    } else {
      vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
    }
  }
}
2416
2417 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2418 switch (bt) {
2419 case T_BYTE: pextrb(dst, src, idx); break;
2420 case T_SHORT: pextrw(dst, src, idx); break;
2421 case T_INT: pextrd(dst, src, idx); break;
2422 case T_LONG: pextrq(dst, src, idx); break;
2423
2424 default:
2425 assert(false,"Should not reach here.");
2426 break;
2427 }
2428 }
2429
2430 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2431 int esize = type2aelembytes(typ);
2432 int elem_per_lane = 16/esize;
2433 int lane = elemindex / elem_per_lane;
2434 int eindex = elemindex % elem_per_lane;
2435
2436 if (lane >= 2) {
2437 assert(UseAVX > 2, "required");
2438 vextractf32x4(dst, src, lane & 3);
2439 return dst;
2440 } else if (lane > 0) {
2441 assert(UseAVX > 0, "required");
2442 vextractf128(dst, src, lane);
2443 return dst;
2444 } else {
2445 return src;
2446 }
2447 }
2448
2449 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) {
2450 if (typ == T_BYTE) {
2451 movsbl(dst, dst);
2452 } else if (typ == T_SHORT) {
2453 movswl(dst, dst);
2454 }
2455 }
2456
// Move integral element 'elemindex' (taken modulo one 128-bit lane — the
// caller is expected to have selected the right lane, e.g. via get_lane) of
// src into the general-purpose register dst, sign-extended to 32 bits for
// sub-int types.
void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
  int esize = type2aelembytes(typ);
  int elem_per_lane = 16/esize;
  int eindex = elemindex % elem_per_lane;
  assert(is_integral_type(typ),"required");

  if (eindex == 0) {
    // Element 0: a plain move from the low bits suffices.
    if (typ == T_LONG) {
      movq(dst, src);
    } else {
      movdl(dst, src);
      movsxl(typ, dst);
    }
  } else {
    extract(typ, dst, src, eindex);
    movsxl(typ, dst);
  }
}
2475
// Move floating-point element 'elemindex' (taken modulo one 128-bit lane) of
// src into the low lane of XMM dst, with the upper bits of dst cleared.
void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
  int esize = type2aelembytes(typ);
  int elem_per_lane = 16/esize;
  int eindex = elemindex % elem_per_lane;
  assert((typ == T_FLOAT || typ == T_DOUBLE),"required");

  if (eindex == 0) {
    // Element 0: movq copies the low qword and zeroes the rest.
    movq(dst, src);
  } else {
    if (typ == T_FLOAT) {
      if (UseAVX == 0) {
        movdqu(dst, src);
        shufps(dst, dst, eindex);   // replicate the wanted float into lane 0
      } else {
        vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
      }
    } else {
      if (UseAVX == 0) {
        movdqu(dst, src);
        psrldq(dst, eindex*esize);  // shift the wanted double down to the low qword
      } else {
        vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
      }
      movq(dst, dst);               // clear everything above the low qword
    }
  }
  // Zero upper bits
  if (typ == T_FLOAT) {
    if (UseAVX == 0) {
      assert(vtmp != xnoreg, "required.");
      movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
      pand(dst, vtmp);
    } else {
      vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
    }
  }
}
2513
2514 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2515 switch(typ) {
2516 case T_BYTE:
2517 case T_BOOLEAN:
2518 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2519 break;
2520 case T_SHORT:
2521 case T_CHAR:
2522 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2523 break;
2524 case T_INT:
2525 case T_FLOAT:
2526 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2527 break;
2528 case T_LONG:
2529 case T_DOUBLE:
2530 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2531 break;
2532 default:
2533 assert(false,"Should not reach here.");
2534 break;
2535 }
2536 }
2537
2538 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
2539 assert(rscratch != noreg || always_reachable(src2), "missing");
2540
2541 switch(typ) {
2542 case T_BOOLEAN:
2543 case T_BYTE:
2544 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2545 break;
2546 case T_CHAR:
2547 case T_SHORT:
2548 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2549 break;
2550 case T_INT:
2551 case T_FLOAT:
2552 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2553 break;
2554 case T_LONG:
2555 case T_DOUBLE:
2556 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2557 break;
2558 default:
2559 assert(false,"Should not reach here.");
2560 break;
2561 }
2562 }
2563
2564 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2565 switch(typ) {
2566 case T_BYTE:
2567 evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2568 break;
2569 case T_SHORT:
2570 evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2571 break;
2572 case T_INT:
2573 case T_FLOAT:
2574 evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2575 break;
2576 case T_LONG:
2577 case T_DOUBLE:
2578 evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2579 break;
2580 default:
2581 assert(false,"Should not reach here.");
2582 break;
2583 }
2584 }
2585
2586 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) {
2587 assert(vlen_in_bytes <= 32, "");
2588 int esize = type2aelembytes(bt);
2589 if (vlen_in_bytes == 32) {
2590 assert(vtmp == xnoreg, "required.");
2591 if (esize >= 4) {
2592 vtestps(src1, src2, AVX_256bit);
2593 } else {
2594 vptest(src1, src2, AVX_256bit);
2595 }
2596 return;
2597 }
2598 if (vlen_in_bytes < 16) {
2599 // Duplicate the lower part to fill the whole register,
2600 // Don't need to do so for src2
2601 assert(vtmp != xnoreg, "required");
2602 int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04;
2603 pshufd(vtmp, src1, shuffle_imm);
2604 } else {
2605 assert(vtmp == xnoreg, "required");
2606 vtmp = src1;
2607 }
2608 if (esize >= 4 && VM_Version::supports_avx()) {
2609 vtestps(vtmp, src2, AVX_128bit);
2610 } else {
2611 ptest(vtmp, src2);
2612 }
2613 }
2614
2615 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
2616 #ifdef ASSERT
2617 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2618 bool is_bw_supported = VM_Version::supports_avx512bw();
2619 if (is_bw && !is_bw_supported) {
2620 assert(vlen_enc != Assembler::AVX_512bit, "required");
2621 assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
2622 "XMM register should be 0-15");
2623 }
2624 #endif // ASSERT
2625 switch (elem_bt) {
2626 case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
2627 case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
2628 case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
2629 case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
2630 case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
2631 case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
2632 default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2633 }
2634 }
2635
2636 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
2637 assert(UseAVX >= 2, "required");
2638 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2639 bool is_vl = vlen_enc != Assembler::AVX_512bit;
2640 if ((UseAVX > 2) &&
2641 (!is_bw || VM_Version::supports_avx512bw()) &&
2642 (!is_vl || VM_Version::supports_avx512vl())) {
2643 switch (elem_bt) {
2644 case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
2645 case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
2646 case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
2647 case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
2648 default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2649 }
2650 } else {
2651 assert(vlen_enc != Assembler::AVX_512bit, "required");
2652 assert((dst->encoding() < 16),"XMM register should be 0-15");
2653 switch (elem_bt) {
2654 case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
2655 case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
2656 case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
2657 case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
2658 case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
2659 case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
2660 default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2661 }
2662 }
2663 }
2664
2665 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
2666 switch (to_elem_bt) {
2667 case T_SHORT:
2668 vpmovsxbw(dst, src, vlen_enc);
2669 break;
2670 case T_INT:
2671 vpmovsxbd(dst, src, vlen_enc);
2672 break;
2673 case T_FLOAT:
2674 vpmovsxbd(dst, src, vlen_enc);
2675 vcvtdq2ps(dst, dst, vlen_enc);
2676 break;
2677 case T_LONG:
2678 vpmovsxbq(dst, src, vlen_enc);
2679 break;
2680 case T_DOUBLE: {
2681 int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
2682 vpmovsxbd(dst, src, mid_vlen_enc);
2683 vcvtdq2pd(dst, dst, vlen_enc);
2684 break;
2685 }
2686 default:
2687 fatal("Unsupported type %s", type2name(to_elem_bt));
2688 break;
2689 }
2690 }
2691
2692 //-------------------------------------------------------------------------------------------
2693
2694 // IndexOf for constant substrings with size >= 8 chars
2695 // which don't need to be loaded through stack.
// Implementation notes: built around the SSE4.2 pcmpestri instruction in
// "equal ordered" (substring search) mode, whose operands are pinned to
// fixed registers (asserted below). On exit 'result' holds the element
// index of the first occurrence of str2 in str1, or -1 if there is none.
void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
                                         Register cnt1, Register cnt2,
                                         int int_cnt2, Register result,
                                         XMMRegister vec, Register tmp,
                                         int ae) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");

  // This method uses the pcmpestri instruction with bound registers
  // inputs:
  //   xmm - substring
  //   rax - substring length (elements count)
  //   mem - scanned string
  //   rdx - string length (elements count)
  //   0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
  //   0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
  // outputs:
  //   rcx - matched index in string
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
  int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
  Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
  Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;

  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
        RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
        MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;
  assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");

  // Load substring (UL: zero-extend Latin1 bytes to UTF-16 chars).
  if (ae == StrIntrinsicNode::UL) {
    pmovzxbw(vec, Address(str2, 0));
  } else {
    movdqu(vec, Address(str2, 0));
  }
  movl(cnt2, int_cnt2);
  movptr(result, str1); // string addr

  if (int_cnt2 > stride) {
    jmpb(SCAN_TO_SUBSTR);

    // Reload substr for rescan, this code
    // is executed only for large substrings (> 8 chars)
    bind(RELOAD_SUBSTR);
    if (ae == StrIntrinsicNode::UL) {
      pmovzxbw(vec, Address(str2, 0));
    } else {
      movdqu(vec, Address(str2, 0));
    }
    negptr(cnt2); // Jumped here with negative cnt2, convert to positive

    bind(RELOAD_STR);
    // We came here after the beginning of the substring was
    // matched but the rest of it was not so we need to search
    // again. Start from the next element after the previous match.

    // cnt2 is number of substring remaining elements and
    // cnt1 is number of string remaining elements when cmp failed.
    // Restored cnt1 = cnt1 - cnt2 + int_cnt2
    subl(cnt1, cnt2);
    addl(cnt1, int_cnt2);
    movl(cnt2, int_cnt2); // Now restore cnt2

    decrementl(cnt1);     // Shift to next element
    cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND); // Left less than substring

    addptr(result, (1<<scale1));

  } // (int_cnt2 > 8)

  // Scan string for start of substr in 16-byte vectors
  bind(SCAN_TO_SUBSTR);
  pcmpestri(vec, Address(result, 0), mode);
  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
  subl(cnt1, stride);
  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
  cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
  addptr(result, 16);
  jmpb(SCAN_TO_SUBSTR);

  // Found a potential substr
  bind(FOUND_CANDIDATE);
  // Matched whole vector if first element matched (tmp(rcx) == 0).
  if (int_cnt2 == stride) {
    jccb(Assembler::overflow, RET_FOUND);    // OF == 1
  } else { // int_cnt2 > 8
    jccb(Assembler::overflow, FOUND_SUBSTR);
  }
  // After pcmpestri tmp(rcx) contains matched element index
  // Compute start addr of substr
  lea(result, Address(result, tmp, scale1));

  // Make sure string is still long enough
  subl(cnt1, tmp);
  cmpl(cnt1, cnt2);
  if (int_cnt2 == stride) {
    jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
  } else { // int_cnt2 > 8
    jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
  }
  // Left less than substring.

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmp(EXIT);

  if (int_cnt2 > stride) {
    // This code is optimized for the case when whole substring
    // is matched if its head is matched.
    bind(MATCH_SUBSTR_HEAD);
    pcmpestri(vec, Address(result, 0), mode);
    // Reload only string if does not match
    jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0

    Label CONT_SCAN_SUBSTR;
    // Compare the rest of substring (> 8 chars).
    bind(FOUND_SUBSTR);
    // First 8 chars are already matched.
    negptr(cnt2);
    addptr(cnt2, stride);

    bind(SCAN_SUBSTR);
    subl(cnt1, stride);
    cmpl(cnt2, -stride); // Do not read beyond substring
    jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
    // Back-up strings to avoid reading beyond substring:
    // cnt1 = cnt1 - cnt2 + 8
    addl(cnt1, cnt2); // cnt2 is negative
    addl(cnt1, stride);
    movl(cnt2, stride); negptr(cnt2);
    bind(CONT_SCAN_SUBSTR);
    if (int_cnt2 < (int)G) {
      int tail_off1 = int_cnt2<<scale1;
      int tail_off2 = int_cnt2<<scale2;
      if (ae == StrIntrinsicNode::UL) {
        pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
      } else {
        movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
      }
      pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
    } else {
      // calculate index in register to avoid integer overflow (int_cnt2*2)
      movl(tmp, int_cnt2);
      addptr(tmp, cnt2);
      if (ae == StrIntrinsicNode::UL) {
        pmovzxbw(vec, Address(str2, tmp, scale2, 0));
      } else {
        movdqu(vec, Address(str2, tmp, scale2, 0));
      }
      pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
    }
    // Need to reload strings pointers if not matched whole vector
    jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
    addptr(cnt2, stride);
    jcc(Assembler::negative, SCAN_SUBSTR);
    // Fall through if found full substring

  } // (int_cnt2 > 8)

  bind(RET_FOUND);
  // Found result if we matched full small substring.
  // Compute substr offset
  subptr(result, str1);
  if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
    shrl(result, 1); // byte offset -> char index
  }
  bind(EXIT);

} // string_indexofC8
2872
2873 // Small strings are loaded through stack if they cross page boundary.
// General pcmpestri-based indexOf for small constant substrings (< 8 chars)
// or non-constant substrings (int_cnt2 == -1, length in cnt2). Strings that
// might cross a page boundary are copied to the stack before scanning so the
// 16-byte vector loads cannot fault. 'result' returns the match index or -1.
void C2_MacroAssembler::string_indexof(Register str1, Register str2,
                                       Register cnt1, Register cnt2,
                                       int int_cnt2, Register result,
                                       XMMRegister vec, Register tmp,
                                       int ae) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");

  //
  // int_cnt2 is length of small (< 8 chars) constant substring
  // or (-1) for non constant substring in which case its length
  // is in cnt2 register.
  //
  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;
  //
  int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
  assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
  // This method uses the pcmpestri instruction with bound registers
  // inputs:
  //   xmm - substring
  //   rax - substring length (elements count)
  //   mem - scanned string
  //   rdx - string length (elements count)
  //   0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
  //   0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
  // outputs:
  //   rcx - matched index in string
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
  Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
  Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;

  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
        RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
        FOUND_CANDIDATE;

  { //========================================================
    // We don't know where these strings are located
    // and we can't read beyond them. Load them through stack.
    Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;

    movptr(tmp, rsp); // save old SP

    if (int_cnt2 > 0) { // small (< 8 chars) constant substring
      if (int_cnt2 == (1>>scale2)) { // One byte
        assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
        load_unsigned_byte(result, Address(str2, 0));
        movdl(vec, result); // move 32 bits
      } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes
        // Not enough header space in 32-bit VM: 12+3 = 15.
        movl(result, Address(str2, -1));
        shrl(result, 8);
        movdl(vec, result); // move 32 bits
      } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char
        load_unsigned_short(result, Address(str2, 0));
        movdl(vec, result); // move 32 bits
      } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
        movdl(vec, Address(str2, 0)); // move 32 bits
      } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
        movq(vec, Address(str2, 0));  // move 64 bits
      } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7})
        // Array header size is 12 bytes in 32-bit VM
        // + 6 bytes for 3 chars == 18 bytes,
        // enough space to load vec and shift.
        assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
        if (ae == StrIntrinsicNode::UL) {
          int tail_off = int_cnt2-8;
          pmovzxbw(vec, Address(str2, tail_off));
          psrldq(vec, -2*tail_off);
        }
        else {
          int tail_off = int_cnt2*(1<<scale2);
          movdqu(vec, Address(str2, tail_off-16));
          psrldq(vec, 16-tail_off);
        }
      }
    } else { // not constant substring
      cmpl(cnt2, stride);
      jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough

      // We can read beyond string if str+16 does not cross page boundary
      // since heaps are aligned and mapped by pages.
      assert(os::vm_page_size() < (int)G, "default page should be small");
      movl(result, str2); // We need only low 32 bits
      andl(result, ((int)os::vm_page_size()-1));
      cmpl(result, ((int)os::vm_page_size()-16));
      jccb(Assembler::belowEqual, CHECK_STR);

      // Move small strings to stack to allow load 16 bytes into vec.
      subptr(rsp, 16);
      int stk_offset = wordSize-(1<<scale2);
      push(cnt2);

      // Copy substring backwards, one element per iteration.
      bind(COPY_SUBSTR);
      if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
        load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
        movb(Address(rsp, cnt2, scale2, stk_offset), result);
      } else if (ae == StrIntrinsicNode::UU) {
        load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
        movw(Address(rsp, cnt2, scale2, stk_offset), result);
      }
      decrement(cnt2);
      jccb(Assembler::notZero, COPY_SUBSTR);

      pop(cnt2);
      movptr(str2, rsp);  // New substring address
    } // non constant

    bind(CHECK_STR);
    cmpl(cnt1, stride);
    jccb(Assembler::aboveEqual, BIG_STRINGS);

    // Check cross page boundary.
    movl(result, str1); // We need only low 32 bits
    andl(result, ((int)os::vm_page_size()-1));
    cmpl(result, ((int)os::vm_page_size()-16));
    jccb(Assembler::belowEqual, BIG_STRINGS);

    subptr(rsp, 16);
    int stk_offset = -(1<<scale1);
    if (int_cnt2 < 0) { // not constant
      push(cnt2);
      stk_offset += wordSize;
    }
    movl(cnt2, cnt1);

    // Copy string backwards, one element per iteration.
    bind(COPY_STR);
    if (ae == StrIntrinsicNode::LL) {
      load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
      movb(Address(rsp, cnt2, scale1, stk_offset), result);
    } else {
      load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
      movw(Address(rsp, cnt2, scale1, stk_offset), result);
    }
    decrement(cnt2);
    jccb(Assembler::notZero, COPY_STR);

    if (int_cnt2 < 0) { // not constant
      pop(cnt2);
    }
    movptr(str1, rsp);  // New string address

    bind(BIG_STRINGS);
    // Load substring.
    if (int_cnt2 < 0) { // -1
      if (ae == StrIntrinsicNode::UL) {
        pmovzxbw(vec, Address(str2, 0));
      } else {
        movdqu(vec, Address(str2, 0));
      }
      push(cnt2);       // substr count
      push(str2);       // substr addr
      push(str1);       // string addr
    } else {
      // Small (< 8 chars) constant substrings are loaded already.
      movl(cnt2, int_cnt2);
    }
    push(tmp);  // original SP

  } // Finished loading

  //========================================================
  // Start search
  //

  movptr(result, str1); // string addr

  if (int_cnt2  < 0) {  // Only for non constant substring
    jmpb(SCAN_TO_SUBSTR);

    // SP saved at sp+0
    // String saved at sp+1*wordSize
    // Substr saved at sp+2*wordSize
    // Substr count saved at sp+3*wordSize

    // Reload substr for rescan, this code
    // is executed only for large substrings (> 8 chars)
    bind(RELOAD_SUBSTR);
    movptr(str2, Address(rsp, 2*wordSize));
    movl(cnt2, Address(rsp, 3*wordSize));
    if (ae == StrIntrinsicNode::UL) {
      pmovzxbw(vec, Address(str2, 0));
    } else {
      movdqu(vec, Address(str2, 0));
    }
    // We came here after the beginning of the substring was
    // matched but the rest of it was not so we need to search
    // again. Start from the next element after the previous match.
    subptr(str1, result); // Restore counter
    if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
      shrl(str1, 1);
    }
    addl(cnt1, str1);
    decrementl(cnt1);   // Shift to next element
    cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring

    addptr(result, (1<<scale1));
  } // non constant

  // Scan string for start of substr in 16-byte vectors
  bind(SCAN_TO_SUBSTR);
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  pcmpestri(vec, Address(result, 0), mode);
  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
  subl(cnt1, stride);
  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
  cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
  addptr(result, 16);

  bind(ADJUST_STR);
  cmpl(cnt1, stride); // Do not read beyond string
  jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
  // Back-up string to avoid reading beyond string.
  lea(result, Address(result, cnt1, scale1, -16));
  movl(cnt1, stride);
  jmpb(SCAN_TO_SUBSTR);

  // Found a potential substr
  bind(FOUND_CANDIDATE);
  // After pcmpestri tmp(rcx) contains matched element index

  // Make sure string is still long enough
  subl(cnt1, tmp);
  cmpl(cnt1, cnt2);
  jccb(Assembler::greaterEqual, FOUND_SUBSTR);
  // Left less than substring.

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmp(CLEANUP);

  bind(FOUND_SUBSTR);
  // Compute start addr of substr
  lea(result, Address(result, tmp, scale1));
  if (int_cnt2 > 0) { // Constant substring
    // Repeat search for small substring (< 8 chars)
    // from new point without reloading substring.
    // Have to check that we don't read beyond string.
    cmpl(tmp, stride-int_cnt2);
    jccb(Assembler::greater, ADJUST_STR);
    // Fall through if matched whole substring.
  } else { // non constant
    assert(int_cnt2 == -1, "should be != 0");

    addl(tmp, cnt2);
    // Found result if we matched whole substring.
    cmpl(tmp, stride);
    jcc(Assembler::lessEqual, RET_FOUND);

    // Repeat search for small substring (<= 8 chars)
    // from new point 'str1' without reloading substring.
    cmpl(cnt2, stride);
    // Have to check that we don't read beyond string.
    jccb(Assembler::lessEqual, ADJUST_STR);

    Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
    // Compare the rest of substring (> 8 chars).
    movptr(str1, result);

    cmpl(tmp, cnt2);
    // First 8 chars are already matched.
    jccb(Assembler::equal, CHECK_NEXT);

    bind(SCAN_SUBSTR);
    pcmpestri(vec, Address(str1, 0), mode);
    // Need to reload strings pointers if not matched whole vector
    jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0

    bind(CHECK_NEXT);
    subl(cnt2, stride);
    jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
    addptr(str1, 16);
    if (ae == StrIntrinsicNode::UL) {
      addptr(str2, 8);
    } else {
      addptr(str2, 16);
    }
    subl(cnt1, stride);
    cmpl(cnt2, stride); // Do not read beyond substring
    jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
    // Back-up strings to avoid reading beyond substring.

    if (ae == StrIntrinsicNode::UL) {
      lea(str2, Address(str2, cnt2, scale2, -8));
      lea(str1, Address(str1, cnt2, scale1, -16));
    } else {
      lea(str2, Address(str2, cnt2, scale2, -16));
      lea(str1, Address(str1, cnt2, scale1, -16));
    }
    subl(cnt1, cnt2);
    movl(cnt2, stride);
    addl(cnt1, stride);
    bind(CONT_SCAN_SUBSTR);
    if (ae == StrIntrinsicNode::UL) {
      pmovzxbw(vec, Address(str2, 0));
    } else {
      movdqu(vec, Address(str2, 0));
    }
    jmp(SCAN_SUBSTR);

    bind(RET_FOUND_LONG);
    movptr(str1, Address(rsp, wordSize));
  } // non constant

  bind(RET_FOUND);
  // Compute substr offset
  subptr(result, str1);
  if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
    shrl(result, 1); // byte offset -> char index
  }
  bind(CLEANUP);
  pop(rsp); // restore SP

} // string_indexof
3193
3194 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3195 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3196 ShortBranchVerifier sbv(this);
3197 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3198
3199 int stride = 8;
3200
3201 Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
3202 SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
3203 RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
3204 FOUND_SEQ_CHAR, DONE_LABEL;
3205
3206 movptr(result, str1);
3207 if (UseAVX >= 2) {
3208 cmpl(cnt1, stride);
3209 jcc(Assembler::less, SCAN_TO_CHAR);
3210 cmpl(cnt1, 2*stride);
3211 jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
3212 movdl(vec1, ch);
3213 vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
3214 vpxor(vec2, vec2);
3215 movl(tmp, cnt1);
3216 andl(tmp, 0xFFFFFFF0); //vector count (in chars)
3217 andl(cnt1,0x0000000F); //tail count (in chars)
3218
3219 bind(SCAN_TO_16_CHAR_LOOP);
3220 vmovdqu(vec3, Address(result, 0));
3221 vpcmpeqw(vec3, vec3, vec1, 1);
3222 vptest(vec2, vec3);
3223 jcc(Assembler::carryClear, FOUND_CHAR);
3224 addptr(result, 32);
3225 subl(tmp, 2*stride);
3226 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
3227 jmp(SCAN_TO_8_CHAR);
3228 bind(SCAN_TO_8_CHAR_INIT);
3229 movdl(vec1, ch);
3230 pshuflw(vec1, vec1, 0x00);
3231 pshufd(vec1, vec1, 0);
3232 pxor(vec2, vec2);
3233 }
3234 bind(SCAN_TO_8_CHAR);
3235 cmpl(cnt1, stride);
3236 jcc(Assembler::less, SCAN_TO_CHAR);
3237 if (UseAVX < 2) {
3238 movdl(vec1, ch);
3239 pshuflw(vec1, vec1, 0x00);
3240 pshufd(vec1, vec1, 0);
3241 pxor(vec2, vec2);
3242 }
3243 movl(tmp, cnt1);
3244 andl(tmp, 0xFFFFFFF8); //vector count (in chars)
3245 andl(cnt1,0x00000007); //tail count (in chars)
3246
3247 bind(SCAN_TO_8_CHAR_LOOP);
3248 movdqu(vec3, Address(result, 0));
3249 pcmpeqw(vec3, vec1);
3250 ptest(vec2, vec3);
3251 jcc(Assembler::carryClear, FOUND_CHAR);
3252 addptr(result, 16);
3253 subl(tmp, stride);
3254 jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
3255 bind(SCAN_TO_CHAR);
3256 testl(cnt1, cnt1);
3257 jcc(Assembler::zero, RET_NOT_FOUND);
3258 bind(SCAN_TO_CHAR_LOOP);
3259 load_unsigned_short(tmp, Address(result, 0));
3260 cmpl(ch, tmp);
3261 jccb(Assembler::equal, FOUND_SEQ_CHAR);
3262 addptr(result, 2);
3263 subl(cnt1, 1);
3264 jccb(Assembler::zero, RET_NOT_FOUND);
3265 jmp(SCAN_TO_CHAR_LOOP);
3266
3267 bind(RET_NOT_FOUND);
3268 movl(result, -1);
3269 jmpb(DONE_LABEL);
3270
3271 bind(FOUND_CHAR);
3272 if (UseAVX >= 2) {
3273 vpmovmskb(tmp, vec3);
3274 } else {
3275 pmovmskb(tmp, vec3);
3276 }
3277 bsfl(ch, tmp);
3278 addptr(result, ch);
3279
3280 bind(FOUND_SEQ_CHAR);
3281 subptr(result, str1);
3282 shrl(result, 1);
3283
3284 bind(DONE_LABEL);
3285 } // string_indexof_char
3286
// Find the first occurrence of Latin1 byte 'ch' in string 'str1' of length
// 'cnt1' (in bytes). Scans 32 bytes per iteration with AVX2, then 16 bytes
// with SSE, then a scalar tail loop. 'result' returns the byte index of the
// match, or -1 if not found.
void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");

  int stride = 16;

  Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
        SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
        RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
        FOUND_SEQ_CHAR, DONE_LABEL;

  movptr(result, str1);
  if (UseAVX >= 2) {
    cmpl(cnt1, stride);
    jcc(Assembler::less, SCAN_TO_CHAR_INIT);
    cmpl(cnt1, stride*2);
    jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
    // Broadcast the byte to all 32 lanes of vec1; vec2 stays zero for the
    // vptest-based "any lane matched?" check.
    movdl(vec1, ch);
    vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
    vpxor(vec2, vec2);
    movl(tmp, cnt1);
    andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
    andl(cnt1,0x0000001F);  //tail count (in chars)

    bind(SCAN_TO_32_CHAR_LOOP);
    vmovdqu(vec3, Address(result, 0));
    vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
    vptest(vec2, vec3);
    jcc(Assembler::carryClear, FOUND_CHAR);
    addptr(result, 32);
    subl(tmp, stride*2);
    jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
    jmp(SCAN_TO_16_CHAR);

    bind(SCAN_TO_16_CHAR_INIT);
    // SSE broadcast: pshufb with an all-zero shuffle mask replicates the
    // low byte across the whole register.
    movdl(vec1, ch);
    pxor(vec2, vec2);
    pshufb(vec1, vec2);
  }

  bind(SCAN_TO_16_CHAR);
  cmpl(cnt1, stride);
  jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left
  if (UseAVX < 2) {
    // Not broadcast yet on the AVX path above; do it now.
    movdl(vec1, ch);
    pxor(vec2, vec2);
    pshufb(vec1, vec2);
  }
  movl(tmp, cnt1);
  andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
  andl(cnt1,0x0000000F);  //tail count (in bytes)

  bind(SCAN_TO_16_CHAR_LOOP);
  movdqu(vec3, Address(result, 0));
  pcmpeqb(vec3, vec1);
  ptest(vec2, vec3);
  jcc(Assembler::carryClear, FOUND_CHAR);
  addptr(result, 16);
  subl(tmp, stride);
  jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items...

  // Scalar tail: compare one byte at a time.
  bind(SCAN_TO_CHAR_INIT);
  testl(cnt1, cnt1);
  jcc(Assembler::zero, RET_NOT_FOUND);
  bind(SCAN_TO_CHAR_LOOP);
  load_unsigned_byte(tmp, Address(result, 0));
  cmpl(ch, tmp);
  jccb(Assembler::equal, FOUND_SEQ_CHAR);
  addptr(result, 1);
  subl(cnt1, 1);
  jccb(Assembler::zero, RET_NOT_FOUND);
  jmp(SCAN_TO_CHAR_LOOP);

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmpb(DONE_LABEL);

  bind(FOUND_CHAR);
  // A lane compared equal: locate the first set byte of the comparison mask.
  if (UseAVX >= 2) {
    vpmovmskb(tmp, vec3);
  } else {
    pmovmskb(tmp, vec3);
  }
  bsfl(ch, tmp);
  addptr(result, ch);

  bind(FOUND_SEQ_CHAR);
  // Byte index is simply the address delta (Latin1: 1 byte per char).
  subptr(result, str1);

  bind(DONE_LABEL);
} // stringL_indexof_char
3379
3380 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
3381 switch (eltype) {
3382 case T_BOOLEAN: return sizeof(jboolean);
3383 case T_BYTE: return sizeof(jbyte);
3384 case T_SHORT: return sizeof(jshort);
3385 case T_CHAR: return sizeof(jchar);
3386 case T_INT: return sizeof(jint);
3387 default:
3388 ShouldNotReachHere();
3389 return -1;
3390 }
3391 }
3392
3393 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
3394 switch (eltype) {
3395 // T_BOOLEAN used as surrogate for unsigned byte
3396 case T_BOOLEAN: movzbl(dst, src); break;
3397 case T_BYTE: movsbl(dst, src); break;
3398 case T_SHORT: movswl(dst, src); break;
3399 case T_CHAR: movzwl(dst, src); break;
3400 case T_INT: movl(dst, src); break;
3401 default:
3402 ShouldNotReachHere();
3403 }
3404 }
3405
3406 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) {
3407 load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3408 }
3409
3410 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) {
3411 load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3412 }
3413
3414 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) {
3415 const int vlen = Assembler::AVX_256bit;
3416 switch (eltype) {
3417 case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT); break;
3418 case T_BYTE: vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT); break;
3419 case T_SHORT: vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3420 case T_CHAR: vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3421 case T_INT:
3422 // do nothing
3423 break;
3424 default:
3425 ShouldNotReachHere();
3426 }
3427 }
3428
3429 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result,
3430 Register index, Register tmp2, Register tmp3, XMMRegister vnext,
3431 XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
3432 XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
3433 XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
3434 BasicType eltype) {
3435 ShortBranchVerifier sbv(this);
3436 assert(UseAVX >= 2, "AVX2 intrinsics are required");
3437 assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3);
3438 assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3);
3439
3440 Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN,
3441 SHORT_UNROLLED_LOOP_EXIT,
3442 UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME,
3443 UNROLLED_VECTOR_LOOP_BEGIN,
3444 END;
3445 switch (eltype) {
3446 case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
3447 case T_CHAR: BLOCK_COMMENT("arrays_hashcode(char) {"); break;
3448 case T_BYTE: BLOCK_COMMENT("arrays_hashcode(byte) {"); break;
3449 case T_SHORT: BLOCK_COMMENT("arrays_hashcode(short) {"); break;
3450 case T_INT: BLOCK_COMMENT("arrays_hashcode(int) {"); break;
3451 default: BLOCK_COMMENT("arrays_hashcode {"); break;
3452 }
3453
3454 // For "renaming" for readibility of the code
3455 const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
3456 vresult[] = { vresult0, vresult1, vresult2, vresult3 },
3457 vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };
3458
3459 const int elsize = arrays_hashcode_elsize(eltype);
3460
3461 /*
3462 if (cnt1 >= 2) {
3463 if (cnt1 >= 32) {
3464 UNROLLED VECTOR LOOP
3465 }
3466 UNROLLED SCALAR LOOP
3467 }
3468 SINGLE SCALAR
3469 */
3470
3471 cmpl(cnt1, 32);
3472 jcc(Assembler::less, SHORT_UNROLLED_BEGIN);
3473
3474 // cnt1 >= 32 && generate_vectorized_loop
3475 xorl(index, index);
3476
3477 // vresult = IntVector.zero(I256);
3478 for (int idx = 0; idx < 4; idx++) {
3479 vpxor(vresult[idx], vresult[idx]);
3480 }
3481 // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
3482 Register bound = tmp2;
3483 Register next = tmp3;
3484 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
3485 movl(next, Address(tmp2, 0));
3486 movdl(vnext, next);
3487 vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);
3488
3489 // index = 0;
3490 // bound = cnt1 & ~(32 - 1);
3491 movl(bound, cnt1);
3492 andl(bound, ~(32 - 1));
3493 // for (; index < bound; index += 32) {
3494 bind(UNROLLED_VECTOR_LOOP_BEGIN);
3495 // result *= next;
3496 imull(result, next);
3497 // loop fission to upfront the cost of fetching from memory, OOO execution
3498 // can then hopefully do a better job of prefetching
3499 for (int idx = 0; idx < 4; idx++) {
3500 arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
3501 }
3502 // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
3503 for (int idx = 0; idx < 4; idx++) {
3504 vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
3505 arrays_hashcode_elvcast(vtmp[idx], eltype);
3506 vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
3507 }
3508 // index += 32;
3509 addl(index, 32);
3510 // index < bound;
3511 cmpl(index, bound);
3512 jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
3513 // }
3514
3515 lea(ary1, Address(ary1, bound, Address::times(elsize)));
3516 subl(cnt1, bound);
3517 // release bound
3518
3519 // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
3520 for (int idx = 0; idx < 4; idx++) {
3521 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint))));
3522 arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT);
3523 vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
3524 }
3525 // result += vresult.reduceLanes(ADD);
3526 for (int idx = 0; idx < 4; idx++) {
3527 reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
3528 }
3529
3530 // } else if (cnt1 < 32) {
3531
3532 bind(SHORT_UNROLLED_BEGIN);
3533 // int i = 1;
3534 movl(index, 1);
3535 cmpl(index, cnt1);
3536 jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);
3537
3538 // for (; i < cnt1 ; i += 2) {
3539 bind(SHORT_UNROLLED_LOOP_BEGIN);
3540 movl(tmp3, 961);
3541 imull(result, tmp3);
3542 arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3543 movl(tmp3, tmp2);
3544 shll(tmp3, 5);
3545 subl(tmp3, tmp2);
3546 addl(result, tmp3);
3547 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype);
3548 addl(result, tmp3);
3549 addl(index, 2);
3550 cmpl(index, cnt1);
3551 jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN);
3552
3553 // }
3554 // if (i >= cnt1) {
3555 bind(SHORT_UNROLLED_LOOP_EXIT);
3556 jccb(Assembler::greater, END);
3557 movl(tmp2, result);
3558 shll(result, 5);
3559 subl(result, tmp2);
3560 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3561 addl(result, tmp3);
3562 // }
3563 bind(END);
3564
3565 BLOCK_COMMENT("} // arrays_hashcode");
3566
3567 } // arrays_hashcode
3568
3569 // helper function for string_compare
3570 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3571 Address::ScaleFactor scale, Address::ScaleFactor scale1,
3572 Address::ScaleFactor scale2, Register index, int ae) {
3573 if (ae == StrIntrinsicNode::LL) {
3574 load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3575 load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3576 } else if (ae == StrIntrinsicNode::UU) {
3577 load_unsigned_short(elem1, Address(str1, index, scale, 0));
3578 load_unsigned_short(elem2, Address(str2, index, scale, 0));
3579 } else {
3580 load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3581 load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3582 }
3583 }
3584
// Compare strings, used for char[] and byte[].
//
// Emits code computing a strcmp-style result into 'result': negative if
// str1 < str2, zero if equal, positive if str1 > str2. 'ae' encodes the
// element widths (StrIntrinsicNode::LL/UU/LU/UL; L = Latin-1 bytes,
// U = UTF-16 chars). cnt1/cnt2 hold the element counts on entry and are
// clobbered, as are str1/str2. 'mask' is used only on AVX-512 paths.
// NOTE(review): result/cnt2/cnt1 must be rax/rdx/rcx (asserted below)
// because pcmpestri implicitly consumes rax/rdx and writes rcx.
void C2_MacroAssembler::string_compare(Register str1, Register str2,
                                       Register cnt1, Register cnt2, Register result,
                                       XMMRegister vec1, int ae, KRegister mask) {
  ShortBranchVerifier sbv(this);
  Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
  Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only AVX3
  int stride, stride2, adr_stride, adr_stride1, adr_stride2;
  int stride2x2 = 0x40;   // bytes consumed per AVX-512 loop iteration (LL)
  Address::ScaleFactor scale = Address::no_scale;
  Address::ScaleFactor scale1 = Address::no_scale;
  Address::ScaleFactor scale2 = Address::no_scale;

  if (ae != StrIntrinsicNode::LL) {
    // non-LL paths advance 0x20 elements per 64-byte AVX-512 iteration
    stride2x2 = 0x20;
  }

  if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
    // mixed encodings: cnt2 arrives as a byte count of the UTF-16 side,
    // convert to a char count
    shrl(cnt2, 1);
  }
  // Compute the minimum of the string lengths and the
  // difference of the string lengths (stack).
  // Do the conditional move stuff
  movl(result, cnt1);
  subl(cnt1, cnt2);
  push(cnt1);   // save length difference; popped at POP_LABEL / LENGTH_DIFF_LABEL
  cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)

  // Is the minimum length zero?
  testl(cnt2, cnt2);
  jcc(Assembler::zero, LENGTH_DIFF_LABEL);
  if (ae == StrIntrinsicNode::LL) {
    // Load first bytes
    load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
    load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
  } else if (ae == StrIntrinsicNode::UU) {
    // Load first characters
    load_unsigned_short(result, Address(str1, 0));
    load_unsigned_short(cnt1, Address(str2, 0));
  } else {
    load_unsigned_byte(result, Address(str1, 0));
    load_unsigned_short(cnt1, Address(str2, 0));
  }
  subl(result, cnt1);
  jcc(Assembler::notZero, POP_LABEL);  // first elements differ: result already set

  if (ae == StrIntrinsicNode::UU) {
    // Divide length by 2 to get number of chars
    shrl(cnt2, 1);
  }
  cmpl(cnt2, 1);
  jcc(Assembler::equal, LENGTH_DIFF_LABEL);  // only one element and it matched

  // Check if the strings start at the same location and setup scale and stride
  if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
    cmpptr(str1, str2);
    jcc(Assembler::equal, LENGTH_DIFF_LABEL);  // same array: equal up to min length
    if (ae == StrIntrinsicNode::LL) {
      scale = Address::times_1;
      stride = 16;
    } else {
      scale = Address::times_2;
      stride = 8;
    }
  } else {
    scale1 = Address::times_1;
    scale2 = Address::times_2;
    // scale not used
    stride = 8;
  }

  if (UseAVX >= 2 && UseSSE42Intrinsics) {
    Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
    Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
    Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
    Label COMPARE_TAIL_LONG;
    Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only AVX3

    // pcmpestri imm8: 11000b = equal-each aggregation with negated result,
    // low bits select unsigned words (01) or, after clearing bit 0, bytes (00)
    int pcmpmask = 0x19;
    if (ae == StrIntrinsicNode::LL) {
      pcmpmask &= ~0x01;
    }

    // Setup to compare 16-chars (32-bytes) vectors,
    // start from first character again because it has aligned address.
    if (ae == StrIntrinsicNode::LL) {
      stride2 = 32;
    } else {
      stride2 = 16;
    }
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      adr_stride = stride << scale;
    } else {
      adr_stride1 = 8;  //stride << scale1;
      adr_stride2 = 16; //stride << scale2;
    }

    assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
    // rax and rdx are used by pcmpestri as elements counters
    movl(result, cnt2);
    andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count
    jcc(Assembler::zero, COMPARE_TAIL_LONG);

    // fast path : compare first 2 8-char vectors.
    bind(COMPARE_16_CHARS);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, 0));
    } else {
      pmovzxbw(vec1, Address(str1, 0));  // widen Latin-1 bytes to chars
    }
    pcmpestri(vec1, Address(str2, 0), pcmpmask);
    jccb(Assembler::below, COMPARE_INDEX_CHAR);  // CF==1: mismatch, index in rcx

    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, adr_stride));
      pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
    } else {
      pmovzxbw(vec1, Address(str1, adr_stride1));
      pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
    }
    jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);  // first 16 chars equal
    addl(cnt1, stride);  // mismatch was in the second 8-char vector

    // Compare the characters at index in cnt1
    bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
    load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
    subl(result, cnt2);
    jmp(POP_LABEL);

    // Setup the registers to start vector comparison loop
    bind(COMPARE_WIDE_VECTORS);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    subl(result, stride2);
    subl(cnt2, stride2);
    jcc(Assembler::zero, COMPARE_WIDE_TAIL);
    negptr(result);  // result is now the negative offset from the string ends

    // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
    bind(COMPARE_WIDE_VECTORS_LOOP);

    if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
      cmpl(cnt2, stride2x2);
      jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
      testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40

      bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
      if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
        evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
        evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
      } else {
        vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
        evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
      }
      kortestql(mask, mask);
      jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
      addptr(result, stride2x2);  // update since we already compared at this addr
      subl(cnt2, stride2x2);      // and sub the size too
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);

      vpxor(vec1, vec1);  // clean upper bits of YMM registers before 256-bit tail
      jmpb(COMPARE_WIDE_TAIL);
    }//if (VM_Version::supports_avx512vlbw())

    bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      vmovdqu(vec1, Address(str1, result, scale));
      vpxor(vec1, Address(str2, result, scale));   // xor is zero iff 32 bytes equal
    } else {
      vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
      vpxor(vec1, Address(str2, result, scale2));
    }
    vptest(vec1, vec1);
    jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
    addptr(result, stride2);
    subl(cnt2, stride2);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);

    // compare wide vectors tail
    bind(COMPARE_WIDE_TAIL);
    testptr(result, result);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);  // no remainder: equal up to min length

    // re-run the AVX2 loop once over the last full vector ending at the strings' ends
    movl(result, stride2);
    movl(cnt2, result);
    negptr(result);
    jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);

    // Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors.
    bind(VECTOR_NOT_EQUAL);
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    jmp(COMPARE_16_CHARS);  // pinpoint the mismatching element with pcmpestri

    // Compare tail chars, length between 1 to 15 chars
    bind(COMPARE_TAIL_LONG);
    movl(cnt2, result);
    cmpl(cnt2, stride);
    jcc(Assembler::less, COMPARE_SMALL_STR);

    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, 0));
    } else {
      pmovzxbw(vec1, Address(str1, 0));
    }
    pcmpestri(vec1, Address(str2, 0), pcmpmask);
    jcc(Assembler::below, COMPARE_INDEX_CHAR);
    subptr(cnt2, stride);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    negptr(cnt2);
    jmpb(WHILE_HEAD_LABEL);

    bind(COMPARE_SMALL_STR);
  } else if (UseSSE42Intrinsics) {
    Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
    int pcmpmask = 0x19;
    // Setup to compare 8-char (16-byte) vectors,
    // start from first character again because it has aligned address.
    movl(result, cnt2);
    andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count
    if (ae == StrIntrinsicNode::LL) {
      pcmpmask &= ~0x01;  // switch pcmpestri element size to unsigned bytes
    }
    jcc(Assembler::zero, COMPARE_TAIL);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    negptr(result);

    // pcmpestri
    //   inputs:
    //     vec1- substring
    //     rax - negative string length (elements count)
    //     mem - scanned string
    //     rdx - string length (elements count)
    //     pcmpmask - cmp mode: 11000 (string compare with negated result)
    //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
    //   outputs:
    //     rcx - first mismatched element index
    assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");

    bind(COMPARE_WIDE_VECTORS);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, result, scale));
      pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
    } else {
      pmovzxbw(vec1, Address(str1, result, scale1));
      pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
    }
    // After pcmpestri cnt1(rcx) contains mismatched element index

    jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
    addptr(result, stride);
    subptr(cnt2, stride);
    jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

    // compare wide vectors tail
    testptr(result, result);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);  // no remainder: equal up to min length

    // compare the last full vector ending at the strings' ends
    movl(cnt2, stride);
    movl(result, stride);
    negptr(result);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, result, scale));
      pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
    } else {
      pmovzxbw(vec1, Address(str1, result, scale1));
      pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
    }
    jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);

    // Mismatched characters in the vectors
    bind(VECTOR_NOT_EQUAL);
    addptr(cnt1, result);  // cnt1 = absolute index of the mismatching element
    load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
    subl(result, cnt2);
    jmpb(POP_LABEL);

    bind(COMPARE_TAIL); // limit is zero
    movl(cnt2, result);
    // Fallthru to tail compare
  }
  // Shift str2 and str1 to the end of the arrays, negate min
  if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
    lea(str1, Address(str1, cnt2, scale));
    lea(str2, Address(str2, cnt2, scale));
  } else {
    lea(str1, Address(str1, cnt2, scale1));
    lea(str2, Address(str2, cnt2, scale2));
  }
  decrementl(cnt2);  // first character was compared already
  negptr(cnt2);

  // Compare the rest of the elements
  bind(WHILE_HEAD_LABEL);
  load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
  subl(result, cnt1);
  jccb(Assembler::notZero, POP_LABEL);
  increment(cnt2);
  jccb(Assembler::notZero, WHILE_HEAD_LABEL);

  // Strings are equal up to min length.  Return the length difference.
  bind(LENGTH_DIFF_LABEL);
  pop(result);  // restore the saved length difference
  if (ae == StrIntrinsicNode::UU) {
    // Divide diff by 2 to get number of chars
    sarl(result, 1);
  }
  jmpb(DONE_LABEL);

  if (VM_Version::supports_avx512vlbw()) {

    bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);

    // mask has a 0 bit at each mismatching byte position; find the first one
    kmovql(cnt1, mask);
    notq(cnt1);
    bsfq(cnt2, cnt1);
    if (ae != StrIntrinsicNode::LL) {
      // Divide diff by 2 to get number of chars
      sarl(cnt2, 1);
    }
    addq(result, cnt2);  // result = element index of the mismatch
    if (ae == StrIntrinsicNode::LL) {
      load_unsigned_byte(cnt1, Address(str2, result));
      load_unsigned_byte(result, Address(str1, result));
    } else if (ae == StrIntrinsicNode::UU) {
      load_unsigned_short(cnt1, Address(str2, result, scale));
      load_unsigned_short(result, Address(str1, result, scale));
    } else {
      load_unsigned_short(cnt1, Address(str2, result, scale2));
      load_unsigned_byte(result, Address(str1, result, scale1));
    }
    subl(result, cnt1);
    jmpb(POP_LABEL);
  }//if (VM_Version::supports_avx512vlbw())

  // Discard the stored length difference
  bind(POP_LABEL);
  pop(cnt1);

  // That's it
  bind(DONE_LABEL);
  if(ae == StrIntrinsicNode::UL) {
    // UL swapped the operands relative to the caller's order; flip the sign
    negl(result);
  }

}
3959
3960 // Search for Non-ASCII character (Negative byte value) in a byte array,
3961 // return the index of the first such character, otherwise the length
3962 // of the array segment searched.
3963 // ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
3964 // @IntrinsicCandidate
3965 // public static int countPositives(byte[] ba, int off, int len) {
3966 // for (int i = off; i < off + len; i++) {
3967 // if (ba[i] < 0) {
3968 // return i - off;
3969 // }
3970 // }
3971 // return len;
3972 // }
void C2_MacroAssembler::count_positives(Register ary1, Register len,
                                        Register result, Register tmp1,
                                        XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
  // Emits code that leaves in 'result' the number of leading non-negative
  // bytes of ary1[0..len): len if all bytes are positive, otherwise the
  // index of the first negative byte. ary1/len/tmp1 are clobbered.
  // rsi: byte array
  // rcx: len
  // rax: result
  ShortBranchVerifier sbv(this);
  assert_different_registers(ary1, len, result, tmp1);
  assert_different_registers(vec1, vec2);
  Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;

  movl(result, len); // copy
  // len == 0
  testl(len, len);
  jcc(Assembler::zero, DONE);

  if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
    VM_Version::supports_avx512vlbw() &&
    VM_Version::supports_bmi2()) {

    Label test_64_loop, test_tail, BREAK_LOOP;
    movl(tmp1, len);
    vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);  // vec2 = all-zero comparand

    andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F
    andl(len, 0xffffffc0); // vector count (in chars)
    jccb(Assembler::zero, test_tail);

    lea(ary1, Address(ary1, len, Address::times_1));
    negptr(len);  // len counts up from -vector_bytes to 0

    bind(test_64_loop);
    // Check whether our 64 elements of size byte contain negatives
    // (signed compare 0 > byte sets the corresponding mask bit)
    evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
    kortestql(mask1, mask1);
    jcc(Assembler::notZero, BREAK_LOOP);

    addptr(len, 64);
    jccb(Assembler::notZero, test_64_loop);

    bind(test_tail);
    // bail out when there is nothing to be done
    testl(tmp1, -1);  // sets ZF iff tmp1 == 0 (no tail bytes)
    jcc(Assembler::zero, DONE);


    // check the tail for absence of negatives
    // ~(~0 << len) applied up to two times (for 32-bit scenario)
    {
      Register tmp3_aliased = len;  // len's vector part is consumed; reuse it
      mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
      shlxq(tmp3_aliased, tmp3_aliased, tmp1);
      notq(tmp3_aliased);
      kmovql(mask2, tmp3_aliased);  // mask2 selects only the tmp1 tail bytes
    }

    evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
    ktestq(mask1, mask2);
    jcc(Assembler::zero, DONE);  // no negatives in the tail: result == len already

    // do a full check for negative registers in the tail
    movl(len, tmp1); // tmp1 holds low 6-bit from original len;
                     // ary1 already pointing to the right place
    jmpb(TAIL_START);

    bind(BREAK_LOOP);
    // At least one byte in the last 64 byte block was negative.
    // Set up to look at the last 64 bytes as if they were a tail
    lea(ary1, Address(ary1, len, Address::times_1));
    addptr(result, len);
    // Ignore the very last byte: if all others are positive,
    // it must be negative, so we can skip right to the 2+1 byte
    // end comparison at this point
    orl(result, 63);
    movl(len, 63);
    // Fallthru to tail compare
  } else {

    if (UseAVX >= 2) {
      // With AVX2, use 32-byte vector compare
      Label COMPARE_WIDE_VECTORS, BREAK_LOOP;

      // Compare 32-byte vectors
      testl(len, 0xffffffe0);   // vector count (in bytes)
      jccb(Assembler::zero, TAIL_START);

      andl(len, 0xffffffe0);
      lea(ary1, Address(ary1, len, Address::times_1));
      negptr(len);  // len counts up from -vector_bytes to 0

      movl(tmp1, 0x80808080);   // create mask to test for Unicode chars in vector
      movdl(vec2, tmp1);
      vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);

      bind(COMPARE_WIDE_VECTORS);
      vmovdqu(vec1, Address(ary1, len, Address::times_1));
      vptest(vec1, vec2);   // ZF set iff no byte has its sign bit set
      jccb(Assembler::notZero, BREAK_LOOP);
      addptr(len, 32);
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

      testl(result, 0x0000001f);   // any bytes remaining?
      jcc(Assembler::zero, DONE);

      // Quick test using the already prepared vector mask
      movl(len, result);
      andl(len, 0x0000001f);
      vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));  // last 32 bytes (overlaps checked area)
      vptest(vec1, vec2);
      jcc(Assembler::zero, DONE);
      // There are zeros, jump to the tail to determine exactly where
      jmpb(TAIL_START);

      bind(BREAK_LOOP);
      // At least one byte in the last 32-byte vector is negative.
      // Set up to look at the last 32 bytes as if they were a tail
      lea(ary1, Address(ary1, len, Address::times_1));
      addptr(result, len);
      // Ignore the very last byte: if all others are positive,
      // it must be negative, so we can skip right to the 2+1 byte
      // end comparison at this point
      orl(result, 31);
      movl(len, 31);
      // Fallthru to tail compare
    } else if (UseSSE42Intrinsics) {
      // With SSE4.2, use double quad vector compare
      Label COMPARE_WIDE_VECTORS, BREAK_LOOP;

      // Compare 16-byte vectors
      testl(len, 0xfffffff0);   // vector count (in bytes)
      jcc(Assembler::zero, TAIL_START);

      andl(len, 0xfffffff0);
      lea(ary1, Address(ary1, len, Address::times_1));
      negptr(len);

      movl(tmp1, 0x80808080);  // sign-bit mask, one per byte lane
      movdl(vec2, tmp1);
      pshufd(vec2, vec2, 0);   // broadcast to all four dwords

      bind(COMPARE_WIDE_VECTORS);
      movdqu(vec1, Address(ary1, len, Address::times_1));
      ptest(vec1, vec2);  // ZF set iff no byte has its sign bit set
      jccb(Assembler::notZero, BREAK_LOOP);
      addptr(len, 16);
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

      testl(result, 0x0000000f); // len is zero, any bytes remaining?
      jcc(Assembler::zero, DONE);

      // Quick test using the already prepared vector mask
      movl(len, result);
      andl(len, 0x0000000f);   // tail count (in bytes)
      movdqu(vec1, Address(ary1, len, Address::times_1, -16));  // last 16 bytes (overlaps checked area)
      ptest(vec1, vec2);
      jcc(Assembler::zero, DONE);
      jmpb(TAIL_START);

      bind(BREAK_LOOP);
      // At least one byte in the last 16-byte vector is negative.
      // Set up and look at the last 16 bytes as if they were a tail
      lea(ary1, Address(ary1, len, Address::times_1));
      addptr(result, len);
      // Ignore the very last byte: if all others are positive,
      // it must be negative, so we can skip right to the 2+1 byte
      // end comparison at this point
      orl(result, 15);
      movl(len, 15);
      // Fallthru to tail compare
    }
  }

  bind(TAIL_START);
  // Compare 4-byte vectors
  andl(len, 0xfffffffc); // vector count (in bytes)
  jccb(Assembler::zero, COMPARE_CHAR);

  lea(ary1, Address(ary1, len, Address::times_1));
  negptr(len);

  bind(COMPARE_VECTORS);
  movl(tmp1, Address(ary1, len, Address::times_1));
  andl(tmp1, 0x80808080);  // any sign bit set in these 4 bytes?
  jccb(Assembler::notZero, TAIL_ADJUST);
  addptr(len, 4);
  jccb(Assembler::notZero, COMPARE_VECTORS);

  // Compare trailing char (final 2-3 bytes), if any
  bind(COMPARE_CHAR);

  testl(result, 0x2);   // tail  char
  jccb(Assembler::zero, COMPARE_BYTE);
  load_unsigned_short(tmp1, Address(ary1, 0));
  andl(tmp1, 0x00008080);  // sign bits of the two tail-char bytes
  jccb(Assembler::notZero, CHAR_ADJUST);
  lea(ary1, Address(ary1, 2));

  bind(COMPARE_BYTE);
  testl(result, 0x1);   // tail  byte
  jccb(Assembler::zero, DONE);
  load_unsigned_byte(tmp1, Address(ary1, 0));
  testl(tmp1, 0x00000080);
  jccb(Assembler::zero, DONE);
  subptr(result, 1);  // last byte is negative: exclude it from the count
  jmpb(DONE);

  bind(TAIL_ADJUST);
  // there are negative bits in the last 4 byte block.
  // Adjust result and check the next three bytes
  addptr(result, len);
  orl(result, 3);
  lea(ary1, Address(ary1, len, Address::times_1));
  jmpb(COMPARE_CHAR);

  bind(CHAR_ADJUST);
  // We are looking at a char + optional byte tail, and found that one
  // of the bytes in the char is negative. Adjust the result, check the
  // first byte and readjust if needed.
  andl(result, 0xfffffffc);
  testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
  jccb(Assembler::notZero, DONE);
  addptr(result, 1);

  // That's it
  bind(DONE);
  if (UseAVX >= 2) {
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    vpxor(vec2, vec2);
  }
}
4204
4205 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
4206 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
4207 Register limit, Register result, Register chr,
4208 XMMRegister vec1, XMMRegister vec2, bool is_char,
4209 KRegister mask, bool expand_ary2) {
4210 // for expand_ary2, limit is the (smaller) size of the second array.
4211 ShortBranchVerifier sbv(this);
4212 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
4213
4214 assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)),
4215 "Expansion only implemented for AVX2");
4216
4217 int length_offset = arrayOopDesc::length_offset_in_bytes();
4218 int base_offset = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
4219
4220 Address::ScaleFactor scaleFactor = expand_ary2 ? Address::times_2 : Address::times_1;
4221 int scaleIncr = expand_ary2 ? 8 : 16;
4222
4223 if (is_array_equ) {
4224 // Check the input args
4225 cmpoop(ary1, ary2);
4226 jcc(Assembler::equal, TRUE_LABEL);
4227
4228 // Need additional checks for arrays_equals.
4229 testptr(ary1, ary1);
4230 jcc(Assembler::zero, FALSE_LABEL);
4231 testptr(ary2, ary2);
4232 jcc(Assembler::zero, FALSE_LABEL);
4233
4234 // Check the lengths
4235 movl(limit, Address(ary1, length_offset));
4236 cmpl(limit, Address(ary2, length_offset));
4237 jcc(Assembler::notEqual, FALSE_LABEL);
4238 }
4239
4240 // count == 0
4241 testl(limit, limit);
4242 jcc(Assembler::zero, TRUE_LABEL);
4243
4244 if (is_array_equ) {
4245 // Load array address
4246 lea(ary1, Address(ary1, base_offset));
4247 lea(ary2, Address(ary2, base_offset));
4248 }
4249
4250 if (is_array_equ && is_char) {
4251 // arrays_equals when used for char[].
4252 shll(limit, 1); // byte count != 0
4253 }
4254 movl(result, limit); // copy
4255
4256 if (UseAVX >= 2) {
4257 // With AVX2, use 32-byte vector compare
4258 Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16;
4259
4260 // Compare 32-byte vectors
4261 if (expand_ary2) {
4262 andl(result, 0x0000000f); // tail count (in bytes)
4263 andl(limit, 0xfffffff0); // vector count (in bytes)
4264 jcc(Assembler::zero, COMPARE_TAIL);
4265 } else {
4266 andl(result, 0x0000001f); // tail count (in bytes)
4267 andl(limit, 0xffffffe0); // vector count (in bytes)
4268 jcc(Assembler::zero, COMPARE_TAIL_16);
4269 }
4270
4271 lea(ary1, Address(ary1, limit, scaleFactor));
4272 lea(ary2, Address(ary2, limit, Address::times_1));
4273 negptr(limit);
4274
4275 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
4276 Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
4277
4278 cmpl(limit, -64);
4279 jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
4280
4281 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
4282
4283 evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
4284 evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
4285 kortestql(mask, mask);
4286 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare
4287 addptr(limit, 64); // update since we already compared at this addr
4288 cmpl(limit, -64);
4289 jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
4290
4291 // At this point we may still need to compare -limit+result bytes.
4292 // We could execute the next two instruction and just continue via non-wide path:
4293 // cmpl(limit, 0);
4294 // jcc(Assembler::equal, COMPARE_TAIL); // true
4295 // But since we stopped at the points ary{1,2}+limit which are
4296 // not farther than 64 bytes from the ends of arrays ary{1,2}+result
4297 // (|limit| <= 32 and result < 32),
4298 // we may just compare the last 64 bytes.
4299 //
4300 addptr(result, -64); // it is safe, bc we just came from this area
4301 evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
4302 evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
4303 kortestql(mask, mask);
4304 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare
4305
4306 jmp(TRUE_LABEL);
4307
4308 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
4309
4310 }//if (VM_Version::supports_avx512vlbw())
4311
4312 bind(COMPARE_WIDE_VECTORS);
4313 vmovdqu(vec1, Address(ary1, limit, scaleFactor));
4314 if (expand_ary2) {
4315 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit);
4316 } else {
4317 vmovdqu(vec2, Address(ary2, limit, Address::times_1));
4318 }
4319 vpxor(vec1, vec2);
4320
4321 vptest(vec1, vec1);
4322 jcc(Assembler::notZero, FALSE_LABEL);
4323 addptr(limit, scaleIncr * 2);
4324 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4325
4326 testl(result, result);
4327 jcc(Assembler::zero, TRUE_LABEL);
4328
4329 vmovdqu(vec1, Address(ary1, result, scaleFactor, -32));
4330 if (expand_ary2) {
4331 vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit);
4332 } else {
4333 vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
4334 }
4335 vpxor(vec1, vec2);
4336
4337 vptest(vec1, vec1);
4338 jcc(Assembler::notZero, FALSE_LABEL);
4339 jmp(TRUE_LABEL);
4340
4341 bind(COMPARE_TAIL_16); // limit is zero
4342 movl(limit, result);
4343
4344 // Compare 16-byte chunks
4345 andl(result, 0x0000000f); // tail count (in bytes)
4346 andl(limit, 0xfffffff0); // vector count (in bytes)
4347 jcc(Assembler::zero, COMPARE_TAIL);
4348
4349 lea(ary1, Address(ary1, limit, scaleFactor));
4350 lea(ary2, Address(ary2, limit, Address::times_1));
4351 negptr(limit);
4352
4353 bind(COMPARE_WIDE_VECTORS_16);
4354 movdqu(vec1, Address(ary1, limit, scaleFactor));
4355 if (expand_ary2) {
4356 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit);
4357 } else {
4358 movdqu(vec2, Address(ary2, limit, Address::times_1));
4359 }
4360 pxor(vec1, vec2);
4361
4362 ptest(vec1, vec1);
4363 jcc(Assembler::notZero, FALSE_LABEL);
4364 addptr(limit, scaleIncr);
4365 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16);
4366
4367 bind(COMPARE_TAIL); // limit is zero
4368 movl(limit, result);
4369 // Fallthru to tail compare
4370 } else if (UseSSE42Intrinsics) {
4371 // With SSE4.2, use double quad vector compare
4372 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4373
4374 // Compare 16-byte vectors
4375 andl(result, 0x0000000f); // tail count (in bytes)
4376 andl(limit, 0xfffffff0); // vector count (in bytes)
4377 jcc(Assembler::zero, COMPARE_TAIL);
4378
4379 lea(ary1, Address(ary1, limit, Address::times_1));
4380 lea(ary2, Address(ary2, limit, Address::times_1));
4381 negptr(limit);
4382
4383 bind(COMPARE_WIDE_VECTORS);
4384 movdqu(vec1, Address(ary1, limit, Address::times_1));
4385 movdqu(vec2, Address(ary2, limit, Address::times_1));
4386 pxor(vec1, vec2);
4387
4388 ptest(vec1, vec1);
4389 jcc(Assembler::notZero, FALSE_LABEL);
4390 addptr(limit, 16);
4391 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4392
4393 testl(result, result);
4394 jcc(Assembler::zero, TRUE_LABEL);
4395
4396 movdqu(vec1, Address(ary1, result, Address::times_1, -16));
4397 movdqu(vec2, Address(ary2, result, Address::times_1, -16));
4398 pxor(vec1, vec2);
4399
4400 ptest(vec1, vec1);
4401 jccb(Assembler::notZero, FALSE_LABEL);
4402 jmpb(TRUE_LABEL);
4403
4404 bind(COMPARE_TAIL); // limit is zero
4405 movl(limit, result);
4406 // Fallthru to tail compare
4407 }
4408
4409 // Compare 4-byte vectors
4410 if (expand_ary2) {
4411 testl(result, result);
4412 jccb(Assembler::zero, TRUE_LABEL);
4413 } else {
4414 andl(limit, 0xfffffffc); // vector count (in bytes)
4415 jccb(Assembler::zero, COMPARE_CHAR);
4416 }
4417
4418 lea(ary1, Address(ary1, limit, scaleFactor));
4419 lea(ary2, Address(ary2, limit, Address::times_1));
4420 negptr(limit);
4421
4422 bind(COMPARE_VECTORS);
4423 if (expand_ary2) {
4424 // There are no "vector" operations for bytes to shorts
4425 movzbl(chr, Address(ary2, limit, Address::times_1));
4426 cmpw(Address(ary1, limit, Address::times_2), chr);
4427 jccb(Assembler::notEqual, FALSE_LABEL);
4428 addptr(limit, 1);
4429 jcc(Assembler::notZero, COMPARE_VECTORS);
4430 jmp(TRUE_LABEL);
4431 } else {
4432 movl(chr, Address(ary1, limit, Address::times_1));
4433 cmpl(chr, Address(ary2, limit, Address::times_1));
4434 jccb(Assembler::notEqual, FALSE_LABEL);
4435 addptr(limit, 4);
4436 jcc(Assembler::notZero, COMPARE_VECTORS);
4437 }
4438
4439 // Compare trailing char (final 2 bytes), if any
4440 bind(COMPARE_CHAR);
4441 testl(result, 0x2); // tail char
4442 jccb(Assembler::zero, COMPARE_BYTE);
4443 load_unsigned_short(chr, Address(ary1, 0));
4444 load_unsigned_short(limit, Address(ary2, 0));
4445 cmpl(chr, limit);
4446 jccb(Assembler::notEqual, FALSE_LABEL);
4447
4448 if (is_array_equ && is_char) {
4449 bind(COMPARE_BYTE);
4450 } else {
4451 lea(ary1, Address(ary1, 2));
4452 lea(ary2, Address(ary2, 2));
4453
4454 bind(COMPARE_BYTE);
4455 testl(result, 0x1); // tail byte
4456 jccb(Assembler::zero, TRUE_LABEL);
4457 load_unsigned_byte(chr, Address(ary1, 0));
4458 load_unsigned_byte(limit, Address(ary2, 0));
4459 cmpl(chr, limit);
4460 jccb(Assembler::notEqual, FALSE_LABEL);
4461 }
4462 bind(TRUE_LABEL);
4463 movl(result, 1); // return true
4464 jmpb(DONE);
4465
4466 bind(FALSE_LABEL);
4467 xorl(result, result); // return false
4468
4469 // That's it
4470 bind(DONE);
4471 if (UseAVX >= 2) {
4472 // clean upper bits of YMM registers
4473 vpxor(vec1, vec1);
4474 vpxor(vec2, vec2);
4475 }
4476 }
4477
// Out-of-line slow path for convertF2I. Reached when the fast-path truncating
// conversion produced the special "indefinite" bit pattern, meaning the source
// may be NaN or out of range. Spills the source XMM register to the stack,
// calls the matching fixup stub (f2i/f2l/d2i/d2l), and pops the corrected
// result into dst. NOTE: the stub size is accounted for in convertF2I's
// max_size computation, so keep the emitted sequence in sync with it.
static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) {
#define __ masm.
  Register dst = stub.data<0>();
  XMMRegister src = stub.data<1>();
  address target = stub.data<2>();
  __ bind(stub.entry());
  __ subptr(rsp, 8);                // reserve a stack slot for the argument
  __ movdbl(Address(rsp), src);     // pass src to the fixup stub on the stack
  __ call(RuntimeAddress(target));
  // APX REX2 encoding for pop(dst) increases the stub size by 1 byte.
  __ pop(dst);                      // fetch the result left in the stack slot
  __ jmp(stub.continuation());
#undef __
}
4492
// Scalar float/double -> int/long conversion with Java semantics.
// The truncating cvtt* instruction yields the "integer indefinite" value
// (0x80000000 for int, 0x8000000000000000 for long) for NaN and out-of-range
// inputs; when the result equals that pattern, an out-of-line stub
// (f2i/f2l/d2i/d2l fixup) is called to produce the Java-mandated result.
void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) {
  assert(dst_bt == T_INT || dst_bt == T_LONG, "");
  assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, "");

  address slowpath_target;
  if (dst_bt == T_INT) {
    if (src_bt == T_FLOAT) {
      cvttss2sil(dst, src);
      cmpl(dst, 0x80000000);  // int indefinite => possibly special input
      slowpath_target = StubRoutines::x86::f2i_fixup();
    } else {
      cvttsd2sil(dst, src);
      cmpl(dst, 0x80000000);
      slowpath_target = StubRoutines::x86::d2i_fixup();
    }
  } else {
    if (src_bt == T_FLOAT) {
      cvttss2siq(dst, src);
      // double_sign_flip holds the 64-bit sign-bit pattern, i.e. the long
      // indefinite value produced by cvtt*2siq on special inputs.
      cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
      slowpath_target = StubRoutines::x86::f2l_fixup();
    } else {
      cvttsd2siq(dst, src);
      cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
      slowpath_target = StubRoutines::x86::d2l_fixup();
    }
  }

  // Using the APX extended general purpose registers increases the instruction encoding size by 1 byte.
  int max_size = 23 + (UseAPX ? 1 : 0);
  auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, max_size, convertF2I_slowpath);
  jcc(Assembler::equal, stub->entry());
  bind(stub->continuation());
}
4526
// Dispatches a lane-masked EVEX shift/rotate with an immediate count.
// 'mask' predicates the destination lanes; 'merge' is forwarded to the
// assembler's masked encodings (merge- vs zero-masking of inactive lanes).
// 'eType' is only needed by the rotate helpers, which pick the element width.
void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
                                    XMMRegister src1, int imm8, bool merge, int vlen_enc) {
  switch(ideal_opc) {
    case Op_LShiftVS:
      Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_LShiftVI:
      Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_LShiftVL:
      Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_RShiftVS:
      Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_RShiftVI:
      Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_RShiftVL:
      Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_URShiftVS:
      Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_URShiftVI:
      Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_URShiftVL:
      Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_RotateRightV:
      evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_RotateLeftV:
      evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
    default:
      fatal("Unsupported operation %s", NodeClassNames[ideal_opc]);
      break;
  }
}
4557
4558 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4559 XMMRegister src2, bool is_unsigned, bool merge, int vlen_enc) {
4560 if (is_unsigned) {
4561 evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4562 } else {
4563 evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4564 }
4565 }
4566
// Emits a masked saturating *signed* add or subtract (AVX-512 vpadds/vpsubs
// with k-register predication) for byte or short lanes — the only element
// widths for which x86 provides saturating integer arithmetic.
void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
                                                      XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
  switch (elem_bt) {
    case T_BYTE:
      if (ideal_opc == Op_SaturatingAddV) {
        evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
      }
      break;
    case T_SHORT:
      if (ideal_opc == Op_SaturatingAddV) {
        evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
      }
      break;
    default:
      fatal("Unsupported type %s", type2name(elem_bt));
      break;
  }
}
4591
// Emits a masked saturating *unsigned* add or subtract (AVX-512
// vpaddus/vpsubus with k-register predication) for byte or short lanes.
void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
                                                        XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
  switch (elem_bt) {
    case T_BYTE:
      if (ideal_opc == Op_SaturatingAddV) {
        evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
      }
      break;
    case T_SHORT:
      if (ideal_opc == Op_SaturatingAddV) {
        evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
      }
      break;
    default:
      fatal("Unsupported type %s", type2name(elem_bt));
      break;
  }
}
4616
4617 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4618 Address src2, bool is_unsigned, bool merge, int vlen_enc) {
4619 if (is_unsigned) {
4620 evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4621 } else {
4622 evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4623 }
4624 }
4625
// Memory-operand variant of the masked saturating signed add/sub dispatcher;
// mirrors the register form above but folds the second source from memory.
void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
                                                      XMMRegister src1, Address src2, bool merge, int vlen_enc) {
  switch (elem_bt) {
    case T_BYTE:
      if (ideal_opc == Op_SaturatingAddV) {
        evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
      }
      break;
    case T_SHORT:
      if (ideal_opc == Op_SaturatingAddV) {
        evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
      }
      break;
    default:
      fatal("Unsupported type %s", type2name(elem_bt));
      break;
  }
}
4650
// Memory-operand variant of the masked saturating unsigned add/sub
// dispatcher; mirrors the register form above.
void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
                                                        XMMRegister src1, Address src2, bool merge, int vlen_enc) {
  switch (elem_bt) {
    case T_BYTE:
      if (ideal_opc == Op_SaturatingAddV) {
        evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
      }
      break;
    case T_SHORT:
      if (ideal_opc == Op_SaturatingAddV) {
        evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
      }
      break;
    default:
      fatal("Unsupported type %s", type2name(elem_bt));
      break;
  }
}
4675
// Dispatches a lane-masked EVEX vector operation with two vector register
// sources. 'mask' predicates the destination lanes; 'merge' is forwarded to
// the assembler's masked encodings (merge- vs zero-masking of inactive
// lanes); 'is_varshift' selects the per-lane variable-shift forms for the
// shift opcodes. 'eType' is consulted only by the type-polymorphic helpers
// (rearrange, rotate, min/max, logical ops).
void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
                                    XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
                                    bool is_varshift) {
  switch (ideal_opc) {
    case Op_AddVB:
      evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVS:
      evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVI:
      evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVL:
      evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVF:
      evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVD:
      evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVB:
      evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVS:
      evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVI:
      evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVL:
      evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVF:
      evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVD:
      evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVS:
      evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVI:
      evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVL:
      evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVF:
      evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVD:
      evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_DivVF:
      evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_DivVD:
      evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SqrtVF:
      evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SqrtVD:
      evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
    // Unary abs takes its single input from src2.
    case Op_AbsVB:
      evpabsb(dst, mask, src2, merge, vlen_enc); break;
    case Op_AbsVS:
      evpabsw(dst, mask, src2, merge, vlen_enc); break;
    case Op_AbsVI:
      evpabsd(dst, mask, src2, merge, vlen_enc); break;
    case Op_AbsVL:
      evpabsq(dst, mask, src2, merge, vlen_enc); break;
    case Op_FmaVF:
      evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_FmaVD:
      evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
    // Note the swapped source order expected by evperm.
    case Op_VectorRearrange:
      evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
    case Op_LShiftVS:
      evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_LShiftVI:
      evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_LShiftVL:
      evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_RShiftVS:
      evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_RShiftVI:
      evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_RShiftVL:
      evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_URShiftVS:
      evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_URShiftVI:
      evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_URShiftVL:
      evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_RotateLeftV:
      evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_RotateRightV:
      evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MaxV:
      evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MinV:
      evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_UMinV:
      evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_UMaxV:
      evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_XorV:
      evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_OrV:
      evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AndV:
      evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    default:
      fatal("Unsupported operation %s", NodeClassNames[ideal_opc]);
      break;
  }
}
4777
// Memory-operand variant of the masked vector-op dispatcher: src2 is folded
// from memory. Supports a smaller opcode set than the register form (no
// shifts/rotates/abs/rearrange, which require a register second operand).
void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
                                    XMMRegister src1, Address src2, bool merge, int vlen_enc) {
  switch (ideal_opc) {
    case Op_AddVB:
      evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVS:
      evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVI:
      evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVL:
      evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVF:
      evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVD:
      evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVB:
      evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVS:
      evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVI:
      evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVL:
      evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVF:
      evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVD:
      evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVS:
      evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVI:
      evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVL:
      evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVF:
      evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVD:
      evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_DivVF:
      evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_DivVD:
      evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_FmaVF:
      evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_FmaVD:
      evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MaxV:
      evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MinV:
      evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_UMaxV:
      evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_UMinV:
      evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_XorV:
      evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_OrV:
      evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AndV:
      evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    default:
      fatal("Unsupported operation %s", NodeClassNames[ideal_opc]);
      break;
  }
}
4842
// Emits a logical operation (and/or/xor) between two opmask (k) registers.
// The k-instruction operand width is chosen from the mask length so that all
// mask bits are covered: 2/4/8 lanes -> byte form, 16 -> word, 32 -> dword,
// 64 -> qword. Any other mask length is rejected.
void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
                                  KRegister src1, KRegister src2) {
  BasicType etype = T_ILLEGAL;  // encodes the k-register operand width below
  switch(mask_len) {
    case 2:
    case 4:
    case 8:  etype = T_BYTE; break;
    case 16: etype = T_SHORT; break;
    case 32: etype = T_INT; break;
    case 64: etype = T_LONG; break;
    default: fatal("Unsupported type"); break;
  }
  assert(etype != T_ILLEGAL, "");
  switch(ideal_opc) {
    case Op_AndVMask:
      kand(etype, dst, src1, src2); break;
    case Op_OrVMask:
      kor(etype, dst, src1, src2); break;
    case Op_XorVMask:
      kxor(etype, dst, src1, src2); break;
    default:
      fatal("Unsupported masked operation"); break;
  }
}
4867
4868 /*
4869 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation.
4870 * If src is NaN, the result is 0.
4871 * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE,
4872 * the result is equal to the value of Integer.MIN_VALUE.
4873 * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE,
4874 * the result is equal to the value of Integer.MAX_VALUE.
4875 */
// AVX (no opmask registers) fixup after vcvttps2dq: dst lanes equal to
// float_sign_flip (0x80000000, the "integer indefinite" pattern) came from
// special inputs and are repaired here per the comment above. Negative
// overflow needs no repair since 0x80000000 already is Integer.MIN_VALUE.
void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                   XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
                                                                   Register rscratch, AddressLiteral float_sign_flip,
                                                                   int vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
  Label done;
  vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
  // xtmp2 = per-lane mask of special (indefinite) destination values.
  vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
  vptest(xtmp2, xtmp2, vec_enc);
  jccb(Assembler::equal, done);   // fast path: no special lanes at all

  // Flip the sign-flip constant into 0x7fffffff (Integer.MAX_VALUE) lanes.
  vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
  vpxor(xtmp1, xtmp1, xtmp4, vec_enc);

  // Zero destination lanes whose source is NaN (unordered compare).
  vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
  vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
  vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);

  // Recompute the mask for remaining special value.
  vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
  // Extract SRC values corresponding to TRUE mask lanes.
  vpand(xtmp4, xtmp2, src, vec_enc);
  // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special
  // values are set.
  vpxor(xtmp3, xtmp2, xtmp4, vec_enc);

  // Positive special lanes get Integer.MAX_VALUE.
  vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
  bind(done);
}
4905
// EVEX (opmask-based) fixup after vcvttps2dq, same contract as the AVX
// variant above but using k-register predication instead of blends.
void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                    XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                    Register rscratch, AddressLiteral float_sign_flip,
                                                                    int vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
  Label done;
  evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
  // ktmp1 = mask of special (indefinite, 0x80000000) destination lanes.
  Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);   // fast path: no special lanes

  // Zero destination lanes whose source is NaN (unordered self-compare).
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);

  // ktmp1 = special lanes that are not NaN; of those, keep lanes with
  // src >= 0 (NLT_UQ against zero) — these represent positive overflow.
  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  // vpternlog imm 0x11 with both sources = xtmp1 computes ~xtmp1, turning
  // 0x80000000 into 0x7fffffff (Integer.MAX_VALUE).
  vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}
4927
// EVEX fixup after evcvttps2qq (float -> long): quadword destination lanes
// equal to double_sign_flip (0x8000000000000000, the long indefinite value)
// are repaired — NaN sources become 0, positive overflow becomes
// Long.MAX_VALUE. Source compares use float (ps) forms since src holds floats.
void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                     Register rscratch, AddressLiteral double_sign_flip,
                                                                     int vec_enc) {
  assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");

  Label done;
  evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
  Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);   // fast path: no special lanes

  // Zero destination lanes whose (float) source is NaN.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);

  // Remaining special lanes with src >= 0 get ~double_sign_flip
  // (0x7fff...ffff = Long.MAX_VALUE) via the 0x11 (NOT src1) ternlog.
  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}
4950
// EVEX fixup after vcvttpd2dq (double -> int): dword destination lanes equal
// to float_sign_flip (0x80000000) are repaired — NaN sources become 0,
// positive overflow becomes Integer.MAX_VALUE. Source compares use double
// (pd) forms since src holds doubles.
void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                     Register rscratch, AddressLiteral float_sign_flip,
                                                                     int vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
  Label done;
  evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
  Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);   // fast path: no special lanes

  // Zero destination lanes whose (double) source is NaN.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);

  // Remaining special lanes with src >= 0 get ~float_sign_flip
  // (0x7fffffff = Integer.MAX_VALUE) via the 0x11 (NOT src1) ternlog.
  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}
4972
4973 /*
4974 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation.
4975 * If src is NaN, the result is 0.
4976 * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
4977 * the result is equal to the value of Long.MIN_VALUE.
4978 * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
4979 * the result is equal to the value of Long.MAX_VALUE.
4980 */
// EVEX fixup after evcvttpd2qq (double -> long), implementing the contract in
// the comment above: lanes holding the long indefinite value
// (0x8000000000000000) are set to 0 for NaN sources and to Long.MAX_VALUE for
// positive-overflow sources; negative overflow already equals Long.MIN_VALUE.
void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                      Register rscratch, AddressLiteral double_sign_flip,
                                                                      int vec_enc) {
  assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");

  Label done;
  evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
  // ktmp1 = mask of special (indefinite) destination lanes.
  evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);   // fast path: no special lanes

  // Zero destination lanes whose source is NaN.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);

  // Remaining special lanes with src >= 0 get ~double_sign_flip
  // (Long.MAX_VALUE) via the 0x11 (NOT src1) ternlog.
  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}
5003
5004 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
5005 XMMRegister xtmp, int index, int vec_enc) {
5006 assert(vec_enc < Assembler::AVX_512bit, "");
5007 if (vec_enc == Assembler::AVX_256bit) {
5008 vextractf128_high(xtmp, src);
5009 vshufps(dst, src, xtmp, index, vec_enc);
5010 } else {
5011 vshufps(dst, src, zero, index, vec_enc);
5012 }
5013 }
5014
// AVX (no opmask registers) fixup after vcvttpd2dq (double -> int). The
// conversion result is 128-bit while the source is up to src_vec_enc wide, so
// the double-lane compare masks are packed down to dword lanes before
// blending. NaN sources become 0; positive overflow becomes Integer.MAX_VALUE.
void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                                                    XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
                                                                    AddressLiteral float_sign_flip, int src_vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");

  Label done;
  // Compare the destination lanes with float_sign_flip
  // value to get mask for all special values.
  movdqu(xtmp1, float_sign_flip, rscratch);
  vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
  ptest(xtmp2, xtmp2);
  jccb(Assembler::equal, done);   // fast path: no special lanes

  // Flip float_sign_flip to get max integer value.
  vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
  pxor(xtmp1, xtmp4);

  // Set destination lanes corresponding to unordered source lanes as zero.
  vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
  vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);

  // Shuffle mask vector and pack lower double word from each quadword lane.
  vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
  vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);

  // Recompute the mask for remaining special value.
  pxor(xtmp2, xtmp3);
  // Extract mask corresponding to non-negative source lanes.
  vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);

  // Shuffle mask vector and pack lower double word from each quadword lane.
  vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
  pand(xtmp3, xtmp2);

  // Replace destination lanes holding special value(0x80000000) with max int
  // if corresponding source lane holds a +ve value.
  vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
  bind(done);
}
5054
5055
// Narrows packed int lanes in dst to shorts or bytes, in place. The lanes are
// first masked down to the target width so the subsequent unsigned-saturating
// packs cannot actually saturate; 'zero' supplies the unused pack operand.
// For 256-bit vectors vpackusdw packs within 128-bit lanes, so the result is
// additionally compacted cross-lane into the lower half.
void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
                                                   XMMRegister xtmp, Register rscratch, int vec_enc) {
  switch(to_elem_bt) {
    case T_SHORT:
      assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
      vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
      vpackusdw(dst, dst, zero, vec_enc);
      if (vec_enc == Assembler::AVX_256bit) {
        vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
      }
      break;
    case T_BYTE:
      assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
      vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
      vpackusdw(dst, dst, zero, vec_enc);
      if (vec_enc == Assembler::AVX_256bit) {
        vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
      }
      vpackuswb(dst, dst, zero, vec_enc);  // second pack: words -> bytes
      break;
    default: assert(false, "Unexpected basic type for target of vector cast int to subword: %s", type2name(to_elem_bt));
  }
}
5079
5080 /*
5081 * Algorithm for vector D2L and F2I conversions (AVX 10.2 unsupported):-
5082 * a) Perform vector D2L/F2I cast.
5083 * b) Choose fast path if none of the result vector lane contains 0x80000000 value.
5084 * It signifies that source value could be any of the special floating point
5085 * values(NaN,-Inf,Inf,Max,-Min).
5086 * c) Set destination to zero if source is NaN value.
5087 * d) Replace 0x80000000 with MaxInt if source lane contains a +ve value.
5088 */
5089
// AVX2 vector float -> int/short/byte cast: truncating convert, fix up
// special values (NaN / overflow) per the algorithm comment above, then
// narrow to the sub-word width if needed.
void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                           XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
                                           AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
  int to_elem_sz = type2aelembytes(to_elem_bt);
  assert(to_elem_sz <= 4, "");
  vcvttps2dq(dst, src, vec_enc);
  vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc);
  if (to_elem_sz < 4) {
    // xtmp4 is zeroed here to serve as the pack operand of the narrowing.
    vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
    vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc);
  }
}
5102
// EVEX vector float -> int/short/byte cast: truncating convert, opmask-based
// special-value fixup, then AVX-512 truncating narrow (evpmovdw/evpmovdb) for
// sub-word targets.
void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                            XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
                                            Register rscratch, int vec_enc) {
  int to_elem_sz = type2aelembytes(to_elem_bt);
  assert(to_elem_sz <= 4, "");
  vcvttps2dq(dst, src, vec_enc);
  vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc);
  switch(to_elem_bt) {
    case T_INT:
      break;  // already at the target width
    case T_SHORT:
      evpmovdw(dst, dst, vec_enc);
      break;
    case T_BYTE:
      evpmovdb(dst, dst, vec_enc);
      break;
    default: assert(false, "Unexpected basic type for target of vector castF2X EVEX: %s", type2name(to_elem_bt));
  }
}
5122
// EVEX vector float -> long cast: truncating convert to quadwords followed by
// the opmask-based special-value fixup (NaN -> 0, overflow -> Long.MIN/MAX).
void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                            KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
                                            Register rscratch, int vec_enc) {
  evcvttps2qq(dst, src, vec_enc);
  vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc);
}
5129
5130 // Handling for downcasting from double to integer or sub-word types on AVX2.
// Handling for downcasting from double to integer or sub-word types on AVX2.
// The conversion result occupies at most 128 bits, hence the 128-bit
// encoding when narrowing further to short/byte.
void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                           XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5,
                                           AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
  int to_elem_sz = type2aelembytes(to_elem_bt);
  assert(to_elem_sz < 8, "");
  vcvttpd2dq(dst, src, vec_enc);
  vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch,
                                              float_sign_flip, vec_enc);
  if (to_elem_sz < 4) {
    // xtmp4 holds all zero lanes.
    vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit);
  }
}
5144
// EVEX vector double -> long/int/short/byte cast. With AVX512DQ a direct
// double->long convert is available and sub-long targets are reached via
// signed-saturating narrows (evpmovsqd); without it, the cast goes through
// double->int (so sign_flip is the 32-bit pattern) and then truncating
// narrows for sub-word targets.
void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src,
                                            XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1,
                                            KRegister ktmp2, AddressLiteral sign_flip,
                                            Register rscratch, int vec_enc) {
  if (VM_Version::supports_avx512dq()) {
    evcvttpd2qq(dst, src, vec_enc);
    vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
    switch(to_elem_bt) {
      case T_LONG:
        break;  // already at the target width
      case T_INT:
        evpmovsqd(dst, dst, vec_enc);
        break;
      case T_SHORT:
        evpmovsqd(dst, dst, vec_enc);
        evpmovdw(dst, dst, vec_enc);
        break;
      case T_BYTE:
        evpmovsqd(dst, dst, vec_enc);
        evpmovdb(dst, dst, vec_enc);
        break;
      default: assert(false, "Unexpected basic type for target of vector castD2X AVX512DQ EVEX: %s", type2name(to_elem_bt));
    }
  } else {
    assert(type2aelembytes(to_elem_bt) <= 4, "");
    vcvttpd2dq(dst, src, vec_enc);
    vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
    switch(to_elem_bt) {
      case T_INT:
        break;
      case T_SHORT:
        evpmovdw(dst, dst, vec_enc);
        break;
      case T_BYTE:
        evpmovdb(dst, dst, vec_enc);
        break;
      default: assert(false, "Unexpected basic type for target of vector castD2X EVEX: %s", type2name(to_elem_bt));
    }
  }
}
5185
// AVX10.2 vector float -> long/int/short/byte cast, register source. The
// saturating-truncating converts (evcvttps2qqs/evcvttps2dqs) handle
// NaN/overflow inputs directly, so no separate special-case fixup is emitted;
// sub-word targets are then narrowed with evpmovdw/evpmovdb.
void C2_MacroAssembler::vector_castF2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vec_enc) {
  switch(to_elem_bt) {
    case T_LONG:
      evcvttps2qqs(dst, src, vec_enc);
      break;
    case T_INT:
      evcvttps2dqs(dst, src, vec_enc);
      break;
    case T_SHORT:
      evcvttps2dqs(dst, src, vec_enc);
      evpmovdw(dst, dst, vec_enc);
      break;
    case T_BYTE:
      evcvttps2dqs(dst, src, vec_enc);
      evpmovdb(dst, dst, vec_enc);
      break;
    default: assert(false, "Unexpected basic type for target of vector castF2X AVX10 (reg src): %s", type2name(to_elem_bt));
  }
}
5205
5206 void C2_MacroAssembler::vector_castF2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, Address src, int vec_enc) {
5207 switch(to_elem_bt) {
5208 case T_LONG:
5209 evcvttps2qqs(dst, src, vec_enc);
5210 break;
5211 case T_INT:
5212 evcvttps2dqs(dst, src, vec_enc);
5213 break;
5214 case T_SHORT:
5215 evcvttps2dqs(dst, src, vec_enc);
5216 evpmovdw(dst, dst, vec_enc);
5217 break;
5218 case T_BYTE:
5219 evcvttps2dqs(dst, src, vec_enc);
5220 evpmovdb(dst, dst, vec_enc);
5221 break;
5222 default: assert(false, "Unexpected basic type for target of vector castF2X AVX10 (mem src): %s", type2name(to_elem_bt));
5223 }
5224 }
5225
5226 void C2_MacroAssembler::vector_castD2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5227 switch(to_elem_bt) {
5228 case T_LONG:
5229 evcvttpd2qqs(dst, src, vec_enc);
5230 break;
5231 case T_INT:
5232 evcvttpd2dqs(dst, src, vec_enc);
5233 break;
5234 case T_SHORT:
5235 evcvttpd2dqs(dst, src, vec_enc);
5236 evpmovdw(dst, dst, vec_enc);
5237 break;
5238 case T_BYTE:
5239 evcvttpd2dqs(dst, src, vec_enc);
5240 evpmovdb(dst, dst, vec_enc);
5241 break;
5242 default: assert(false, "Unexpected basic type for target of vector castD2X AVX10 (reg src): %s", type2name(to_elem_bt));
5243 }
5244 }
5245
5246 void C2_MacroAssembler::vector_castD2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, Address src, int vec_enc) {
5247 switch(to_elem_bt) {
5248 case T_LONG:
5249 evcvttpd2qqs(dst, src, vec_enc);
5250 break;
5251 case T_INT:
5252 evcvttpd2dqs(dst, src, vec_enc);
5253 break;
5254 case T_SHORT:
5255 evcvttpd2dqs(dst, src, vec_enc);
5256 evpmovdw(dst, dst, vec_enc);
5257 break;
5258 case T_BYTE:
5259 evcvttpd2dqs(dst, src, vec_enc);
5260 evpmovdb(dst, dst, vec_enc);
5261 break;
5262 default: assert(false, "Unexpected basic type for target of vector castD2X AVX10 (mem src): %s", type2name(to_elem_bt));
5263 }
5264 }
5265
5266 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
5267 AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5268 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
5269 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf.
5270 // and re-instantiate original MXCSR.RC mode after that.
5271 ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5272
5273 mov64(tmp, julong_cast(0.5L));
5274 evpbroadcastq(xtmp1, tmp, vec_enc);
5275 vaddpd(xtmp1, src , xtmp1, vec_enc);
5276 evcvtpd2qq(dst, xtmp1, vec_enc);
5277 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5278 double_sign_flip, vec_enc);;
5279
5280 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5281 }
5282
// Round each float lane of 'src' to the nearest int as floor(val + 0.5).
// The add and the convert are executed with MXCSR.RC temporarily switched to
// round-towards-negative-infinity (loaded from 'new_mxcsr'); the standard
// MXCSR state is restored afterwards. NaN and out-of-range lanes are patched
// by the special-cases helper.
void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
                                                AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
                                                Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
  // Switch MXCSR.RC to round-towards -inf for the duration of the rounding.
  ldmxcsr(new_mxcsr, tmp /*rscratch*/);

  // Broadcast the constant 0.5f and add it to every lane.
  movl(tmp, jint_cast(0.5));
  movq(xtmp1, tmp);
  vbroadcastss(xtmp1, xtmp1, vec_enc);
  vaddps(xtmp1, src , xtmp1, vec_enc);
  // Convert under the -inf rounding mode, i.e. floor(val + 0.5).
  vcvtps2dq(dst, xtmp1, vec_enc);
  vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
                                              float_sign_flip, vec_enc);

  // Re-instantiate the standard MXCSR.RC mode.
  ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
}
5300
// AVX (non-EVEX) variant of vector_round_float_evex: round each float lane
// of 'src' to the nearest int as floor(val + 0.5) under a temporarily
// switched MXCSR.RC mode of round-towards-negative-infinity; the standard
// MXCSR state is restored afterwards. NaN and out-of-range lanes are patched
// by the AVX special-cases helper.
void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
                                               AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
                                               Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
  // Switch MXCSR.RC to round-towards -inf for the duration of the rounding.
  ldmxcsr(new_mxcsr, tmp /*rscratch*/);

  // Broadcast the constant 0.5f and add it to every lane.
  movl(tmp, jint_cast(0.5));
  movq(xtmp1, tmp);
  vbroadcastss(xtmp1, xtmp1, vec_enc);
  vaddps(xtmp1, src , xtmp1, vec_enc);
  // Convert under the -inf rounding mode, i.e. floor(val + 0.5).
  vcvtps2dq(dst, xtmp1, vec_enc);
  vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc);

  // Re-instantiate the standard MXCSR.RC mode.
  ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
}
5317
5318 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5319 BasicType from_elem_bt, BasicType to_elem_bt) {
5320 switch (from_elem_bt) {
5321 case T_BYTE:
5322 switch (to_elem_bt) {
5323 case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
5324 case T_INT: vpmovzxbd(dst, src, vlen_enc); break;
5325 case T_LONG: vpmovzxbq(dst, src, vlen_enc); break;
5326 default: ShouldNotReachHere();
5327 }
5328 break;
5329 case T_SHORT:
5330 switch (to_elem_bt) {
5331 case T_INT: vpmovzxwd(dst, src, vlen_enc); break;
5332 case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
5333 default: ShouldNotReachHere();
5334 }
5335 break;
5336 case T_INT:
5337 assert(to_elem_bt == T_LONG, "");
5338 vpmovzxdq(dst, src, vlen_enc);
5339 break;
5340 default:
5341 ShouldNotReachHere();
5342 }
5343 }
5344
5345 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5346 BasicType from_elem_bt, BasicType to_elem_bt) {
5347 switch (from_elem_bt) {
5348 case T_BYTE:
5349 switch (to_elem_bt) {
5350 case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break;
5351 case T_INT: vpmovsxbd(dst, src, vlen_enc); break;
5352 case T_LONG: vpmovsxbq(dst, src, vlen_enc); break;
5353 default: ShouldNotReachHere();
5354 }
5355 break;
5356 case T_SHORT:
5357 switch (to_elem_bt) {
5358 case T_INT: vpmovsxwd(dst, src, vlen_enc); break;
5359 case T_LONG: vpmovsxwq(dst, src, vlen_enc); break;
5360 default: ShouldNotReachHere();
5361 }
5362 break;
5363 case T_INT:
5364 assert(to_elem_bt == T_LONG, "");
5365 vpmovsxdq(dst, src, vlen_enc);
5366 break;
5367 default:
5368 ShouldNotReachHere();
5369 }
5370 }
5371
// Cast a vector mask (lanes of 0 / -1) from element type 'src_bt' to
// 'dst_bt'. Widening uses sign extension (preserving the all-ones/all-zeros
// lane pattern); narrowing uses signed saturating packs, with a vpermq /
// vpshufd shuffle to pull the valid lanes together for 256-bit vectors.
// Not used for 512-bit vectors (see the assert below).
void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src,
                                         BasicType dst_bt, BasicType src_bt, int vlen) {
  int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen);
  assert(vlen_enc != AVX_512bit, "");

  int dst_bt_size = type2aelembytes(dst_bt);
  int src_bt_size = type2aelembytes(src_bt);
  if (dst_bt_size > src_bt_size) {
    // Widening: sign-extend by the size ratio.
    switch (dst_bt_size / src_bt_size) {
      case 2: vpmovsxbw(dst, src, vlen_enc); break;
      case 4: vpmovsxbd(dst, src, vlen_enc); break;
      case 8: vpmovsxbq(dst, src, vlen_enc); break;
      default: ShouldNotReachHere();
    }
  } else {
    assert(dst_bt_size < src_bt_size, "");
    // Narrowing: pack with signed saturation by the size ratio. For 256-bit
    // vectors the packs operate per 128-bit lane, so a cross-lane permute is
    // needed to gather the valid results into the low half.
    switch (src_bt_size / dst_bt_size) {
      case 2: {
        if (vlen_enc == AVX_128bit) {
          vpacksswb(dst, src, src, vlen_enc);
        } else {
          vpacksswb(dst, src, src, vlen_enc);
          vpermq(dst, dst, 0x08, vlen_enc);
        }
        break;
      }
      case 4: {
        if (vlen_enc == AVX_128bit) {
          vpackssdw(dst, src, src, vlen_enc);
          vpacksswb(dst, dst, dst, vlen_enc);
        } else {
          vpackssdw(dst, src, src, vlen_enc);
          vpermq(dst, dst, 0x08, vlen_enc);
          vpacksswb(dst, dst, dst, AVX_128bit);
        }
        break;
      }
      case 8: {
        if (vlen_enc == AVX_128bit) {
          vpshufd(dst, src, 0x08, vlen_enc);
          vpackssdw(dst, dst, dst, vlen_enc);
          vpacksswb(dst, dst, dst, vlen_enc);
        } else {
          vpshufd(dst, src, 0x08, vlen_enc);
          vpermq(dst, dst, 0x08, vlen_enc);
          vpackssdw(dst, dst, dst, AVX_128bit);
          vpacksswb(dst, dst, dst, AVX_128bit);
        }
        break;
      }
      default: ShouldNotReachHere();
    }
  }
}
5426
5427 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
5428 bool merge, BasicType bt, int vlen_enc) {
5429 if (bt == T_INT) {
5430 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5431 } else {
5432 assert(bt == T_LONG, "");
5433 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5434 }
5435 }
5436
5437 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
5438 bool merge, BasicType bt, int vlen_enc) {
5439 if (bt == T_INT) {
5440 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5441 } else {
5442 assert(bt == T_LONG, "");
5443 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5444 }
5445 }
5446
// Expand a scalar bitmask in 'src' into a byte-per-lane vector mask in
// 'dst'. Each 8-bit group of the source is spread into 8 bytes via PDEP with
// the 0x0101010101010101 scatter pattern (one mask bit lands in the low bit
// of each byte). Groups beyond the first are assembled pairwise in 'xtmp'
// and inserted into 'dst' 128 bits at a time. 'mask_len' must be a multiple
// of 8 past the first group (asserted in the loop).
void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
                                               Register rtmp2, XMMRegister xtmp, int mask_len,
                                               int vec_enc) {
  int index = 0;
  int vindex = 0;
  // Spread the low 8 mask bits into 8 bytes.
  mov64(rtmp1, 0x0101010101010101L);
  pdepq(rtmp1, src, rtmp1);
  if (mask_len > 8) {
    // Keep a working copy of the source bits and seed the staging vector.
    movq(rtmp2, src);
    vpxor(xtmp, xtmp, xtmp, vec_enc);
    movq(xtmp, rtmp1);
  }
  movq(dst, rtmp1);

  mask_len -= 8;
  while (mask_len > 0) {
    assert ((mask_len & 0x7) == 0, "mask must be multiple of 8");
    index++;
    if ((index % 2) == 0) {
      // Starting a fresh pair of 64-bit lanes in the staging register.
      pxor(xtmp, xtmp);
    }
    // Spread the next 8 mask bits into 8 bytes.
    mov64(rtmp1, 0x0101010101010101L);
    shrq(rtmp2, 8);
    pdepq(rtmp1, rtmp2, rtmp1);
    pinsrq(xtmp, rtmp1, index % 2);
    vindex = index / 2;
    if (vindex) {
      // Write entire 16 byte vector when both 64 bit
      // lanes are updated to save redundant instructions.
      if (index % 2) {
        vinsertf128(dst, dst, xtmp, vindex);
      }
    } else {
      vmovdqu(dst, xtmp);
    }
    mask_len -= 8;
  }
}
5485
// Common tail of the vector mask operations: 'tmp' holds the mask condensed
// into a scalar bitmask (one bit per lane); compute the requested query
// result into 'dst'.
//   TrueCount  -> popcount of the bitmask.
//   LastTrue   -> index of the highest set bit, or -1 if none.
//   FirstTrue  -> index of the lowest set bit, or masklen if none (for
//                 masklen < 32 a sentinel bit is OR'ed in at position
//                 masklen so tzcnt/bsf yields masklen directly).
//   ToLong     -> the bitmask itself (caller must pass dst == tmp).
void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
  switch(opc) {
    case Op_VectorMaskTrueCount:
      popcntq(dst, tmp);
      break;
    case Op_VectorMaskLastTrue:
      if (VM_Version::supports_lzcnt()) {
        // 63 - lzcnt(mask) == index of the highest set bit; lzcnt(0) == 64
        // makes the result -1 for an empty mask.
        lzcntq(tmp, tmp);
        movl(dst, 63);
        subl(dst, tmp);
      } else {
        // bsr leaves dst untouched (ZF set) for an empty mask, so preload -1.
        movl(dst, -1);
        bsrq(tmp, tmp);
        cmov32(Assembler::notZero, dst, tmp);
      }
      break;
    case Op_VectorMaskFirstTrue:
      if (VM_Version::supports_bmi1()) {
        if (masklen < 32) {
          // Sentinel bit at position masklen yields masklen for empty masks.
          orl(tmp, 1 << masklen);
          tzcntl(dst, tmp);
        } else if (masklen == 32) {
          // tzcntl(0) == 32, which is already the wanted empty-mask result.
          tzcntl(dst, tmp);
        } else {
          assert(masklen == 64, "");
          // tzcntq(0) == 64, which is already the wanted empty-mask result.
          tzcntq(dst, tmp);
        }
      } else {
        if (masklen < 32) {
          orl(tmp, 1 << masklen);
          bsfl(dst, tmp);
        } else {
          assert(masklen == 32 || masklen == 64, "");
          // bsf leaves dst untouched (ZF set) for an empty mask, so preload
          // the empty-mask result masklen and conditionally overwrite it.
          movl(dst, masklen);
          if (masklen == 32) {
            bsfl(tmp, tmp);
          } else {
            bsfq(tmp, tmp);
          }
          cmov32(Assembler::notZero, dst, tmp);
        }
      }
      break;
    case Op_VectorMaskToLong:
      assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
      break;
    default: assert(false, "Unhandled mask operation");
  }
}
5535
// Vector mask query (true count / first true / last true / toLong) for an
// opmask-register mask. The opmask is moved into scalar register 'tmp',
// clipped to 'masklen' bits when it may contain stale upper bits, and then
// handed to vector_mask_operation_helper.
void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
                                              int masklen, int masksize, int vec_enc) {
  assert(VM_Version::supports_popcnt(), "");

  if(VM_Version::supports_avx512bw()) {
    kmovql(tmp, mask);
  } else {
    // Without AVX512BW only 16-bit opmask moves are available.
    assert(masklen <= 16, "");
    kmovwl(tmp, mask);
  }

  // Mask generated out of partial vector comparisons/replicate/mask manipulation
  // operations needs to be clipped.
  if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
    andq(tmp, (1 << masklen) - 1);
  }

  vector_mask_operation_helper(opc, dst, tmp, masklen);
}
5555
// Vector mask query (true count / first true / last true / toLong) for a
// vector-register mask on AVX/AVX2. The lane mask is condensed into a scalar
// bitmask in 'tmp' via the movmsk family, clipped to 'masklen' bits when it
// may contain stale upper bits, and handed to vector_mask_operation_helper.
void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
                                              Register tmp, int masklen, BasicType bt, int vec_enc) {
  assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) ||
         (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), "");
  assert(VM_Version::supports_popcnt(), "");

  bool need_clip = false;
  switch(bt) {
    case T_BOOLEAN:
      // While masks of other types contain 0, -1; boolean masks contain lane values of 0, 1
      // Negate (0 - lane) to turn 1 into -1 so the byte sign bits are set.
      vpxor(xtmp, xtmp, xtmp, vec_enc);
      vpsubb(xtmp, xtmp, mask, vec_enc);
      vpmovmskb(tmp, xtmp, vec_enc);
      need_clip = masklen < 16;
      break;
    case T_BYTE:
      vpmovmskb(tmp, mask, vec_enc);
      need_clip = masklen < 16;
      break;
    case T_SHORT:
      // Pack word lanes down to bytes first, then take the byte sign bits.
      vpacksswb(xtmp, mask, mask, vec_enc);
      if (masklen >= 16) {
        vpermpd(xtmp, xtmp, 8, vec_enc);
      }
      vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
      need_clip = masklen < 16;
      break;
    case T_INT:
    case T_FLOAT:
      vmovmskps(tmp, mask, vec_enc);
      need_clip = masklen < 4;
      break;
    case T_LONG:
    case T_DOUBLE:
      vmovmskpd(tmp, mask, vec_enc);
      need_clip = masklen < 2;
      break;
    default: assert(false, "Unhandled type, %s", type2name(bt));
  }

  // Mask generated out of partial vector comparisons/replicate/mask manipulation
  // operations needs to be clipped.
  if (need_clip && opc != Op_VectorMaskFirstTrue) {
    // need_clip implies masklen < 32
    andq(tmp, (1 << masklen) - 1);
  }

  vector_mask_operation_helper(opc, dst, tmp, masklen);
}
5605
// Compress the set bits of opmask 'src' towards the least significant end,
// writing the result to opmask 'dst': the mask is moved to a scalar, clipped
// to 'mask_len' bits, and its set bits are gathered contiguously with PEXT
// against an all-ones source.
void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
                                             Register rtmp2, int mask_len) {
  kmov(rtmp1, src);
  // Clip to the low mask_len bits.
  andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
  mov64(rtmp2, -1L);
  // Extract one 1-bit for every set bit of the mask, packed at the bottom.
  pextq(rtmp2, rtmp2, rtmp1);
  kmov(dst, rtmp2);
}
5614
// AVX2 fallback for vector compress/expand (no AVX512 VPCOMPRESS/VPEXPAND).
// The lane mask is condensed to a scalar bitmask which indexes a
// 32-byte-per-row stub-generated permutation table; a vpermps with that row
// moves the selected lanes into place, and lanes the row marks with -1 are
// then blended with zero. Only 4- and 8-byte element types are supported.
void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src,
                                                    XMMRegister mask, Register rtmp, Register rscratch,
                                                    XMMRegister permv, XMMRegister xtmp, BasicType bt,
                                                    int vec_enc) {
  assert(type2aelembytes(bt) >= 4, "");
  assert(opcode == Op_CompressV || opcode == Op_ExpandV, "");
  address compress_perm_table = nullptr;
  address expand_perm_table = nullptr;
  // Pick the permute tables for the element width and condense the mask.
  if (type2aelembytes(bt) == 8) {
    compress_perm_table = StubRoutines::x86::compress_perm_table64();
    expand_perm_table = StubRoutines::x86::expand_perm_table64();
    vmovmskpd(rtmp, mask, vec_enc);
  } else {
    compress_perm_table = StubRoutines::x86::compress_perm_table32();
    expand_perm_table = StubRoutines::x86::expand_perm_table32();
    vmovmskps(rtmp, mask, vec_enc);
  }
  shlq(rtmp, 5); // for 32 byte permute row.
  if (opcode == Op_CompressV) {
    lea(rscratch, ExternalAddress(compress_perm_table));
  } else {
    lea(rscratch, ExternalAddress(expand_perm_table));
  }
  addptr(rtmp, rscratch);
  // Load the permutation row and apply it.
  vmovdqu(permv, Address(rtmp));
  vpermps(dst, permv, src, Assembler::AVX_256bit);
  vpxor(xtmp, xtmp, xtmp, vec_enc);
  // Blend the result with zero vector using permute mask, each column entry
  // in a permute table row contains either a valid permute index or a -1 (default)
  // value, this can potentially be used as a blending mask after
  // compressing/expanding the source vector lanes.
  vblendvps(dst, dst, xtmp, permv, vec_enc, true, permv);
}
5648
5649 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
5650 bool merge, BasicType bt, int vec_enc) {
5651 if (opcode == Op_CompressV) {
5652 switch(bt) {
5653 case T_BYTE:
5654 evpcompressb(dst, mask, src, merge, vec_enc);
5655 break;
5656 case T_CHAR:
5657 case T_SHORT:
5658 evpcompressw(dst, mask, src, merge, vec_enc);
5659 break;
5660 case T_INT:
5661 evpcompressd(dst, mask, src, merge, vec_enc);
5662 break;
5663 case T_FLOAT:
5664 evcompressps(dst, mask, src, merge, vec_enc);
5665 break;
5666 case T_LONG:
5667 evpcompressq(dst, mask, src, merge, vec_enc);
5668 break;
5669 case T_DOUBLE:
5670 evcompresspd(dst, mask, src, merge, vec_enc);
5671 break;
5672 default:
5673 fatal("Unsupported type %s", type2name(bt));
5674 break;
5675 }
5676 } else {
5677 assert(opcode == Op_ExpandV, "");
5678 switch(bt) {
5679 case T_BYTE:
5680 evpexpandb(dst, mask, src, merge, vec_enc);
5681 break;
5682 case T_CHAR:
5683 case T_SHORT:
5684 evpexpandw(dst, mask, src, merge, vec_enc);
5685 break;
5686 case T_INT:
5687 evpexpandd(dst, mask, src, merge, vec_enc);
5688 break;
5689 case T_FLOAT:
5690 evexpandps(dst, mask, src, merge, vec_enc);
5691 break;
5692 case T_LONG:
5693 evpexpandq(dst, mask, src, merge, vec_enc);
5694 break;
5695 case T_DOUBLE:
5696 evexpandpd(dst, mask, src, merge, vec_enc);
5697 break;
5698 default:
5699 fatal("Unsupported type %s", type2name(bt));
5700 break;
5701 }
5702 }
5703 }
5704
// EVEX implementation of vector signum for double (Op_SignumVD) or float
// (Op_SignumVF) lanes: result is -1.0 for negative inputs, 1.0 for positive
// inputs, and the input itself for NaN, -0.0 and 0.0. 'zero' and 'one' hold
// broadcast constants 0.0 and 1.0 respectively.
void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
                                           KRegister ktmp1, int vec_enc) {
  if (opcode == Op_SignumVD) {
    // dst = 0.0 - 1.0 = -1.0 in every lane.
    vsubpd(dst, zero, one, vec_enc);
    // if src < 0 ? -1 : 1
    evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
    evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
    // if src == NaN, -0.0 or 0.0 return src.
    evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
    evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
  } else {
    assert(opcode == Op_SignumVF, "");
    // dst = 0.0f - 1.0f = -1.0f in every lane.
    vsubps(dst, zero, one, vec_enc);
    // if src < 0 ? -1 : 1
    evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
    evblendmps(dst, ktmp1, one, dst, true, vec_enc);
    // if src == NaN, -0.0 or 0.0 return src.
    evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
    evblendmps(dst, ktmp1, dst, src, true, vec_enc);
  }
}
5726
// AVX (non-EVEX) implementation of vector signum for double (Op_SignumVD) or
// float (Op_SignumVF) lanes: result is -1.0 for negative inputs, 1.0 for
// positive inputs, and the input itself for NaN, -0.0 and 0.0. Uses variable
// blends keyed off the source sign bit and an unordered-equal compare.
void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
                                          XMMRegister xtmp1, int vec_enc) {
  if (opcode == Op_SignumVD) {
    // dst = 0.0 - 1.0 = -1.0 in every lane.
    vsubpd(dst, zero, one, vec_enc);
    // if src < 0 ? -1 : 1
    vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1);
    // if src == NaN, -0.0 or 0.0 return src.
    vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
    vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
  } else {
    assert(opcode == Op_SignumVF, "");
    // dst = 0.0f - 1.0f = -1.0f in every lane.
    vsubps(dst, zero, one, vec_enc);
    // if src < 0 ? -1 : 1
    vblendvps(dst, one, dst, src, vec_enc, true, xtmp1);
    // if src == NaN, -0.0 or 0.0 return src.
    vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
    vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
  }
}
5746
5747 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
5748 if (VM_Version::supports_avx512bw()) {
5749 if (mask_len > 32) {
5750 kmovql(dst, src);
5751 } else {
5752 kmovdl(dst, src);
5753 if (mask_len != 32) {
5754 kshiftrdl(dst, dst, 32 - mask_len);
5755 }
5756 }
5757 } else {
5758 assert(mask_len <= 16, "");
5759 kmovwl(dst, src);
5760 if (mask_len != 16) {
5761 kshiftrwl(dst, dst, 16 - mask_len);
5762 }
5763 }
5764 }
5765
5766 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5767 int lane_size = type2aelembytes(bt);
5768 if ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5769 (is_subword_type(bt) && VM_Version::supports_avx512vlbw())) {
5770 movptr(rtmp, imm32);
5771 switch(lane_size) {
5772 case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5773 case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5774 case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5775 case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
5776 fatal("Unsupported lane size %d", lane_size);
5777 break;
5778 }
5779 } else {
5780 movptr(rtmp, imm32);
5781 movq(dst, rtmp);
5782 switch(lane_size) {
5783 case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5784 case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5785 case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5786 case 8 : vpbroadcastq(dst, dst, vec_enc); break;
5787 fatal("Unsupported lane size %d", lane_size);
5788 break;
5789 }
5790 }
5791 }
5792
5793 //
5794 // Following is lookup table based popcount computation algorithm:-
5795 // Index Bit set count
5796 // [ 0000 -> 0,
5797 // 0001 -> 1,
5798 // 0010 -> 1,
5799 // 0011 -> 2,
5800 // 0100 -> 1,
5801 // 0101 -> 2,
5802 // 0110 -> 2,
5803 // 0111 -> 3,
5804 // 1000 -> 1,
5805 // 1001 -> 2,
5806 // 1010 -> 3,
5807 // 1011 -> 3,
5808 // 1100 -> 2,
5809 // 1101 -> 3,
5810 // 1111 -> 4 ]
5811 // a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
5812 // shuffle indices for lookup table access.
5813 // b. Right shift each byte of vector lane by 4 positions.
5814 // c. Count the number of 1s in 4 MSB bits each byte. These bits are used as
5815 // shuffle indices for lookup table access.
5816 // d. Add the bitset count of upper and lower 4 bits of each byte.
5817 // e. Unpack double words to quad words and compute sum of absolute difference of bitset
5818 // count of all the bytes of a quadword.
5819 // f. Perform step e. for upper 128bit vector lane.
5820 // g. Pack the bitset count of quadwords back to double word.
5821 // h. Unpacking and packing operations are not needed for 64bit vector lane.
5822
// Per-byte population count via the nibble lookup table (steps a-d of the
// algorithm described above): look up the bit counts of the low and high
// nibbles of every byte in the stub LUT and add them.
void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
  assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
  // 0x0F mask isolates one nibble per byte.
  vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
  vpsrlw(dst, src, 4, vec_enc);
  vpand(dst, dst, xtmp1, vec_enc);   // dst   = high nibbles
  vpand(xtmp1, src, xtmp1, vec_enc); // xtmp1 = low nibbles
  // Table lookup of the 4-bit popcounts, then sum per byte.
  vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
  vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);
  vpshufb(dst, xtmp2, dst, vec_enc);
  vpaddb(dst, dst, xtmp1, vec_enc);
}
5835
// Per-dword population count: compute per-byte popcounts, then sum the four
// byte counts of each dword via unpack + vpsadbw and repack (steps e-h of
// the algorithm described above).
void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                            XMMRegister xtmp2, Register rtmp, int vec_enc) {
  vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
  // Following code is as per steps e,f,g and h of above algorithm.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
  vpsadbw(dst, dst, xtmp2, vec_enc);
  vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
  vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
  vpackuswb(dst, xtmp1, dst, vec_enc);
}
5847
// Per-word population count: compute per-byte popcounts, then add the counts
// of the upper and lower byte of each word.
void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
  vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
  // Add the popcount of upper and lower bytes of word.
  vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
  vpsrlw(dst, xtmp1, 8, vec_enc);        // upper-byte counts, moved down
  vpand(xtmp1, xtmp1, xtmp2, vec_enc);   // lower-byte counts
  vpaddw(dst, dst, xtmp1, vec_enc);
}
5857
// Per-qword population count: compute per-byte popcounts, then sum the eight
// byte counts of each qword with a single vpsadbw against zero.
void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
  vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  vpsadbw(dst, xtmp1, xtmp2, vec_enc);
}
5864
5865 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5866 XMMRegister xtmp2, Register rtmp, int vec_enc) {
5867 switch(bt) {
5868 case T_LONG:
5869 vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5870 break;
5871 case T_INT:
5872 vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5873 break;
5874 case T_CHAR:
5875 case T_SHORT:
5876 vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5877 break;
5878 case T_BYTE:
5879 case T_BOOLEAN:
5880 vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5881 break;
5882 default:
5883 fatal("Unsupported type %s", type2name(bt));
5884 break;
5885 }
5886 }
5887
5888 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5889 KRegister mask, bool merge, int vec_enc) {
5890 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5891 switch(bt) {
5892 case T_LONG:
5893 assert(VM_Version::supports_avx512_vpopcntdq(), "");
5894 evpopcntq(dst, mask, src, merge, vec_enc);
5895 break;
5896 case T_INT:
5897 assert(VM_Version::supports_avx512_vpopcntdq(), "");
5898 evpopcntd(dst, mask, src, merge, vec_enc);
5899 break;
5900 case T_CHAR:
5901 case T_SHORT:
5902 assert(VM_Version::supports_avx512_bitalg(), "");
5903 evpopcntw(dst, mask, src, merge, vec_enc);
5904 break;
5905 case T_BYTE:
5906 case T_BOOLEAN:
5907 assert(VM_Version::supports_avx512_bitalg(), "");
5908 evpopcntb(dst, mask, src, merge, vec_enc);
5909 break;
5910 default:
5911 fatal("Unsupported type %s", type2name(bt));
5912 break;
5913 }
5914 }
5915
5916 // Bit reversal algorithm first reverses the bits of each byte followed by
5917 // a byte level reversal for multi-byte primitive types (short/int/long).
5918 // Algorithm performs a lookup table access to get reverse bit sequence
5919 // corresponding to a 4 bit value. Thus a reverse bit sequence for a byte
5920 // is obtained by swapping the reverse bit sequences of upper and lower
5921 // nibble of a byte.
// Reverse the bits of every lane of type 'bt' (see the algorithm comment
// above). Three code paths: a LUT-based path for AVX512VLBW, a shift-based
// path for 512-bit vectors without VLBW, and a LUT-based AVX path otherwise.
void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                           XMMRegister xtmp2, Register rtmp, int vec_enc) {
  if (VM_Version::supports_avx512vlbw()) {

    // Get the reverse bit sequence of lower nibble of each byte.
    vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
    vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
    evpandq(dst, xtmp2, src, vec_enc);
    vpshufb(dst, xtmp1, dst, vec_enc);
    vpsllq(dst, dst, 4, vec_enc);

    // Get the reverse bit sequence of upper nibble of each byte.
    vpandn(xtmp2, xtmp2, src, vec_enc);
    vpsrlq(xtmp2, xtmp2, 4, vec_enc);
    vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);

    // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
    // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
    evporq(xtmp2, dst, xtmp2, vec_enc);
    vector_reverse_byte(bt, dst, xtmp2, vec_enc);

  } else if(vec_enc == Assembler::AVX_512bit) {
    // Shift based bit reversal.
    assert(bt == T_LONG || bt == T_INT, "");

    // Swap lower and upper nibble of each byte.
    vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);

    // Swap two least and most significant bits of each nibble.
    vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);

    // Swap adjacent pair of bits.
    evmovdqul(xtmp1, k0, dst, true, vec_enc);
    vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);

    // Finish with a byte-level reversal of each lane.
    evmovdqul(xtmp1, k0, dst, true, vec_enc);
    vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
  } else {
    vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
    vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);

    // Get the reverse bit sequence of lower nibble of each byte.
    vpand(dst, xtmp2, src, vec_enc);
    vpshufb(dst, xtmp1, dst, vec_enc);
    vpsllq(dst, dst, 4, vec_enc);

    // Get the reverse bit sequence of upper nibble of each byte.
    vpandn(xtmp2, xtmp2, src, vec_enc);
    vpsrlq(xtmp2, xtmp2, 4, vec_enc);
    vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);

    // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
    // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
    vpor(xtmp2, dst, xtmp2, vec_enc);
    vector_reverse_byte(bt, dst, xtmp2, vec_enc);
  }
}
5979
// Reverse the bits of every lane of type 'bt' using a single GFNI affine
// transform per byte (the broadcast 'mask' is the affine matrix), followed
// by a byte-level reversal for multi-byte lane types.
void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
                                                XMMRegister xtmp, Register rscratch) {
  assert(VM_Version::supports_gfni(), "");
  assert(rscratch != noreg || always_reachable(mask), "missing");

  // Galois field instruction based bit reversal based on following algorithm.
  // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
  vpbroadcastq(xtmp, mask, vec_enc, rscratch);
  vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
  vector_reverse_byte(bt, dst, xtmp, vec_enc);
}
5991
// Swap adjacent 'nbits'-wide bit groups within each lane: bits selected by
// the broadcast 'bitmask' are shifted left by nbits, the complementary bits
// are shifted right by nbits, and the two halves are OR'ed back together.
void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
                                          XMMRegister xtmp1, Register rtmp, int vec_enc) {
  vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
  evpandq(dst, xtmp1, src, vec_enc);    // masked bits
  vpsllq(dst, dst, nbits, vec_enc);     // move them up
  vpandn(xtmp1, xtmp1, src, vec_enc);   // complementary bits
  vpsrlq(xtmp1, xtmp1, nbits, vec_enc); // move them down
  evporq(dst, dst, xtmp1, vec_enc);
}
6001
// Shift/rotate based byte reversal of each lane of type 'bt' (used on the
// 512-bit path where the shuffle-table variant is unavailable): rotate to
// swap the lane's larger halves, then swap adjacent bytes via
// vector_swap_nbits. A byte lane type is just a copy.
void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
  // Shift based bit reversal.
  assert(VM_Version::supports_evex(), "");
  switch(bt) {
    case T_LONG:
      // Swap upper and lower double word of each quad word.
      evprorq(xtmp1, k0, src, 32, true, vec_enc);
      evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
      vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
      break;
    case T_INT:
      // Swap upper and lower word of each double word.
      evprord(xtmp1, k0, src, 16, true, vec_enc);
      vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
      break;
    case T_CHAR:
    case T_SHORT:
      // Swap upper and lower byte of each word.
      vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
      break;
    case T_BYTE:
      // Nothing to reverse within a single byte; just copy.
      evmovdquq(dst, k0, src, true, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}
6031
// Reverse the bytes of each lane of type 'bt' by shuffling with a
// pre-computed per-width permutation mask from the stub routines. A byte
// lane type needs no shuffling and is just copied.
void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
  if (bt == T_BYTE) {
    if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
      evmovdquq(dst, k0, src, true, vec_enc);
    } else {
      vmovdqu(dst, src);
    }
    return;
  }
  // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
  // pre-computed shuffle indices.
  switch(bt) {
    case T_LONG:
      vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
      break;
    case T_INT:
      vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
      break;
    case T_CHAR:
    case T_SHORT:
      vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
  vpshufb(dst, src, dst, vec_enc);
}
6060
// Count leading zeros in each lane of type 'bt' using AVX512CD's vplzcnt
// instructions. Int/long lanes map directly; short lanes are widened to
// dwords (with an all-ones prefix so the dword lzcnt equals the word lzcnt)
// and repacked; byte lanes use a nibble lookup table.
void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
                                                        KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
  assert(is_integral_type(bt), "");
  assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
  assert(VM_Version::supports_avx512cd(), "");
  switch(bt) {
    case T_LONG:
      evplzcntq(dst, ktmp, src, merge, vec_enc);
      break;
    case T_INT:
      evplzcntd(dst, ktmp, src, merge, vec_enc);
      break;
    case T_SHORT:
      // Interleave each word with an all-ones word (xtmp1) so the dword
      // leading-zero count equals the word's leading-zero count, then
      // lzcnt both halves and pack the word results back together.
      vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
      vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
      evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
      vpunpckhwd(dst, xtmp1, src, vec_enc);
      evplzcntd(dst, ktmp, dst, merge, vec_enc);
      vpackusdw(dst, xtmp2, dst, vec_enc);
      break;
    case T_BYTE:
      // T1 = Compute leading zero counts of 4 LSB bits of each byte by
      // accessing the lookup table.
      // T2 = Compute leading zero counts of 4 MSB bits of each byte by
      // accessing the lookup table.
      // Add T1 to T2 if 4 MSB bits of byte are all zeros.
      assert(VM_Version::supports_avx512bw(), "");
      evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
      vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
      vpand(xtmp2, dst, src, vec_enc);
      vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
      vpsrlw(xtmp3, src, 4, vec_enc);
      vpand(xtmp3, dst, xtmp3, vec_enc);
      vpshufb(dst, xtmp1, xtmp3, vec_enc);
      vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
      evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
      evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}
6105
// Per-byte leading zero count for AVX (non-EVEX) targets using a 16-entry
// nibble lookup table. Clobbers xtmp1-xtmp3 and rtmp; on return xtmp1 holds
// all-zeros, a fact relied upon by vector_count_leading_zeros_short_avx.
void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                            XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
  vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);  // nibble mask
  // T1 = Compute leading zero counts of 4 LSB bits of each byte by
  // accessing the lookup table.
  vpand(dst, xtmp2, src, vec_enc);
  vpshufb(dst, xtmp1, dst, vec_enc);
  // T2 = Compute leading zero counts of 4 MSB bits of each byte by
  // accessing the lookup table.
  vpsrlw(xtmp3, src, 4, vec_enc);
  vpand(xtmp3, xtmp2, xtmp3, vec_enc);
  vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
  // Add T1 to T2 if 4 MSB bits of byte are all zeros.
  vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
  vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);     // mask: high nibble == 0
  vpaddb(dst, dst, xtmp2, vec_enc);           // candidate: T1 + T2
  vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc); // choose T1+T2 where high nibble is zero, else T2
}
6125
// Per-short leading zero count for AVX (non-EVEX) targets, built on top of the
// per-byte count: combine the byte counts of each word's two halves, selecting
// the combined count only where the upper byte is zero. Clobbers xtmp1-xtmp3
// and rtmp.
void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
  // Add zero counts of lower byte and upper byte of a word if
  // upper byte holds a zero value.
  vpsrlw(xtmp3, src, 8, vec_enc);
  // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
  vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);  // mask: upper byte of word == 0
  vpsllw(xtmp2, dst, 8, vec_enc);
  vpaddw(xtmp2, xtmp2, dst, vec_enc);      // upper-byte count + lower-byte count (in upper byte position)
  vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
  vpsrlw(dst, dst, 8, vec_enc);            // move the final per-word count into the low byte
}
6139
// Per-int leading zero count for AVX (non-EVEX) targets, derived from the
// exponent of the int-to-float conversion. Clobbers xtmp1-xtmp3. On return
// xtmp2 holds the broadcast constant 31, which
// vector_count_leading_zeros_long_avx relies upon indirectly via dst.
void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                           XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
  // By converting the integer to a float, we can obtain the number of leading zeros based on the exponent of the float.
  // As the float exponent contains a bias of 127 for nonzero values, the bias must be removed before interpreting the
  // exponent as the leading zero count.

  // Remove the bit to the right of the highest set bit ensuring that the conversion to float cannot round up to a higher
  // power of 2, which has a higher exponent than the input. This transformation is valid as only the highest set bit
  // contributes to the leading number of zeros.
  vpsrld(dst, src, 1, vec_enc);
  vpandn(dst, dst, src, vec_enc);

  vcvtdq2ps(dst, dst, vec_enc);

  // By comparing the register to itself, all the bits in the destination are set.
  vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);

  // Move the biased exponent to the low end of the lane and mask with 0xFF to discard the sign bit.
  vpsrld(xtmp2, xtmp1, 24, vec_enc);  // xtmp2 = 0x000000FF per lane
  vpsrld(dst, dst, 23, vec_enc);
  vpand(dst, xtmp2, dst, vec_enc);

  // Subtract 127 from the exponent, which removes the bias from the exponent.
  vpsrld(xtmp2, xtmp1, 25, vec_enc);  // xtmp2 = 127 per lane
  vpsubd(dst, dst, xtmp2, vec_enc);

  vpsrld(xtmp2, xtmp1, 27, vec_enc);  // xtmp2 = 31 per lane

  // If the original value is 0 the exponent would not have bias, so the subtraction creates a negative number. If this
  // is found in any of the lanes, replace the lane with -1 from xtmp1.
  vblendvps(dst, dst, xtmp1, dst, vec_enc, true, xtmp3);

  // If the original value is negative, replace the lane with 31.
  vblendvps(dst, dst, xtmp2, src, vec_enc, true, xtmp3);

  // Subtract the exponent from 31, giving the final result. For 0, the result is 32 as the exponent was replaced with -1,
  // and for negative numbers the result is 0 as the exponent was replaced with 31.
  vpsubd(dst, xtmp2, dst, vec_enc);
}
6179
// Per-long leading zero count for AVX (non-EVEX) targets. Computes 32-bit
// counts for both dword halves of each long, then merges: if the upper half is
// all zeros (count 32) the lower half's count is added, otherwise the upper
// half's count stands alone. Clobbers xtmp1-xtmp3; rtmp is unused here but
// kept for signature symmetry with the sibling helpers.
void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                            XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  // Find the leading zeros of the top and bottom halves of the long individually.
  vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);

  // Move the top half result to the bottom half of xtmp1, setting the top half to 0.
  vpsrlq(xtmp1, dst, 32, vec_enc);
  // By moving the top half result to the right by 6 bits, if the top half was empty (i.e. 32 is returned) the result bit will
  // be in the most significant position of the bottom half.
  vpsrlq(xtmp2, dst, 6, vec_enc);

  // In the bottom half, add the top half and bottom half results.
  vpaddq(dst, xtmp1, dst, vec_enc);

  // For the bottom half, choose between the values using the most significant bit of xtmp2.
  // If the MSB is set, then bottom+top in dst is the resulting value. If the top half is less than 32 xtmp1 is chosen,
  // which contains only the top half result.
  // In the top half the MSB is always zero, so the value in xtmp1 is always chosen. This value is always 0, which clears
  // the lane as required.
  vblendvps(dst, xtmp1, dst, xtmp2, vec_enc, true, xtmp3);
}
6201
6202 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
6203 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6204 Register rtmp, int vec_enc) {
6205 assert(is_integral_type(bt), "unexpected type");
6206 assert(vec_enc < Assembler::AVX_512bit, "");
6207 switch(bt) {
6208 case T_LONG:
6209 vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6210 break;
6211 case T_INT:
6212 vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
6213 break;
6214 case T_SHORT:
6215 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6216 break;
6217 case T_BYTE:
6218 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6219 break;
6220 default:
6221 fatal("Unsupported type %s", type2name(bt));
6222 break;
6223 }
6224 }
6225
6226 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
6227 switch(bt) {
6228 case T_BYTE:
6229 vpsubb(dst, src1, src2, vec_enc);
6230 break;
6231 case T_SHORT:
6232 vpsubw(dst, src1, src2, vec_enc);
6233 break;
6234 case T_INT:
6235 vpsubd(dst, src1, src2, vec_enc);
6236 break;
6237 case T_LONG:
6238 vpsubq(dst, src1, src2, vec_enc);
6239 break;
6240 default:
6241 fatal("Unsupported type %s", type2name(bt));
6242 break;
6243 }
6244 }
6245
// Trailing zero count computation is based on leading zero count operation as per
// following equation. All AVX3 targets support AVX512CD feature which offers
// direct vector instruction to compute leading zero count.
// CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
// Clobbers xtmp1-xtmp4, ktmp and rtmp.
void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
                                                         XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "");
  // xtmp = -1
  vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
  // xtmp = xtmp + src  (i.e. src - 1)
  vpadd(bt, xtmp4, xtmp4, src, vec_enc);
  // xtmp = xtmp & ~src  (isolates the trailing-zero run as a low mask)
  vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
  vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
  // dst = type-width - clz, per the equation above.
  vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
  vpsub(bt, dst, xtmp4, dst, vec_enc);
}
6264
// Trailing zero count computation for AVX2 targets is based on popcount operation as per following equation
// CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
// Clobbers xtmp1-xtmp3 and rtmp.
void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "");
  // xtmp = 0
  vpxor(xtmp3 , xtmp3, xtmp3, vec_enc);
  // xtmp = 0 - src
  vpsub(bt, xtmp3, xtmp3, src, vec_enc);
  // xtmp = xtmp | src  (sets all bits from the lowest set bit upward)
  vpor(xtmp3, xtmp3, src, vec_enc);
  vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
  // dst = type-width - popcount, per the equation above.
  vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
  vpsub(bt, dst, xtmp1, dst, vec_enc);
}
6280
// Unsigned 32-bit division: dividend in rax, quotient left in rax, rdx clobbered.
// When the divisor has its sign bit set (unsigned value >= 2^31), the quotient
// can only be 0 or 1, so a branch-free sequence avoids the expensive div.
void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // General case: zero-extend dividend into rdx:rax and divide.
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
  movl(rdx, rax);
  subl(rdx, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rdx, rax);  // rax = ~rdx & rax in one instruction
  } else {
    notl(rdx);
    andl(rax, rdx);
  }
  shrl(rax, 31);  // quotient is the resulting sign bit (0 or 1)
  bind(done);
}
6304
// Unsigned 32-bit remainder: dividend in rax, remainder left in rdx, rax clobbered.
// Divisors with the sign bit set take a branch-free fastpath that avoids div.
void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // General case: zero-extend dividend into rdx:rax; divl leaves remainder in rdx.
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath when divisor < 0:
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
  movl(rdx, rax);        // keep the dividend in rdx
  subl(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rax, rdx);
  } else {
    notl(rax);
    andl(rax, rdx);
  }
  sarl(rax, 31);         // all-ones if quotient is 1, else zero
  andl(rax, divisor);    // divisor if quotient is 1, else 0
  subl(rdx, rax);        // remainder = dividend - quotient * divisor
  bind(done);
}
6330
// Combined unsigned 32-bit divide and remainder: dividend in rax; on return the
// quotient is in rax and the remainder in rdx. tmp is a scratch register used
// only on the negative-divisor fastpath.
void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
  Label done;
  Label neg_divisor_fastpath;

  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // General case: divl leaves quotient in rax and remainder in rdx.
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
  // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
  movl(rdx, rax);        // keep the dividend in rdx
  subl(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rax, rdx);
  } else {
    notl(rax);
    andl(rax, rdx);
  }
  movl(tmp, rax);        // shared intermediate feeds both results
  shrl(rax, 31); // quotient
  sarl(tmp, 31);         // all-ones if quotient is 1, else zero
  andl(tmp, divisor);
  subl(rdx, tmp); // remainder
  bind(done);
}
6361
// Reverse the bit order of a 32-bit value (Integer.reverse intrinsic).
// On GFNI-capable CPUs a single Galois-field affine transform reverses the bits
// within each byte; otherwise bits are swapped pairwise at widths 1, 2 and 4.
// Both paths finish with bswapl to reverse the byte order. Clobbers rtmp and,
// on the GFNI path, xtmp1/xtmp2.
void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
                                 XMMRegister xtmp2, Register rtmp) {
  if(VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal based on following algorithm.
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
    // 0x8040201008040201 is the identity matrix; gf2p8affineqb with it reverses
    // the bits of each byte.
    mov64(rtmp, 0x8040201008040201L);
    movq(xtmp1, src);
    movq(xtmp2, rtmp);
    gf2p8affineqb(xtmp1, xtmp2, 0);
    movq(dst, xtmp1);
  } else {
    // Swap even and odd numbered bits.
    movl(rtmp, src);
    andl(rtmp, 0x55555555);
    shll(rtmp, 1);
    movl(dst, src);
    andl(dst, 0xAAAAAAAA);
    shrl(dst, 1);
    orl(dst, rtmp);

    // Swap LSB and MSB 2 bits of each nibble.
    movl(rtmp, dst);
    andl(rtmp, 0x33333333);
    shll(rtmp, 2);
    andl(dst, 0xCCCCCCCC);
    shrl(dst, 2);
    orl(dst, rtmp);

    // Swap LSB and MSB 4 bits of each byte.
    movl(rtmp, dst);
    andl(rtmp, 0x0F0F0F0F);
    shll(rtmp, 4);
    andl(dst, 0xF0F0F0F0);
    shrl(dst, 4);
    orl(dst, rtmp);
  }
  // Bits within each byte are now reversed; reverse the byte order to finish.
  bswapl(dst);
}
6400
// Reverse the bit order of a 64-bit value (Long.reverse intrinsic). Mirrors
// reverseI: a GFNI affine transform where available, otherwise pairwise bit
// swaps at widths 1, 2 and 4 followed by bswapq. Clobbers rtmp1, rtmp2 and,
// on the GFNI path, xtmp1/xtmp2.
void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
                                 XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
  if(VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal based on following algorithm.
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
    mov64(rtmp1, 0x8040201008040201L);
    movq(xtmp1, src);
    movq(xtmp2, rtmp1);
    gf2p8affineqb(xtmp1, xtmp2, 0);
    movq(dst, xtmp1);
  } else {
    // Swap even and odd numbered bits.
    movq(rtmp1, src);
    mov64(rtmp2, 0x5555555555555555L);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 1);
    movq(dst, src);
    notq(rtmp2);           // complement mask: 0xAAAA...
    andq(dst, rtmp2);
    shrq(dst, 1);
    orq(dst, rtmp1);

    // Swap LSB and MSB 2 bits of each nibble.
    movq(rtmp1, dst);
    mov64(rtmp2, 0x3333333333333333L);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 2);
    notq(rtmp2);           // complement mask: 0xCCCC...
    andq(dst, rtmp2);
    shrq(dst, 2);
    orq(dst, rtmp1);

    // Swap LSB and MSB 4 bits of each byte.
    movq(rtmp1, dst);
    mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 4);
    notq(rtmp2);           // complement mask: 0xF0F0...
    andq(dst, rtmp2);
    shrq(dst, 4);
    orq(dst, rtmp1);
  }
  // Bits within each byte are now reversed; reverse the byte order to finish.
  bswapq(dst);
}
6445
// Unsigned 64-bit division: dividend in rax, quotient left in rax, rdx clobbered.
// When the divisor has its sign bit set (unsigned value >= 2^63), the quotient
// can only be 0 or 1, so a branch-free sequence avoids the expensive div.
void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // General case: zero-extend dividend into rdx:rax and divide.
  xorl(rdx, rdx);
  divq(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
  movq(rdx, rax);
  subq(rdx, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rdx, rax);  // rax = ~rdx & rax in one instruction
  } else {
    notq(rdx);
    andq(rax, rdx);
  }
  shrq(rax, 63);  // quotient is the resulting sign bit (0 or 1)
  bind(done);
}
6469
// Unsigned 64-bit remainder: dividend in rax, remainder left in rdx, rax clobbered.
// Divisors with the sign bit set take a branch-free fastpath that avoids div.
void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // General case: zero-extend dividend into rdx:rax; divq leaves remainder in rdx.
  xorq(rdx, rdx);
  divq(divisor);
  jmp(done);
  bind(neg_divisor_fastpath);
  // Fastpath when divisor < 0:
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
  movq(rdx, rax);        // keep the dividend in rdx
  subq(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rax, rdx);
  } else {
    notq(rax);
    andq(rax, rdx);
  }
  sarq(rax, 63);         // all-ones if quotient is 1, else zero
  andq(rax, divisor);    // divisor if quotient is 1, else 0
  subq(rdx, rax);        // remainder = dividend - quotient * divisor
  bind(done);
}
6495
// Combined unsigned 64-bit divide and remainder: dividend in rax; on return the
// quotient is in rax and the remainder in rdx. tmp is a scratch register used
// only on the negative-divisor fastpath.
void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // General case: divq leaves quotient in rax and remainder in rdx.
  xorq(rdx, rdx);
  divq(divisor);
  jmp(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
  // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
  movq(rdx, rax);        // keep the dividend in rdx
  subq(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rax, rdx);
  } else {
    notq(rax);
    andq(rax, rdx);
  }
  movq(tmp, rax);        // shared intermediate feeds both results
  shrq(rax, 63); // quotient
  sarq(tmp, 63);         // all-ones if quotient is 1, else zero
  andq(tmp, divisor);
  subq(rdx, tmp); // remainder
  bind(done);
}
6525
// Cross-lane byte rearrange (shuffle) for 512-bit vectors on AVX512BW targets
// without VBMI. vpshufb only shuffles within 128-bit lanes, so the source is
// broadcast one 128-bit lane at a time and shuffled under a mask that selects
// the shuffle indices belonging to that lane (0-15, 16-31, 32-47, 48-63).
// Clobbers xtmp1-xtmp3, rtmp and ktmp.
void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
                                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
                                        int vlen_enc) {
  assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are inlane operations and indices are determined using
  // lower 4 bit of each shuffle lane, thus all shuffle indices are
  // normalized to index range 0-15. This makes sure that all the multiples
  // of an index value are placed at same relative position in 128 bit
  // lane i.e. elements corresponding to shuffle indices 16, 32 and 64
  // will be 16th element in their respective 128 bit lanes.
  movl(rtmp, 16);
  evpbroadcastb(xtmp1, rtmp, vlen_enc);  // xtmp1 = 16 in every byte lane

  // Compute a mask for shuffle vector by comparing indices with expression INDEX < 16,
  // Broadcast first 128 bit lane across entire vector, shuffle the vector lanes using
  // original shuffle indices and move the shuffled lanes corresponding to true
  // mask to destination vector.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
  evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);

  // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
  // and broadcasting second 128 bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
  vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);   // xtmp2 = 32 in every byte lane
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
  // and broadcasting third 128 bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp2, Assembler::nlt, true, vlen_enc);
  vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc); // xtmp1 = 48 in every byte lane
  evpcmpb(ktmp, ktmp, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
  // and broadcasting fourth 128 bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
  vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);   // xtmp2 = 64 in every byte lane
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
}
6571
6572 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
6573 XMMRegister shuffle, XMMRegister src, int vlen_enc) {
6574 if (vlen_enc == AVX_128bit) {
6575 vpermilps(dst, src, shuffle, vlen_enc);
6576 } else if (bt == T_INT) {
6577 vpermd(dst, shuffle, src, vlen_enc);
6578 } else {
6579 assert(bt == T_FLOAT, "");
6580 vpermps(dst, shuffle, src, vlen_enc);
6581 }
6582 }
6583
6584 void C2_MacroAssembler::efp16sh(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
6585 switch(opcode) {
6586 case Op_AddHF: vaddsh(dst, src1, src2); break;
6587 case Op_SubHF: vsubsh(dst, src1, src2); break;
6588 case Op_MulHF: vmulsh(dst, src1, src2); break;
6589 case Op_DivHF: vdivsh(dst, src1, src2); break;
6590 default: assert(false, "%s", NodeClassNames[opcode]); break;
6591 }
6592 }
6593
6594 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6595 switch(elem_bt) {
6596 case T_BYTE:
6597 if (ideal_opc == Op_SaturatingAddV) {
6598 vpaddsb(dst, src1, src2, vlen_enc);
6599 } else {
6600 assert(ideal_opc == Op_SaturatingSubV, "");
6601 vpsubsb(dst, src1, src2, vlen_enc);
6602 }
6603 break;
6604 case T_SHORT:
6605 if (ideal_opc == Op_SaturatingAddV) {
6606 vpaddsw(dst, src1, src2, vlen_enc);
6607 } else {
6608 assert(ideal_opc == Op_SaturatingSubV, "");
6609 vpsubsw(dst, src1, src2, vlen_enc);
6610 }
6611 break;
6612 default:
6613 fatal("Unsupported type %s", type2name(elem_bt));
6614 break;
6615 }
6616 }
6617
6618 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6619 switch(elem_bt) {
6620 case T_BYTE:
6621 if (ideal_opc == Op_SaturatingAddV) {
6622 vpaddusb(dst, src1, src2, vlen_enc);
6623 } else {
6624 assert(ideal_opc == Op_SaturatingSubV, "");
6625 vpsubusb(dst, src1, src2, vlen_enc);
6626 }
6627 break;
6628 case T_SHORT:
6629 if (ideal_opc == Op_SaturatingAddV) {
6630 vpaddusw(dst, src1, src2, vlen_enc);
6631 } else {
6632 assert(ideal_opc == Op_SaturatingSubV, "");
6633 vpsubusw(dst, src1, src2, vlen_enc);
6634 }
6635 break;
6636 default:
6637 fatal("Unsupported type %s", type2name(elem_bt));
6638 break;
6639 }
6640 }
6641
// Unsigned saturating subtraction for T_INT/T_LONG on EVEX targets: lanes where
// src2 is (unsigned) greater than src1 would underflow and are clamped to zero
// via a masked (zeroing) subtract. Clobbers ktmp.
void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                               XMMRegister src2, KRegister ktmp, int vlen_enc) {
  // For unsigned subtraction, overflow happens when magnitude of second input is greater than first input.
  // overflow_mask = Inp1 <u Inp2
  evpcmpu(elem_bt, ktmp, src2, src1, Assembler::lt, vlen_enc);
  // Res = overflow_mask ? Zero : INP1 - INP2 (non-commutative and non-associative)
  evmasked_op(elem_bt == T_INT ? Op_SubVI : Op_SubVL, elem_bt, ktmp, dst, src1, src2, false, vlen_enc, false);
}
6650
// Unsigned saturating subtraction for T_INT/T_LONG on AVX (non-EVEX) targets.
// There is no unsigned compare, so both inputs are biased by MIN_VALUE to turn
// the unsigned comparison into a signed one; underflowing lanes are then
// blended to zero. Clobbers xtmp1/xtmp2.
void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                              XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  // Emulate unsigned comparison using signed comparison
  // Mask = Inp1 <u Inp2 => Inp1 + MIN_VALUE < Inp2 + MIN_VALUE
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc, true);  // xtmp1 = MIN_VALUE per lane
  vpadd(elem_bt, xtmp2, src1, xtmp1, vlen_enc);           // biased src1
  vpadd(elem_bt, xtmp1, src2, xtmp1, vlen_enc);           // biased src2

  vpcmpgt(elem_bt, xtmp2, xtmp1, xtmp2, vlen_enc);        // xtmp2 = underflow mask

  // Res = INP1 - INP2 (non-commutative and non-associative)
  vpsub(elem_bt, dst, src1, src2, vlen_enc);
  // Res = Mask ? Zero : Res
  vpxor(xtmp1, xtmp1, xtmp1, vlen_enc);
  vpblendvb(dst, dst, xtmp1, xtmp2, vlen_enc);
}
6667
// Unsigned saturating addition for T_INT/T_LONG on EVEX targets: overflowing
// lanes (detected via the (a+b) <u (a|b) identity, see the comment block ahead
// of the AVX variant) are clamped to the unsigned maximum. Clobbers
// xtmp1/xtmp2/ktmp.
void C2_MacroAssembler::vector_add_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                               XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp, int vlen_enc) {
  // Unsigned values ranges comprise of only +ve numbers, thus there exist only an upper bound saturation.
  // overflow_mask = (SRC1 + SRC2) <u (SRC1 | SRC2)
  // Res = Signed Add INP1, INP2
  vpadd(elem_bt, dst, src1, src2, vlen_enc);
  // T1 = SRC1 | SRC2
  vpor(xtmp1, src1, src2, vlen_enc);
  // Max_Unsigned = -1
  vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
  // Unsigned compare: Mask = Res <u T1
  evpcmpu(elem_bt, ktmp, dst, xtmp1, Assembler::lt, vlen_enc);
  // res = Mask ? Max_Unsigned : Res
  evpblend(elem_bt, dst, ktmp, dst, xtmp2, true, vlen_enc);
}
6683
6684 //
6685 // Section 2-13 Hacker's Delight list following overflow detection check for saturating
6686 // unsigned addition operation.
6687 // overflow_mask = ((a & b) | ((a | b) & ~( a + b))) >>> 31 == 1
6688 //
6689 // We empirically determined its semantic equivalence to following reduced expression
6690 // overflow_mask = (a + b) <u (a | b)
6691 //
6692 // and also verified it though Alive2 solver.
6693 // (https://alive2.llvm.org/ce/z/XDQ7dY)
6694 //
6695
// Unsigned saturating addition for T_INT/T_LONG on AVX (non-EVEX) targets.
// Uses the (a+b) <u (a|b) overflow identity from the comment block above; the
// unsigned compare is emulated by biasing both sides with MIN_VALUE before a
// signed compare. Overflowing lanes are blended with -1 (unsigned maximum).
// Clobbers xtmp1-xtmp3.
void C2_MacroAssembler::vector_add_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                              XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, int vlen_enc) {
  // Res = Signed Add INP1, INP2
  vpadd(elem_bt, dst, src1, src2, vlen_enc);
  // Compute T1 = INP1 | INP2
  vpor(xtmp3, src1, src2, vlen_enc);
  // xtmp2 = minimum signed value; as a side effect xtmp1 is left holding -1
  // (all-ones), which is reused as the saturation value in the final blend.
  vpgenmin_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
  // Convert T1 to signed value, T1 = T1 + MIN_VALUE
  vpadd(elem_bt, xtmp3, xtmp3, xtmp2, vlen_enc);
  // Convert Res to signed value, Res<s> = Res + MIN_VALUE
  vpadd(elem_bt, xtmp2, xtmp2, dst, vlen_enc);
  // Compute overflow detection mask = Res<s> <s T1
  if (elem_bt == T_INT) {
    vpcmpgtd(xtmp3, xtmp3, xtmp2, vlen_enc);
  } else {
    assert(elem_bt == T_LONG, "");
    vpcmpgtq(xtmp3, xtmp3, xtmp2, vlen_enc);
  }
  // Overflowing lanes take -1 (unsigned max) from xtmp1.
  vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
}
6717
// Extract the sign bit of each qword lane of src into the opmask ktmp. Uses the
// native evpmovq2m when AVX512DQ is available; otherwise emulates it by
// arithmetic-shifting each lane right by 63 (replicating the sign bit) and
// comparing against all-ones. Callers that already hold -1 in xtmp2 may pass
// xtmp2_hold_M1=true to skip regenerating it. Clobbers xtmp1 (and xtmp2 when
// xtmp2_hold_M1 is false) on the emulation path.
void C2_MacroAssembler::evpmovq2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                      int vlen_enc, bool xtmp2_hold_M1) {
  if (VM_Version::supports_avx512dq()) {
    evpmovq2m(ktmp, src, vlen_enc);
  } else {
    assert(VM_Version::supports_evex(), "");
    if (!xtmp2_hold_M1) {
      vpternlogq(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);  // xtmp2 = -1
    }
    evpsraq(xtmp1, src, 63, vlen_enc);                  // lane = sign ? -1 : 0
    evpcmpeqq(ktmp, k0, xtmp1, xtmp2, vlen_enc);        // mask = lanes equal to -1
  }
}
6731
// Extract the sign bit of each dword lane of src into the opmask ktmp. Uses the
// native evpmovd2m when AVX512DQ is available; otherwise emulates it by
// arithmetic-shifting each lane right by 31 and comparing against all-ones.
// Callers that already hold -1 in xtmp2 may pass xtmp2_hold_M1=true to skip
// regenerating it. Clobbers xtmp1 (and xtmp2 when xtmp2_hold_M1 is false) on
// the emulation path.
void C2_MacroAssembler::evpmovd2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                      int vlen_enc, bool xtmp2_hold_M1) {
  if (VM_Version::supports_avx512dq()) {
    evpmovd2m(ktmp, src, vlen_enc);
  } else {
    assert(VM_Version::supports_evex(), "");
    if (!xtmp2_hold_M1) {
      vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);       // xtmp2 = -1
    }
    vpsrad(xtmp1, src, 31, vlen_enc);                        // lane = sign ? -1 : 0
    Assembler::evpcmpeqd(ktmp, k0, xtmp1, xtmp2, vlen_enc);  // mask = lanes equal to -1
  }
}
6745
6746
// Broadcast the sign bit of each T_INT/T_LONG lane across the whole lane:
// dst lane = (src lane < 0) ? -1 : 0. Non-EVEX targets lack a 64-bit
// arithmetic shift, so the qword case is emulated by shifting dwords and
// replicating the odd (upper) dwords with vpshufd(0xF5).
void C2_MacroAssembler::vpsign_extend_dq(BasicType elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
  if (elem_bt == T_LONG) {
    if (VM_Version::supports_evex()) {
      evpsraq(dst, src, 63, vlen_enc);
    } else {
      vpsrad(dst, src, 31, vlen_enc);
      vpshufd(dst, dst, 0xF5, vlen_enc);  // copy each upper dword over its pair
    }
  } else {
    assert(elem_bt == T_INT, "");
    vpsrad(dst, src, 31, vlen_enc);
  }
}
6760
// Materialize the maximum signed value (0x7FFF...) of the given element type in
// every lane of dst by logically shifting an all-ones vector right by one. If
// compute_allones is true, 'allones' is first filled with -1 (and thus
// clobbered); otherwise it is assumed to already hold -1 in every lane.
void C2_MacroAssembler::vpgenmax_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
  if (compute_allones) {
    if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
      vpternlogd(allones, 0xff, allones, allones, vlen_enc);
    } else {
      vpcmpeqq(allones, allones, allones, vlen_enc);
    }
  }
  if (elem_bt == T_LONG) {
    vpsrlq(dst, allones, 1, vlen_enc);
  } else {
    assert(elem_bt == T_INT, "");
    vpsrld(dst, allones, 1, vlen_enc);
  }
}
6776
// Materialize the minimum signed value (0x8000...) of the given element type in
// every lane of dst by shifting an all-ones vector left so only the sign bit
// survives. If compute_allones is true, 'allones' is first filled with -1 (and
// thus clobbered); otherwise it is assumed to already hold -1 in every lane.
void C2_MacroAssembler::vpgenmin_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
  if (compute_allones) {
    if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
      vpternlogd(allones, 0xff, allones, allones, vlen_enc);
    } else {
      vpcmpeqq(allones, allones, allones, vlen_enc);
    }
  }
  if (elem_bt == T_LONG) {
    vpsllq(dst, allones, 63, vlen_enc);
  } else {
    assert(elem_bt == T_INT, "");
    vpslld(dst, allones, 31, vlen_enc);
  }
}
6792
6793 void C2_MacroAssembler::evpcmpu(BasicType elem_bt, KRegister kmask, XMMRegister src1, XMMRegister src2,
6794 Assembler::ComparisonPredicate cond, int vlen_enc) {
6795 switch(elem_bt) {
6796 case T_LONG: evpcmpuq(kmask, src1, src2, cond, vlen_enc); break;
6797 case T_INT: evpcmpud(kmask, src1, src2, cond, vlen_enc); break;
6798 case T_SHORT: evpcmpuw(kmask, src1, src2, cond, vlen_enc); break;
6799 case T_BYTE: evpcmpub(kmask, src1, src2, cond, vlen_enc); break;
6800 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6801 }
6802 }
6803
6804 void C2_MacroAssembler::vpcmpgt(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6805 switch(elem_bt) {
6806 case T_LONG: vpcmpgtq(dst, src1, src2, vlen_enc); break;
6807 case T_INT: vpcmpgtd(dst, src1, src2, vlen_enc); break;
6808 case T_SHORT: vpcmpgtw(dst, src1, src2, vlen_enc); break;
6809 case T_BYTE: vpcmpgtb(dst, src1, src2, vlen_enc); break;
6810 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6811 }
6812 }
6813
// Convert a T_INT/T_LONG vector of lane sign bits into an opmask register,
// delegating to the width-specific (possibly emulated) mov-to-mask helpers.
// See evpmovq2m_emu/evpmovd2m_emu for the xtmp clobbers and the meaning of
// xtmp2_hold_M1.
void C2_MacroAssembler::evpmov_vec_to_mask(BasicType elem_bt, KRegister ktmp, XMMRegister src, XMMRegister xtmp1,
                                           XMMRegister xtmp2, int vlen_enc, bool xtmp2_hold_M1) {
  if (elem_bt == T_LONG) {
    evpmovq2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
  } else {
    assert(elem_bt == T_INT, "");
    evpmovd2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
  }
}
6823
// Signed saturating add/sub for T_INT/T_LONG on EVEX targets. Performs the
// wrapping operation first, detects per-lane signed overflow (Hacker's Delight
// section 2-13), then blends the saturation value (MAX_VALUE or MIN_VALUE,
// selected by the first input's sign) into overflowing lanes.
// Clobbers xtmp1, xtmp2, ktmp1 and ktmp2.
void C2_MacroAssembler::vector_addsub_dq_saturating_evex(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                         XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
                                                         KRegister ktmp1, KRegister ktmp2, int vlen_enc) {
  assert(elem_bt == T_INT || elem_bt == T_LONG, "");
  // Addition/Subtraction happens over two's compliment representation of numbers and is agnostic to signed'ness.
  // Overflow detection based on Hacker's delight section 2-13.
  if (ideal_opc == Op_SaturatingAddV) {
    // res = src1 + src2
    vpadd(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs if result polarity does not comply with equivalent polarity inputs.
    // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
    vpxor(xtmp1, dst, src1, vlen_enc);
    vpxor(xtmp2, dst, src2, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  } else {
    assert(ideal_opc == Op_SaturatingSubV, "");
    // res = src1 - src2
    vpsub(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs when both inputs have opposite polarity and
    // result polarity does not comply with first input polarity.
    // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
    vpxor(xtmp1, src1, src2, vlen_enc);
    vpxor(xtmp2, dst, src1, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  }

  // Compute overflow detection mask.
  evpmov_vec_to_mask(elem_bt, ktmp1, xtmp2, xtmp2, xtmp1, vlen_enc);
  // Note: xtmp1 hold -1 in all its lanes after above call.

  // Compute mask based on first input polarity.
  evpmov_vec_to_mask(elem_bt, ktmp2, src1, xtmp2, xtmp1, vlen_enc, true);

  vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);

  // Compose a vector of saturating (MAX/MIN) values, where lanes corresponding to
  // set bits in first input polarity mask holds a min value.
  evpblend(elem_bt, xtmp2, ktmp2, xtmp2, xtmp1, true, vlen_enc);
  // Blend destination lanes with saturated values using overflow detection mask.
  evpblend(elem_bt, dst, ktmp1, dst, xtmp2, true, vlen_enc);
}
6866
6867
// Signed saturating add/sub for T_INT/T_LONG on AVX (non-EVEX) targets. Same
// algorithm as the EVEX variant, but overflow and sign masks live in vector
// registers (via sign-bit broadcast) and blending uses vpblendvb instead of
// opmask registers. Clobbers xtmp1-xtmp4.
void C2_MacroAssembler::vector_addsub_dq_saturating_avx(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                        XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
                                                        XMMRegister xtmp3, XMMRegister xtmp4, int vlen_enc) {
  assert(elem_bt == T_INT || elem_bt == T_LONG, "");
  // Addition/Subtraction happens over two's compliment representation of numbers and is agnostic to signed'ness.
  // Overflow detection based on Hacker's delight section 2-13.
  if (ideal_opc == Op_SaturatingAddV) {
    // res = src1 + src2
    vpadd(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs if result polarity does not comply with equivalent polarity inputs.
    // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
    vpxor(xtmp1, dst, src1, vlen_enc);
    vpxor(xtmp2, dst, src2, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  } else {
    assert(ideal_opc == Op_SaturatingSubV, "");
    // res = src1 - src2
    vpsub(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs when both inputs have opposite polarity and
    // result polarity does not comply with first input polarity.
    // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
    vpxor(xtmp1, src1, src2, vlen_enc);
    vpxor(xtmp2, dst, src1, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  }

  // Sign-extend to compute overflow detection mask.
  vpsign_extend_dq(elem_bt, xtmp3, xtmp2, vlen_enc);

  vpcmpeqd(xtmp1, xtmp1, xtmp1, vlen_enc);  // xtmp1 = -1 in all lanes
  vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc);
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);

  // Compose saturating min/max vector using first input polarity mask.
  vpsign_extend_dq(elem_bt, xtmp4, src1, vlen_enc);
  vpblendvb(xtmp1, xtmp2, xtmp1, xtmp4, vlen_enc);

  // Blend result with saturating vector using overflow detection mask.
  vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
}
6908
6909 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
6910 switch(elem_bt) {
6911 case T_BYTE:
6912 if (ideal_opc == Op_SaturatingAddV) {
6913 vpaddsb(dst, src1, src2, vlen_enc);
6914 } else {
6915 assert(ideal_opc == Op_SaturatingSubV, "");
6916 vpsubsb(dst, src1, src2, vlen_enc);
6917 }
6918 break;
6919 case T_SHORT:
6920 if (ideal_opc == Op_SaturatingAddV) {
6921 vpaddsw(dst, src1, src2, vlen_enc);
6922 } else {
6923 assert(ideal_opc == Op_SaturatingSubV, "");
6924 vpsubsw(dst, src1, src2, vlen_enc);
6925 }
6926 break;
6927 default:
6928 fatal("Unsupported type %s", type2name(elem_bt));
6929 break;
6930 }
6931 }
6932
6933 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
6934 switch(elem_bt) {
6935 case T_BYTE:
6936 if (ideal_opc == Op_SaturatingAddV) {
6937 vpaddusb(dst, src1, src2, vlen_enc);
6938 } else {
6939 assert(ideal_opc == Op_SaturatingSubV, "");
6940 vpsubusb(dst, src1, src2, vlen_enc);
6941 }
6942 break;
6943 case T_SHORT:
6944 if (ideal_opc == Op_SaturatingAddV) {
6945 vpaddusw(dst, src1, src2, vlen_enc);
6946 } else {
6947 assert(ideal_opc == Op_SaturatingSubV, "");
6948 vpsubusw(dst, src1, src2, vlen_enc);
6949 }
6950 break;
6951 default:
6952 fatal("Unsupported type %s", type2name(elem_bt));
6953 break;
6954 }
6955 }
6956
6957 void C2_MacroAssembler::select_from_two_vectors_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6958 XMMRegister src2, int vlen_enc) {
6959 switch(elem_bt) {
6960 case T_BYTE:
6961 evpermi2b(dst, src1, src2, vlen_enc);
6962 break;
6963 case T_SHORT:
6964 evpermi2w(dst, src1, src2, vlen_enc);
6965 break;
6966 case T_INT:
6967 evpermi2d(dst, src1, src2, vlen_enc);
6968 break;
6969 case T_LONG:
6970 evpermi2q(dst, src1, src2, vlen_enc);
6971 break;
6972 case T_FLOAT:
6973 evpermi2ps(dst, src1, src2, vlen_enc);
6974 break;
6975 case T_DOUBLE:
6976 evpermi2pd(dst, src1, src2, vlen_enc);
6977 break;
6978 default:
6979 fatal("Unsupported type %s", type2name(elem_bt));
6980 break;
6981 }
6982 }
6983
6984 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, bool is_unsigned, int vlen_enc) {
6985 if (is_unsigned) {
6986 vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
6987 } else {
6988 vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
6989 }
6990 }
6991
6992 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, bool is_unsigned, int vlen_enc) {
6993 if (is_unsigned) {
6994 vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
6995 } else {
6996 vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
6997 }
6998 }
6999
7000 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
7001 switch(opcode) {
7002 case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
7003 case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
7004 case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
7005 case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
7006 default: assert(false, "%s", NodeClassNames[opcode]); break;
7007 }
7008 }
7009
7010 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
7011 switch(opcode) {
7012 case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
7013 case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
7014 case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
7015 case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
7016 default: assert(false, "%s", NodeClassNames[opcode]); break;
7017 }
7018 }
7019
7020 void C2_MacroAssembler::scalar_max_min_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
7021 KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2) {
7022 vector_max_min_fp16(opcode, dst, src1, src2, ktmp, xtmp1, xtmp2, Assembler::AVX_128bit);
7023 }
7024
// Emit a Java-semantics max/min over FP16 lanes. The hardware VMAXPH/VMINPH
// return the second source when either input is NaN or when both inputs are
// zeros of opposite sign, so the inputs are pre-swapped per lane (using the
// sign-bit mask) to make the second operand the preferred one, and a final
// masked move patches lanes whose first (post-swap) operand is NaN.
// Clobbers ktmp, xtmp1 and xtmp2.
void C2_MacroAssembler::vector_max_min_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                            KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  if (opcode == Op_MaxVHF || opcode == Op_MaxHF) {
    // Move sign bits of src2 to mask register.
    evpmovw2m(ktmp, src2, vlen_enc);
    // xtmp1 = src2 < 0 ? src2 : src1
    evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
    // xtmp2 = src2 < 0 ? src1 : src2
    evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
    // Idea behind above swapping is to make second source operand a +ve value.
    // As per instruction semantic, if the values being compared are both 0.0s (of either sign), the value in
    // the second source operand is returned. If only one value is a NaN (SNaN or QNaN) for this instruction,
    // the second source operand, either a NaN or a valid floating-point value, is returned
    // dst = max(xtmp1, xtmp2)
    evmaxph(dst, xtmp1, xtmp2, vlen_enc);
    // isNaN = is_unordered_quiet(xtmp1)
    evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
    // Final result is same as first source if it's a NaN value,
    // in case second operand holds a NaN value then as per above semantics
    // result is same as second operand.
    Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
  } else {
    assert(opcode == Op_MinVHF || opcode == Op_MinHF, "");
    // Move sign bits of src1 to mask register.
    evpmovw2m(ktmp, src1, vlen_enc);
    // xtmp1 = src1 < 0 ? src2 : src1
    evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
    // xtmp2 = src1 < 0 ? src1 : src2
    evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
    // Idea behind above swapping is to make second source operand a -ve value.
    // As per instruction semantics, if the values being compared are both 0.0s (of either sign), the value in
    // the second source operand is returned.
    // If only one value is a NaN (SNaN or QNaN) for this instruction, the second source operand, either a NaN
    // or a valid floating-point value, is written to the result.
    // dst = min(xtmp1, xtmp2)
    evminph(dst, xtmp1, xtmp2, vlen_enc);
    // isNaN = is_unordered_quiet(xtmp1)
    evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
    // Final result is same as first source if it's a NaN value,
    // in case second operand holds a NaN value then as per above semantics
    // result is same as second operand.
    Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
  }
}