1 /*
2 * Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25 #include "asm/assembler.hpp"
26 #include "asm/assembler.inline.hpp"
27 #include "gc/shared/barrierSet.hpp"
28 #include "gc/shared/barrierSetAssembler.hpp"
29 #include "oops/methodData.hpp"
30 #include "opto/c2_MacroAssembler.hpp"
31 #include "opto/intrinsicnode.hpp"
32 #include "opto/output.hpp"
33 #include "opto/opcodes.hpp"
34 #include "opto/subnode.hpp"
35 #include "runtime/globals.hpp"
36 #include "runtime/objectMonitor.hpp"
37 #include "runtime/stubRoutines.hpp"
38 #include "utilities/checkedCast.hpp"
39 #include "utilities/globalDefinitions.hpp"
40 #include "utilities/powerOfTwo.hpp"
41 #include "utilities/sizes.hpp"
42
43 #ifdef PRODUCT
44 #define BLOCK_COMMENT(str) /* nothing */
45 #define STOP(error) stop(error)
46 #else
47 #define BLOCK_COMMENT(str) block_comment(str)
48 #define STOP(error) block_comment(error); stop(error)
49 #endif
50
51 // C2 compiled method's prolog code.
52 void C2_MacroAssembler::verified_entry(Compile* C, int sp_inc) {
53 if (C->clinit_barrier_on_entry()) {
54 assert(VM_Version::supports_fast_class_init_checks(), "sanity");
55 assert(!C->method()->holder()->is_not_initialized(), "initialization should have been started");
56
57 Label L_skip_barrier;
58 Register klass = rscratch1;
59
60 mov_metadata(klass, C->method()->holder()->constant_encoding());
61 clinit_barrier(klass, &L_skip_barrier /*L_fast_path*/);
62
63 jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
64
65 bind(L_skip_barrier);
66 }
67
68 int framesize = C->output()->frame_size_in_bytes();
69 int bangsize = C->output()->bang_size_in_bytes();
70 bool fp_mode_24b = false;
71 int stack_bang_size = C->output()->need_stack_bang(bangsize) ? bangsize : 0;
72
73 assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
74
75 assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
76 // Remove word for return addr
77 framesize -= wordSize;
78 stack_bang_size -= wordSize;
79
  // Calls to C2R adapters often do not accept exceptional returns.
  // We require their callers to bang the stack for them. But be careful, because
  // some VM calls (such as call site linkage) can use several kilobytes of
  // stack; the stack safety zone should account for that.
84 // See bugs 4446381, 4468289, 4497237.
85 if (stack_bang_size > 0) {
86 generate_stack_overflow_check(stack_bang_size);
87
    // We always push rbp so that, on return to the interpreter, rbp will be
    // restored correctly and we can correct the stack.
90 push(rbp);
91 // Save caller's stack pointer into RBP if the frame pointer is preserved.
92 if (PreserveFramePointer) {
93 mov(rbp, rsp);
94 }
    // Remove word for rbp
96 framesize -= wordSize;
97
98 // Create frame
99 if (framesize) {
100 subptr(rsp, framesize);
101 }
102 } else {
103 subptr(rsp, framesize);
104
105 // Save RBP register now.
106 framesize -= wordSize;
107 movptr(Address(rsp, framesize), rbp);
108 // Save caller's stack pointer into RBP if the frame pointer is preserved.
109 if (PreserveFramePointer) {
110 movptr(rbp, rsp);
111 if (framesize > 0) {
112 addptr(rbp, framesize);
113 }
114 }
115 }
116
117 if (C->needs_stack_repair()) {
118 // Save stack increment just below the saved rbp (also account for fixed framesize and rbp)
119 assert((sp_inc & (StackAlignmentInBytes-1)) == 0, "stack increment not aligned");
120 movptr(Address(rsp, framesize - wordSize), sp_inc + framesize + wordSize);
121 }
122
  if (VerifyStackAtCalls) { // Magic cookie to verify stack depth
124 framesize -= wordSize;
125 movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
126 }
127
128 #ifdef ASSERT
129 if (VerifyStackAtCalls) {
130 Label L;
131 push(rax);
132 mov(rax, rsp);
133 andptr(rax, StackAlignmentInBytes-1);
134 cmpptr(rax, StackAlignmentInBytes-wordSize);
135 pop(rax);
136 jcc(Assembler::equal, L);
137 STOP("Stack is not properly aligned!");
138 bind(L);
139 }
140 #endif
141 }
142
143 void C2_MacroAssembler::entry_barrier() {
144 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
145 // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
146 Label dummy_slow_path;
147 Label dummy_continuation;
148 Label* slow_path = &dummy_slow_path;
149 Label* continuation = &dummy_continuation;
150 if (!Compile::current()->output()->in_scratch_emit_size()) {
151 // Use real labels from actual stub when not emitting code for the purpose of measuring its size
152 C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
153 Compile::current()->output()->add_stub(stub);
154 slow_path = &stub->entry();
155 continuation = &stub->continuation();
156 }
157 bs->nmethod_entry_barrier(this, slow_path, continuation);
158 }
159
160 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
161 switch (vlen_in_bytes) {
162 case 4: // fall-through
163 case 8: // fall-through
164 case 16: return Assembler::AVX_128bit;
165 case 32: return Assembler::AVX_256bit;
166 case 64: return Assembler::AVX_512bit;
167
168 default: {
169 ShouldNotReachHere();
170 return Assembler::AVX_NoVec;
171 }
172 }
173 }
174
175 // fast_lock and fast_unlock used by C2
176
177 // Because the transitions from emitted code to the runtime
178 // monitorenter/exit helper stubs are so slow it's critical that
179 // we inline both the stack-locking fast path and the inflated fast path.
180 //
181 // See also: cmpFastLock and cmpFastUnlock.
182 //
183 // What follows is a specialized inline transliteration of the code
// in enter() and exit(). If we're concerned about I$ bloat, another
// option would be to emit TrySlowEnter and TrySlowExit methods
// at startup time. These methods would accept arguments as
// (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
188 // indications in the icc.ZFlag. fast_lock and fast_unlock would simply
189 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
190 // In practice, however, the # of lock sites is bounded and is usually small.
191 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
195 //
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
197 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
198 // to those specialized methods. That'd give us a mostly platform-independent
199 // implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross into native code would be
// to park() or unpark() threads. We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) issue explicit barriers or fence operations.
204 //
205 // TODO:
206 //
207 // * Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
208 // This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
209 // Given TLAB allocation, Self is usually manifested in a register, so passing it into
210 // the lock operators would typically be faster than reifying Self.
211 //
212 // * Ideally I'd define the primitives as:
213 // fast_lock (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
214 // fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
215 // Unfortunately ADLC bugs prevent us from expressing the ideal form.
//   Instead, we're stuck with the rather awkward and brittle register assignments below.
//   Furthermore, the register assignments are overconstrained, possibly resulting in
//   sub-optimal code near the synchronization site.
219 //
220 // * Eliminate the sp-proximity tests and just use "== Self" tests instead.
221 // Alternately, use a better sp-proximity test.
222 //
223 // * Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
224 // Either one is sufficient to uniquely identify a thread.
225 // TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
226 //
227 // * Intrinsify notify() and notifyAll() for the common cases where the
//   object is locked by the calling thread but the waitlist is empty,
//   avoiding the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
230 //
231 // * use jccb and jmpb instead of jcc and jmp to improve code density.
232 // But beware of excessive branch density on AMD Opterons.
233 //
234 // * Both fast_lock and fast_unlock set the ICC.ZF to indicate success
235 // or failure of the fast path. If the fast path fails then we pass
236 // control to the slow path, typically in C. In fast_lock and
237 // fast_unlock we often branch to DONE_LABEL, just to find that C2
238 // will emit a conditional branch immediately after the node.
239 // So we have branches to branches and lots of ICC.ZF games.
240 // Instead, it might be better to have C2 pass a "FailureLabel"
241 // into fast_lock and fast_unlock. In the case of success, control
242 // will drop through the node. ICC.ZF is undefined at exit.
243 // In the case of failure, the node will branch directly to the
244 // FailureLabel
245
246
247 // obj: object to lock
248 // box: on-stack box address -- KILLED
249 // rax: tmp -- KILLED
250 // t : tmp -- KILLED
251 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register rax_reg,
252 Register t, Register thread) {
253 assert(rax_reg == rax, "Used for CAS");
254 assert_different_registers(obj, box, rax_reg, t, thread);
255
256 // Handle inflated monitor.
257 Label inflated;
258 // Finish fast lock successfully. ZF value is irrelevant.
259 Label locked;
260 // Finish fast lock unsuccessfully. MUST jump with ZF == 0
261 Label slow_path;
262
263 if (UseObjectMonitorTable) {
264 // Clear cache in case fast locking succeeds or we need to take the slow-path.
265 movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0);
266 }
267
268 if (DiagnoseSyncOnValueBasedClasses != 0) {
269 load_klass(rax_reg, obj, t);
270 testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
271 jcc(Assembler::notZero, slow_path);
272 }
273
274 const Register mark = t;
275
276 { // Lightweight Lock
277
278 Label push;
279
280 const Register top = UseObjectMonitorTable ? rax_reg : box;
281
282 // Load the mark.
283 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
284
285 // Prefetch top.
286 movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
287
288 // Check for monitor (0b10).
289 testptr(mark, markWord::monitor_value);
290 jcc(Assembler::notZero, inflated);
291
292 // Check if lock-stack is full.
293 cmpl(top, LockStack::end_offset() - 1);
294 jcc(Assembler::greater, slow_path);
295
296 // Check if recursive.
297 cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
298 jccb(Assembler::equal, push);
299
300 // Try to lock. Transition lock bits 0b01 => 0b00
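    // For the CAS below: rax holds the expected (unlocked) mark word, mark holds the desired (locked) value.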
301 movptr(rax_reg, mark);
302 orptr(rax_reg, markWord::unlocked_value);
303 andptr(mark, ~(int32_t)markWord::unlocked_value);
304 lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
305 jcc(Assembler::notEqual, slow_path);
306
307 if (UseObjectMonitorTable) {
308 // Need to reload top, clobbered by CAS.
309 movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
310 }
311 bind(push);
312 // After successful lock, push object on lock-stack.
313 movptr(Address(thread, top), obj);
314 addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
315 jmpb(locked);
316 }
317
318 { // Handle inflated monitor.
319 bind(inflated);
320
321 const Register monitor = t;
322
323 if (!UseObjectMonitorTable) {
324 assert(mark == monitor, "should be the same here");
325 } else {
326 // Uses ObjectMonitorTable. Look for the monitor in the om_cache.
327 // Fetch ObjectMonitor* from the cache or take the slow-path.
328 Label monitor_found;
329
330 // Load cache address
331 lea(t, Address(thread, JavaThread::om_cache_oops_offset()));
332
333 const int num_unrolled = 2;
334 for (int i = 0; i < num_unrolled; i++) {
335 cmpptr(obj, Address(t));
336 jccb(Assembler::equal, monitor_found);
337 increment(t, in_bytes(OMCache::oop_to_oop_difference()));
338 }
339
340 Label loop;
341
342 // Search for obj in cache.
343 bind(loop);
344
345 // Check for match.
346 cmpptr(obj, Address(t));
347 jccb(Assembler::equal, monitor_found);
348
349 // Search until null encountered, guaranteed _null_sentinel at end.
350 cmpptr(Address(t), 1);
351 jcc(Assembler::below, slow_path); // 0 check, but with ZF=0 when *t == 0
352 increment(t, in_bytes(OMCache::oop_to_oop_difference()));
353 jmpb(loop);
354
355 // Cache hit.
356 bind(monitor_found);
357 movptr(monitor, Address(t, OMCache::oop_to_monitor_difference()));
358 }
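    // Without the ObjectMonitorTable, 'monitor' is the mark word and still carries the
    // markWord::monitor_value tag, so subtract the tag when forming field addresses.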
359 const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
360 const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag);
361 const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag);
362
363 Label monitor_locked;
364 // Lock the monitor.
365
366 if (UseObjectMonitorTable) {
367 // Cache the monitor for unlock before trashing box. On failure to acquire
368 // the lock, the slow path will reset the entry accordingly (see CacheSetter).
369 movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor);
370 }
371
372 // Try to CAS owner (no owner => current thread's _monitor_owner_id).
373 xorptr(rax_reg, rax_reg);
374 movptr(box, Address(thread, JavaThread::monitor_owner_id_offset()));
375 lock(); cmpxchgptr(box, owner_address);
376 jccb(Assembler::equal, monitor_locked);
377
378 // Check if recursive.
379 cmpptr(box, rax_reg);
380 jccb(Assembler::notEqual, slow_path);
381
382 // Recursive.
383 increment(recursions_address);
384
385 bind(monitor_locked);
386 }
387
388 bind(locked);
389 // Set ZF = 1
390 xorl(rax_reg, rax_reg);
391
392 #ifdef ASSERT
393 // Check that locked label is reached with ZF set.
394 Label zf_correct;
395 Label zf_bad_zero;
396 jcc(Assembler::zero, zf_correct);
397 jmp(zf_bad_zero);
398 #endif
399
400 bind(slow_path);
401 #ifdef ASSERT
402 // Check that slow_path label is reached with ZF not set.
403 jcc(Assembler::notZero, zf_correct);
404 stop("Fast Lock ZF != 0");
405 bind(zf_bad_zero);
406 stop("Fast Lock ZF != 1");
407 bind(zf_correct);
408 #endif
409 // C2 uses the value of ZF to determine the continuation.
410 }
411
412 // obj: object to lock
413 // rax: tmp -- KILLED
414 // t : tmp - cannot be obj nor rax -- KILLED
415 //
416 // Some commentary on balanced locking:
417 //
418 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
419 // Methods that don't have provably balanced locking are forced to run in the
420 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
421 // The interpreter provides two properties:
422 // I1: At return-time the interpreter automatically and quietly unlocks any
423 // objects acquired in the current activation (frame). Recall that the
424 // interpreter maintains an on-stack list of locks currently held by
425 // a frame.
426 // I2: If a method attempts to unlock an object that is not held by the
427 // frame the interpreter throws IMSX.
428 //
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
430 // B() doesn't have provably balanced locking so it runs in the interpreter.
431 // Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
432 // is still locked by A().
433 //
434 // The only other source of unbalanced locking would be JNI. The "Java Native Interface
435 // Specification" states that an object locked by JNI's MonitorEnter should not be
436 // unlocked by "normal" java-level locking and vice-versa. The specification doesn't
437 // specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably, given that the spec legislates the JNI case as undefined, our implementation
// could reasonably *avoid* checking the owner in fast_unlock().
// In the interest of performance we elide the m->Owner==Self check in unlock.
441 // A perfectly viable alternative is to elide the owner check except when
442 // Xcheck:jni is enabled.
443
444 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register reg_rax, Register t, Register thread) {
445 assert(reg_rax == rax, "Used for CAS");
446 assert_different_registers(obj, reg_rax, t);
447
448 // Handle inflated monitor.
449 Label inflated, inflated_check_lock_stack;
450 // Finish fast unlock successfully. MUST jump with ZF == 1
451 Label unlocked, slow_path;
452
453 const Register mark = t;
454 const Register monitor = t;
455 const Register top = UseObjectMonitorTable ? t : reg_rax;
456 const Register box = reg_rax;
457
458 Label dummy;
459 C2FastUnlockLightweightStub* stub = nullptr;
460
461 if (!Compile::current()->output()->in_scratch_emit_size()) {
462 stub = new (Compile::current()->comp_arena()) C2FastUnlockLightweightStub(obj, mark, reg_rax, thread);
463 Compile::current()->output()->add_stub(stub);
464 }
465
466 Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();
467
468 { // Lightweight Unlock
469
470 // Load top.
471 movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
472
473 if (!UseObjectMonitorTable) {
474 // Prefetch mark.
475 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
476 }
477
478 // Check if obj is top of lock-stack.
479 cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
480 // Top of lock stack was not obj. Must be monitor.
481 jcc(Assembler::notEqual, inflated_check_lock_stack);
482
483 // Pop lock-stack.
484 DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
485 subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
486
487 // Check if recursive.
488 cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
489 jcc(Assembler::equal, unlocked);
490
491 // We elide the monitor check, let the CAS fail instead.
492
493 if (UseObjectMonitorTable) {
494 // Load mark.
495 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
496 }
497
498 // Try to unlock. Transition lock bits 0b00 => 0b01
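    // For the CAS below: rax holds the expected (locked) mark word, mark holds the desired (unlocked) value.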
499 movptr(reg_rax, mark);
500 andptr(reg_rax, ~(int32_t)markWord::lock_mask);
501 orptr(mark, markWord::unlocked_value);
502 lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
503 jcc(Assembler::notEqual, push_and_slow_path);
504 jmp(unlocked);
505 }
506
507
508 { // Handle inflated monitor.
509 bind(inflated_check_lock_stack);
510 #ifdef ASSERT
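    // Walk the lock stack and verify that obj does not appear on it: an inflated
    // monitor must not also be present as a fast-lock entry.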
511 Label check_done;
512 subl(top, oopSize);
513 cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
514 jcc(Assembler::below, check_done);
515 cmpptr(obj, Address(thread, top));
516 jccb(Assembler::notEqual, inflated_check_lock_stack);
517 stop("Fast Unlock lock on stack");
518 bind(check_done);
519 if (UseObjectMonitorTable) {
520 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
521 }
522 testptr(mark, markWord::monitor_value);
523 jccb(Assembler::notZero, inflated);
524 stop("Fast Unlock not monitor");
525 #endif
526
527 bind(inflated);
528
529 if (!UseObjectMonitorTable) {
530 assert(mark == monitor, "should be the same here");
531 } else {
532 // Uses ObjectMonitorTable. Look for the monitor in our BasicLock on the stack.
533 movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
534 // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*)
535 cmpptr(monitor, alignof(ObjectMonitor*));
536 jcc(Assembler::below, slow_path);
537 }
538 const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
539 const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag};
540 const Address succ_address{monitor, ObjectMonitor::succ_offset() - monitor_tag};
541 const Address entry_list_address{monitor, ObjectMonitor::entry_list_offset() - monitor_tag};
542 const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag};
543
544 Label recursive;
545
546 // Check if recursive.
547 cmpptr(recursions_address, 0);
548 jccb(Assembler::notZero, recursive);
549
550 // Set owner to null.
551 // Release to satisfy the JMM
552 movptr(owner_address, NULL_WORD);
553 // We need a full fence after clearing owner to avoid stranding.
554 // StoreLoad achieves this.
555 membar(StoreLoad);
556
557 // Check if the entry_list is empty.
558 cmpptr(entry_list_address, NULL_WORD);
559 jccb(Assembler::zero, unlocked); // If so we are done.
560
561 // Check if there is a successor.
562 cmpptr(succ_address, NULL_WORD);
563 jccb(Assembler::notZero, unlocked); // If so we are done.
564
565 // Save the monitor pointer in the current thread, so we can try to
566 // reacquire the lock in SharedRuntime::monitor_exit_helper().
567 if (!UseObjectMonitorTable) {
568 andptr(monitor, ~(int32_t)markWord::monitor_value);
569 }
570 movptr(Address(thread, JavaThread::unlocked_inflated_monitor_offset()), monitor);
571
572 orl(t, 1); // Fast Unlock ZF = 0
573 jmpb(slow_path);
574
575 // Recursive unlock.
576 bind(recursive);
577 decrement(recursions_address);
578 }
579
580 bind(unlocked);
581 xorl(t, t); // Fast Unlock ZF = 1
582
583 #ifdef ASSERT
584 // Check that unlocked label is reached with ZF set.
585 Label zf_correct;
586 Label zf_bad_zero;
587 jcc(Assembler::zero, zf_correct);
588 jmp(zf_bad_zero);
589 #endif
590
591 bind(slow_path);
592 if (stub != nullptr) {
593 bind(stub->slow_path_continuation());
594 }
595 #ifdef ASSERT
596 // Check that stub->continuation() label is reached with ZF not set.
597 jcc(Assembler::notZero, zf_correct);
598 stop("Fast Unlock ZF != 0");
599 bind(zf_bad_zero);
600 stop("Fast Unlock ZF != 1");
601 bind(zf_correct);
602 #endif
603 // C2 uses the value of ZF to determine the continuation.
604 }
605
606 static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) {
607 fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi);
608 }
609
610 static void reconstruct_frame_pointer_helper(MacroAssembler* masm, Register dst) {
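  // Recompute the frame pointer established by verified_entry(): rsp plus the fixed
  // frame size, minus the two slots for the return address and the saved rbp.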
611 const int framesize = Compile::current()->output()->frame_size_in_bytes();
612 masm->movptr(dst, rsp);
613 if (framesize > 2 * wordSize) {
614 masm->addptr(dst, framesize - 2 * wordSize);
615 }
616 }
617
618 void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) {
619 if (PreserveFramePointer) {
620 // frame pointer is valid
621 #ifdef ASSERT
622 // Verify frame pointer value in rbp.
623 reconstruct_frame_pointer_helper(this, rtmp);
624 Label L_success;
625 cmpq(rbp, rtmp);
626 jccb(Assembler::equal, L_success);
627 STOP("frame pointer mismatch");
628 bind(L_success);
629 #endif // ASSERT
630 } else {
631 reconstruct_frame_pointer_helper(this, rbp);
632 }
633 }
634
635 void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register val) {
636 jint lo = t->_lo;
637 jint hi = t->_hi;
638 assert(lo < hi, "type should not be empty or constant, idx: %u, lo: %d, hi: %d", idx, lo, hi);
639 if (t == TypeInt::INT) {
640 return;
641 }
642
643 BLOCK_COMMENT("CastII {");
644 Label fail;
645 Label succeed;
646 if (hi == max_jint) {
647 cmpl(val, lo);
648 jccb(Assembler::greaterEqual, succeed);
649 } else {
650 if (lo != min_jint) {
651 cmpl(val, lo);
652 jccb(Assembler::less, fail);
653 }
654 cmpl(val, hi);
655 jccb(Assembler::lessEqual, succeed);
656 }
657
658 bind(fail);
659 movl(c_rarg0, idx);
660 movl(c_rarg1, val);
661 movl(c_rarg2, lo);
662 movl(c_rarg3, hi);
663 reconstruct_frame_pointer(rscratch1);
664 call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_int_in_range)));
665 hlt();
666 bind(succeed);
667 BLOCK_COMMENT("} // CastII");
668 }
669
670 static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) {
671 fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi);
672 }
673
674 void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register val, Register tmp) {
675 jlong lo = t->_lo;
676 jlong hi = t->_hi;
677 assert(lo < hi, "type should not be empty or constant, idx: %u, lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, lo, hi);
678 if (t == TypeLong::LONG) {
679 return;
680 }
681
682 BLOCK_COMMENT("CastLL {");
683 Label fail;
684 Label succeed;
685
686 auto cmp_val = [&](jlong bound) {
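    // Compare val against a 64-bit bound: use an immediate when the bound fits in a
    // sign-extended 32-bit field, otherwise materialize the bound in tmp first.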
687 if (is_simm32(bound)) {
688 cmpq(val, checked_cast<int>(bound));
689 } else {
690 mov64(tmp, bound);
691 cmpq(val, tmp);
692 }
693 };
694
695 if (hi == max_jlong) {
696 cmp_val(lo);
697 jccb(Assembler::greaterEqual, succeed);
698 } else {
699 if (lo != min_jlong) {
700 cmp_val(lo);
701 jccb(Assembler::less, fail);
702 }
703 cmp_val(hi);
704 jccb(Assembler::lessEqual, succeed);
705 }
706
707 bind(fail);
708 movl(c_rarg0, idx);
709 movq(c_rarg1, val);
710 mov64(c_rarg2, lo);
711 mov64(c_rarg3, hi);
712 reconstruct_frame_pointer(rscratch1);
713 call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_long_in_range)));
714 hlt();
715 bind(succeed);
716 BLOCK_COMMENT("} // CastLL");
717 }
718
719 //-------------------------------------------------------------------------------------------
720 // Generic instructions support for use in .ad files C2 code generation
721
722 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
723 if (dst != src) {
724 movdqu(dst, src);
725 }
726 if (opcode == Op_AbsVD) {
727 andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
728 } else {
    assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
730 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
731 }
732 }
733
734 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
735 if (opcode == Op_AbsVD) {
736 vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
737 } else {
    assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
739 vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
740 }
741 }
742
743 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
744 if (dst != src) {
745 movdqu(dst, src);
746 }
747 if (opcode == Op_AbsVF) {
748 andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
749 } else {
    assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
751 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
752 }
753 }
754
755 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
756 if (opcode == Op_AbsVF) {
757 vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
758 } else {
    assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
760 vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
761 }
762 }
763
764 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
765 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
766 assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
767
768 if (opcode == Op_MinV) {
769 if (elem_bt == T_BYTE) {
770 pminsb(dst, src);
771 } else if (elem_bt == T_SHORT) {
772 pminsw(dst, src);
773 } else if (elem_bt == T_INT) {
774 pminsd(dst, src);
775 } else {
776 assert(elem_bt == T_LONG, "required");
777 assert(tmp == xmm0, "required");
778 assert_different_registers(dst, src, tmp);
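      // There is no pminsq before AVX-512: build the (dst > src) mask in xmm0, which the
      // non-VEX blendvpd uses implicitly, then take src wherever dst is greater.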
779 movdqu(xmm0, dst);
780 pcmpgtq(xmm0, src);
781 blendvpd(dst, src); // xmm0 as mask
782 }
783 } else { // opcode == Op_MaxV
784 if (elem_bt == T_BYTE) {
785 pmaxsb(dst, src);
786 } else if (elem_bt == T_SHORT) {
787 pmaxsw(dst, src);
788 } else if (elem_bt == T_INT) {
789 pmaxsd(dst, src);
790 } else {
791 assert(elem_bt == T_LONG, "required");
792 assert(tmp == xmm0, "required");
793 assert_different_registers(dst, src, tmp);
794 movdqu(xmm0, src);
795 pcmpgtq(xmm0, dst);
796 blendvpd(dst, src); // xmm0 as mask
797 }
798 }
799 }
800
801 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
802 XMMRegister src1, Address src2, int vlen_enc) {
803 assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
804 if (opcode == Op_UMinV) {
805 switch(elem_bt) {
806 case T_BYTE: vpminub(dst, src1, src2, vlen_enc); break;
807 case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
808 case T_INT: vpminud(dst, src1, src2, vlen_enc); break;
809 case T_LONG: evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
810 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
811 }
812 } else {
813 assert(opcode == Op_UMaxV, "required");
814 switch(elem_bt) {
815 case T_BYTE: vpmaxub(dst, src1, src2, vlen_enc); break;
816 case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
817 case T_INT: vpmaxud(dst, src1, src2, vlen_enc); break;
818 case T_LONG: evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
819 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
820 }
821 }
822 }
823
824 void C2_MacroAssembler::vpuminmaxq(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
825 // For optimality, leverage a full vector width of 512 bits
826 // for operations over smaller vector sizes on AVX512 targets.
827 if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
828 if (opcode == Op_UMaxV) {
829 evpmaxuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
830 } else {
831 assert(opcode == Op_UMinV, "required");
832 evpminuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
833 }
834 } else {
835 // T1 = -1
836 vpcmpeqq(xtmp1, xtmp1, xtmp1, vlen_enc);
837 // T1 = -1 << 63
838 vpsllq(xtmp1, xtmp1, 63, vlen_enc);
839 // Convert SRC2 to signed value i.e. T2 = T1 + SRC2
840 vpaddq(xtmp2, xtmp1, src2, vlen_enc);
841 // Convert SRC1 to signed value i.e. T1 = T1 + SRC1
842 vpaddq(xtmp1, xtmp1, src1, vlen_enc);
843 // Mask = T2 > T1
844 vpcmpgtq(xtmp1, xtmp2, xtmp1, vlen_enc);
845 if (opcode == Op_UMaxV) {
846 // Res = Mask ? Src2 : Src1
847 vpblendvb(dst, src1, src2, xtmp1, vlen_enc);
848 } else {
849 // Res = Mask ? Src1 : Src2
850 vpblendvb(dst, src2, src1, xtmp1, vlen_enc);
851 }
852 }
853 }
854
855 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
856 XMMRegister src1, XMMRegister src2, int vlen_enc) {
857 assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
858 if (opcode == Op_UMinV) {
859 switch(elem_bt) {
860 case T_BYTE: vpminub(dst, src1, src2, vlen_enc); break;
861 case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
862 case T_INT: vpminud(dst, src1, src2, vlen_enc); break;
863 case T_LONG: evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
864 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
865 }
866 } else {
867 assert(opcode == Op_UMaxV, "required");
868 switch(elem_bt) {
869 case T_BYTE: vpmaxub(dst, src1, src2, vlen_enc); break;
870 case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
871 case T_INT: vpmaxud(dst, src1, src2, vlen_enc); break;
872 case T_LONG: evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
873 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
874 }
875 }
876 }
877
878 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
879 XMMRegister dst, XMMRegister src1, XMMRegister src2,
880 int vlen_enc) {
881 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
882
883 if (opcode == Op_MinV) {
884 if (elem_bt == T_BYTE) {
885 vpminsb(dst, src1, src2, vlen_enc);
886 } else if (elem_bt == T_SHORT) {
887 vpminsw(dst, src1, src2, vlen_enc);
888 } else if (elem_bt == T_INT) {
889 vpminsd(dst, src1, src2, vlen_enc);
890 } else {
891 assert(elem_bt == T_LONG, "required");
892 if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
893 vpminsq(dst, src1, src2, vlen_enc);
894 } else {
895 assert_different_registers(dst, src1, src2);
896 vpcmpgtq(dst, src1, src2, vlen_enc);
897 vblendvpd(dst, src1, src2, dst, vlen_enc);
898 }
899 }
900 } else { // opcode == Op_MaxV
901 if (elem_bt == T_BYTE) {
902 vpmaxsb(dst, src1, src2, vlen_enc);
903 } else if (elem_bt == T_SHORT) {
904 vpmaxsw(dst, src1, src2, vlen_enc);
905 } else if (elem_bt == T_INT) {
906 vpmaxsd(dst, src1, src2, vlen_enc);
907 } else {
908 assert(elem_bt == T_LONG, "required");
909 if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
910 vpmaxsq(dst, src1, src2, vlen_enc);
911 } else {
912 assert_different_registers(dst, src1, src2);
913 vpcmpgtq(dst, src1, src2, vlen_enc);
914 vblendvpd(dst, src2, src1, dst, vlen_enc);
915 }
916 }
917 }
918 }
919
920 // Float/Double min max
921
922 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
923 XMMRegister dst, XMMRegister a, XMMRegister b,
924 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
925 int vlen_enc) {
926 assert(UseAVX > 0, "required");
927 assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
928 opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
929 assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
930 assert_different_registers(a, tmp, atmp, btmp);
931 assert_different_registers(b, tmp, atmp, btmp);
932
933 bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
934 bool is_double_word = is_double_word_type(elem_bt);
935
936 /* Note on 'non-obvious' assembly sequence:
937 *
938 * While there are vminps/vmaxps instructions, there are two important differences between hardware
939 * and Java on how they handle floats:
940 * a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
   *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
942 *
943 * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
944 * a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
945 * (only useful when signs differ, noop otherwise)
946 * b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
   *
   * The following pseudocode describes the algorithm for max[FD] (the min algorithm is along similar lines):
949 * btmp = (b < +0.0) ? a : b
950 * atmp = (b < +0.0) ? b : a
951 * Tmp = Max_Float(atmp , btmp)
952 * Res = (atmp == NaN) ? atmp : Tmp
953 */
954
955 void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
956 void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
957 void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
958 XMMRegister mask;
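  // The pre-blend mask comes from the sign of 'a' for min and of 'b' for max,
  // matching the max[FD] pseudocode above.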
959
960 if (!is_double_word && is_min) {
961 mask = a;
962 vblend = &MacroAssembler::vblendvps;
963 vmaxmin = &MacroAssembler::vminps;
964 vcmp = &MacroAssembler::vcmpps;
965 } else if (!is_double_word && !is_min) {
966 mask = b;
967 vblend = &MacroAssembler::vblendvps;
968 vmaxmin = &MacroAssembler::vmaxps;
969 vcmp = &MacroAssembler::vcmpps;
970 } else if (is_double_word && is_min) {
971 mask = a;
972 vblend = &MacroAssembler::vblendvpd;
973 vmaxmin = &MacroAssembler::vminpd;
974 vcmp = &MacroAssembler::vcmppd;
975 } else {
976 assert(is_double_word && !is_min, "sanity");
977 mask = b;
978 vblend = &MacroAssembler::vblendvpd;
979 vmaxmin = &MacroAssembler::vmaxpd;
980 vcmp = &MacroAssembler::vcmppd;
981 }
982
  // Pick the temporaries so that the EnableX86ECoreOpts sequence below still works when dst overlaps btmp.
984 XMMRegister maxmin, scratch;
985 if (dst == btmp) {
986 maxmin = btmp;
987 scratch = tmp;
988 } else {
989 maxmin = tmp;
990 scratch = btmp;
991 }
992
  bool precompute_mask = EnableX86ECoreOpts && UseAVX > 1;
994 if (precompute_mask && !is_double_word) {
995 vpsrad(tmp, mask, 32, vlen_enc);
996 mask = tmp;
997 } else if (precompute_mask && is_double_word) {
998 vpxor(tmp, tmp, tmp, vlen_enc);
999 vpcmpgtq(tmp, tmp, mask, vlen_enc);
1000 mask = tmp;
1001 }
1002
1003 (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
1004 (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
1005 (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
1006 (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1007 (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
1008 }
1009
1010 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
1011 XMMRegister dst, XMMRegister a, XMMRegister b,
1012 KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
1013 int vlen_enc) {
1014 assert(UseAVX > 2, "required");
1015 assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1016 opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1017 assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1018 assert_different_registers(dst, a, atmp, btmp);
1019 assert_different_registers(dst, b, atmp, btmp);
1020
1021 bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1022 bool is_double_word = is_double_word_type(elem_bt);
1023 bool merge = true;
1024
1025 if (!is_double_word && is_min) {
1026 evpmovd2m(ktmp, a, vlen_enc);
1027 evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1028 evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1029 vminps(dst, atmp, btmp, vlen_enc);
1030 evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1031 evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1032 } else if (!is_double_word && !is_min) {
1033 evpmovd2m(ktmp, b, vlen_enc);
1034 evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1035 evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1036 vmaxps(dst, atmp, btmp, vlen_enc);
1037 evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1038 evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1039 } else if (is_double_word && is_min) {
1040 evpmovq2m(ktmp, a, vlen_enc);
1041 evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1042 evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1043 vminpd(dst, atmp, btmp, vlen_enc);
1044 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1045 evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1046 } else {
1047 assert(is_double_word && !is_min, "sanity");
1048 evpmovq2m(ktmp, b, vlen_enc);
1049 evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1050 evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1051 vmaxpd(dst, atmp, btmp, vlen_enc);
1052 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1053 evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1054 }
1055 }
1056
1057 void C2_MacroAssembler::vminmax_fp(int opc, BasicType elem_bt, XMMRegister dst, KRegister mask,
1058 XMMRegister src1, XMMRegister src2, int vlen_enc) {
1059 assert(opc == Op_MinV || opc == Op_MinReductionV ||
1060 opc == Op_MaxV || opc == Op_MaxReductionV, "sanity");
1061
1062 int imm8 = (opc == Op_MinV || opc == Op_MinReductionV) ? AVX10_MINMAX_MIN_COMPARE_SIGN
1063 : AVX10_MINMAX_MAX_COMPARE_SIGN;
1064 if (elem_bt == T_FLOAT) {
1065 evminmaxps(dst, mask, src1, src2, true, imm8, vlen_enc);
1066 } else {
1067 assert(elem_bt == T_DOUBLE, "");
1068 evminmaxpd(dst, mask, src1, src2, true, imm8, vlen_enc);
1069 }
1070 }
1071
1072 // Float/Double signum
1073 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
1074 assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");
1075
1076 Label DONE_LABEL;
1077
1078 if (opcode == Op_SignumF) {
1079 ucomiss(dst, zero);
1080 jcc(Assembler::equal, DONE_LABEL); // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1081 jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN
1082 movflt(dst, one);
1083 jcc(Assembler::above, DONE_LABEL);
1084 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
1085 } else if (opcode == Op_SignumD) {
1086 ucomisd(dst, zero);
1087 jcc(Assembler::equal, DONE_LABEL); // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1088 jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN
1089 movdbl(dst, one);
1090 jcc(Assembler::above, DONE_LABEL);
1091 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
1092 }
1093
1094 bind(DONE_LABEL);
1095 }
1096
1097 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1098 if (sign) {
1099 pmovsxbw(dst, src);
1100 } else {
1101 pmovzxbw(dst, src);
1102 }
1103 }
1104
1105 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1106 if (sign) {
1107 vpmovsxbw(dst, src, vector_len);
1108 } else {
1109 vpmovzxbw(dst, src, vector_len);
1110 }
1111 }
1112
1113 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1114 if (sign) {
1115 vpmovsxbd(dst, src, vector_len);
1116 } else {
1117 vpmovzxbd(dst, src, vector_len);
1118 }
1119 }
1120
1121 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1122 if (sign) {
1123 vpmovsxwd(dst, src, vector_len);
1124 } else {
1125 vpmovzxwd(dst, src, vector_len);
1126 }
1127 }
1128
1129 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1130 int shift, int vector_len) {
1131 if (opcode == Op_RotateLeftV) {
1132 if (etype == T_INT) {
1133 evprold(dst, src, shift, vector_len);
1134 } else {
1135 assert(etype == T_LONG, "expected type T_LONG");
1136 evprolq(dst, src, shift, vector_len);
1137 }
1138 } else {
1139 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1140 if (etype == T_INT) {
1141 evprord(dst, src, shift, vector_len);
1142 } else {
1143 assert(etype == T_LONG, "expected type T_LONG");
1144 evprorq(dst, src, shift, vector_len);
1145 }
1146 }
1147 }
1148
1149 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1150 XMMRegister shift, int vector_len) {
1151 if (opcode == Op_RotateLeftV) {
1152 if (etype == T_INT) {
1153 evprolvd(dst, src, shift, vector_len);
1154 } else {
1155 assert(etype == T_LONG, "expected type T_LONG");
1156 evprolvq(dst, src, shift, vector_len);
1157 }
1158 } else {
1159 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1160 if (etype == T_INT) {
1161 evprorvd(dst, src, shift, vector_len);
1162 } else {
1163 assert(etype == T_LONG, "expected type T_LONG");
1164 evprorvq(dst, src, shift, vector_len);
1165 }
1166 }
1167 }
1168
1169 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1170 if (opcode == Op_RShiftVI) {
1171 psrad(dst, shift);
1172 } else if (opcode == Op_LShiftVI) {
1173 pslld(dst, shift);
1174 } else {
1175 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1176 psrld(dst, shift);
1177 }
1178 }
1179
1180 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1181 switch (opcode) {
1182 case Op_RShiftVI: psrad(dst, shift); break;
1183 case Op_LShiftVI: pslld(dst, shift); break;
1184 case Op_URShiftVI: psrld(dst, shift); break;
1185
1186 default: assert(false, "%s", NodeClassNames[opcode]);
1187 }
1188 }
1189
1190 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1191 if (opcode == Op_RShiftVI) {
1192 vpsrad(dst, nds, shift, vector_len);
1193 } else if (opcode == Op_LShiftVI) {
1194 vpslld(dst, nds, shift, vector_len);
1195 } else {
1196 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1197 vpsrld(dst, nds, shift, vector_len);
1198 }
1199 }
1200
1201 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1202 switch (opcode) {
1203 case Op_RShiftVI: vpsrad(dst, src, shift, vlen_enc); break;
1204 case Op_LShiftVI: vpslld(dst, src, shift, vlen_enc); break;
1205 case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1206
1207 default: assert(false, "%s", NodeClassNames[opcode]);
1208 }
1209 }
1210
1211 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1212 switch (opcode) {
1213 case Op_RShiftVB: // fall-through
1214 case Op_RShiftVS: psraw(dst, shift); break;
1215
1216 case Op_LShiftVB: // fall-through
1217 case Op_LShiftVS: psllw(dst, shift); break;
1218
1219 case Op_URShiftVS: // fall-through
1220 case Op_URShiftVB: psrlw(dst, shift); break;
1221
1222 default: assert(false, "%s", NodeClassNames[opcode]);
1223 }
1224 }
1225
1226 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1227 switch (opcode) {
1228 case Op_RShiftVB: // fall-through
1229 case Op_RShiftVS: vpsraw(dst, src, shift, vlen_enc); break;
1230
1231 case Op_LShiftVB: // fall-through
1232 case Op_LShiftVS: vpsllw(dst, src, shift, vlen_enc); break;
1233
1234 case Op_URShiftVS: // fall-through
1235 case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1236
1237 default: assert(false, "%s", NodeClassNames[opcode]);
1238 }
1239 }
1240
1241 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1242 switch (opcode) {
    case Op_RShiftVL: psrlq(dst, shift); break; // using srl to implement sra on pre-AVX-512 systems
1244 case Op_LShiftVL: psllq(dst, shift); break;
1245 case Op_URShiftVL: psrlq(dst, shift); break;
1246
1247 default: assert(false, "%s", NodeClassNames[opcode]);
1248 }
1249 }
1250
1251 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1252 if (opcode == Op_RShiftVL) {
    psrlq(dst, shift); // using srl to implement sra on pre-AVX-512 systems
1254 } else if (opcode == Op_LShiftVL) {
1255 psllq(dst, shift);
1256 } else {
1257 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1258 psrlq(dst, shift);
1259 }
1260 }
1261
1262 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1263 switch (opcode) {
1264 case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1265 case Op_LShiftVL: vpsllq(dst, src, shift, vlen_enc); break;
1266 case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1267
1268 default: assert(false, "%s", NodeClassNames[opcode]);
1269 }
1270 }
1271
1272 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1273 if (opcode == Op_RShiftVL) {
1274 evpsraq(dst, nds, shift, vector_len);
1275 } else if (opcode == Op_LShiftVL) {
1276 vpsllq(dst, nds, shift, vector_len);
1277 } else {
1278 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1279 vpsrlq(dst, nds, shift, vector_len);
1280 }
1281 }
1282
1283 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1284 switch (opcode) {
1285 case Op_RShiftVB: // fall-through
1286 case Op_RShiftVS: // fall-through
1287 case Op_RShiftVI: vpsravd(dst, src, shift, vlen_enc); break;
1288
1289 case Op_LShiftVB: // fall-through
1290 case Op_LShiftVS: // fall-through
1291 case Op_LShiftVI: vpsllvd(dst, src, shift, vlen_enc); break;
1292
1293 case Op_URShiftVB: // fall-through
1294 case Op_URShiftVS: // fall-through
1295 case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1296
1297 default: assert(false, "%s", NodeClassNames[opcode]);
1298 }
1299 }
1300
1301 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1302 switch (opcode) {
1303 case Op_RShiftVB: // fall-through
1304 case Op_RShiftVS: evpsravw(dst, src, shift, vlen_enc); break;
1305
1306 case Op_LShiftVB: // fall-through
1307 case Op_LShiftVS: evpsllvw(dst, src, shift, vlen_enc); break;
1308
1309 case Op_URShiftVB: // fall-through
1310 case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1311
1312 default: assert(false, "%s", NodeClassNames[opcode]);
1313 }
1314 }
1315
1316 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1317 assert(UseAVX >= 2, "required");
1318 switch (opcode) {
1319 case Op_RShiftVL: {
1320 if (UseAVX > 2) {
1321 assert(tmp == xnoreg, "not used");
1322 if (!VM_Version::supports_avx512vl()) {
1323 vlen_enc = Assembler::AVX_512bit;
1324 }
1325 evpsravq(dst, src, shift, vlen_enc);
1326 } else {
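        // AVX2 has no variable 64-bit arithmetic right shift; emulate it as
        // ((x >>> s) ^ m) - m, where m is the per-lane sign mask shifted right by s.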
1327 vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1328 vpsrlvq(dst, src, shift, vlen_enc);
1329 vpsrlvq(tmp, tmp, shift, vlen_enc);
1330 vpxor(dst, dst, tmp, vlen_enc);
1331 vpsubq(dst, dst, tmp, vlen_enc);
1332 }
1333 break;
1334 }
1335 case Op_LShiftVL: {
1336 assert(tmp == xnoreg, "not used");
1337 vpsllvq(dst, src, shift, vlen_enc);
1338 break;
1339 }
1340 case Op_URShiftVL: {
1341 assert(tmp == xnoreg, "not used");
1342 vpsrlvq(dst, src, shift, vlen_enc);
1343 break;
1344 }
1345 default: assert(false, "%s", NodeClassNames[opcode]);
1346 }
1347 }
1348
// Variable shift src by shift using vtmp as a TEMP, giving a word result in dst
1350 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1351 assert(opcode == Op_LShiftVB ||
1352 opcode == Op_RShiftVB ||
1353 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1354 bool sign = (opcode != Op_URShiftVB);
1355 assert(vector_len == 0, "required");
1356 vextendbd(sign, dst, src, 1);
1357 vpmovzxbd(vtmp, shift, 1);
1358 varshiftd(opcode, dst, dst, vtmp, 1);
1359 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
1360 vextracti128_high(vtmp, dst);
1361 vpackusdw(dst, dst, vtmp, 0);
1362 }
1363
// Variable shift src by shift using vtmp as a TEMP, giving a byte result in dst
1365 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1366 assert(opcode == Op_LShiftVB ||
1367 opcode == Op_RShiftVB ||
1368 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1369 bool sign = (opcode != Op_URShiftVB);
1370 int ext_vector_len = vector_len + 1;
1371 vextendbw(sign, dst, src, ext_vector_len);
1372 vpmovzxbw(vtmp, shift, ext_vector_len);
1373 varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1374 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
1375 if (vector_len == 0) {
1376 vextracti128_high(vtmp, dst);
1377 vpackuswb(dst, dst, vtmp, vector_len);
1378 } else {
1379 vextracti64x4_high(vtmp, dst);
1380 vpackuswb(dst, dst, vtmp, vector_len);
1381 vpermq(dst, dst, 0xD8, vector_len);
1382 }
1383 }
1384
1385 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1386 switch(typ) {
1387 case T_BYTE:
1388 pinsrb(dst, val, idx);
1389 break;
1390 case T_SHORT:
1391 pinsrw(dst, val, idx);
1392 break;
1393 case T_INT:
1394 pinsrd(dst, val, idx);
1395 break;
1396 case T_LONG:
1397 pinsrq(dst, val, idx);
1398 break;
1399 default:
1400 assert(false,"Should not reach here.");
1401 break;
1402 }
1403 }
1404
1405 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1406 switch(typ) {
1407 case T_BYTE:
1408 vpinsrb(dst, src, val, idx);
1409 break;
1410 case T_SHORT:
1411 vpinsrw(dst, src, val, idx);
1412 break;
1413 case T_INT:
1414 vpinsrd(dst, src, val, idx);
1415 break;
1416 case T_LONG:
1417 vpinsrq(dst, src, val, idx);
1418 break;
1419 default:
1420 assert(false,"Should not reach here.");
1421 break;
1422 }
1423 }
1424
1425 void C2_MacroAssembler::vgather8b_masked(BasicType elem_bt, XMMRegister dst,
1426 Register base, Register idx_base,
1427 Register mask, Register mask_idx,
1428 Register rtmp, int vlen_enc) {
1429 vpxor(dst, dst, dst, vlen_enc);
1430 if (elem_bt == T_SHORT) {
1431 for (int i = 0; i < 4; i++) {
1432 // dst[i] = mask[i] ? src[idx_base[i]] : 0
1433 Label skip_load;
1434 btq(mask, mask_idx);
1435 jccb(Assembler::carryClear, skip_load);
1436 movl(rtmp, Address(idx_base, i * 4));
1437 pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1438 bind(skip_load);
1439 incq(mask_idx);
1440 }
1441 } else {
1442 assert(elem_bt == T_BYTE, "");
1443 for (int i = 0; i < 8; i++) {
1444 // dst[i] = mask[i] ? src[idx_base[i]] : 0
1445 Label skip_load;
1446 btq(mask, mask_idx);
1447 jccb(Assembler::carryClear, skip_load);
1448 movl(rtmp, Address(idx_base, i * 4));
1449 pinsrb(dst, Address(base, rtmp), i);
1450 bind(skip_load);
1451 incq(mask_idx);
1452 }
1453 }
1454 }
1455
1456 void C2_MacroAssembler::vgather8b(BasicType elem_bt, XMMRegister dst,
1457 Register base, Register idx_base,
1458 Register rtmp, int vlen_enc) {
1459 vpxor(dst, dst, dst, vlen_enc);
1460 if (elem_bt == T_SHORT) {
1461 for (int i = 0; i < 4; i++) {
1462 // dst[i] = src[idx_base[i]]
1463 movl(rtmp, Address(idx_base, i * 4));
1464 pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1465 }
1466 } else {
1467 assert(elem_bt == T_BYTE, "");
1468 for (int i = 0; i < 8; i++) {
1469 // dst[i] = src[idx_base[i]]
1470 movl(rtmp, Address(idx_base, i * 4));
1471 pinsrb(dst, Address(base, rtmp), i);
1472 }
1473 }
1474 }
1475
1476 /*
 * Gather using a hybrid algorithm: first partially unroll a scalar loop
 * to accumulate values from the gather indices into a quad-word (64-bit) slice.
 * A slice may hold 8 bytes or 4 short values. This is followed by a vector
 * permutation to place the slice into the appropriate vector lane
 * locations in the destination vector. The following pseudocode describes the
 * algorithm in detail:
1483 *
1484 * DST_VEC = ZERO_VEC
1485 * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..}
1486 * TWO_VEC = {2, 2, 2, 2, 2, 2, 2, 2, 2..}
1487 * FOREACH_ITER:
1488 * TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES
1489 * TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX
1490 * DST_VEC = DST_VEC OR TEMP_PERM_VEC
1491 * PERM_INDEX = PERM_INDEX - TWO_VEC
1492 *
 * With each iteration, the doubleword permute indices (0,1) corresponding
 * to the gathered quadword get shifted right by two lane positions.
1495 *
1496 */
1497 void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst,
1498 Register base, Register idx_base,
1499 Register mask, XMMRegister xtmp1,
1500 XMMRegister xtmp2, XMMRegister temp_dst,
1501 Register rtmp, Register mask_idx,
1502 Register length, int vector_len, int vlen_enc) {
1503 Label GATHER8_LOOP;
1504 assert(is_subword_type(elem_ty), "");
1505 movl(length, vector_len);
1506 vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...}
1507 vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...}
1508 vallones(xtmp2, vlen_enc);
1509 vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc);
1510 vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...}
1511 load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...}
1512
1513 bind(GATHER8_LOOP);
1514 // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES
1515 if (mask == noreg) {
1516 vgather8b(elem_ty, temp_dst, base, idx_base, rtmp, vlen_enc);
1517 } else {
1518 vgather8b_masked(elem_ty, temp_dst, base, idx_base, mask, mask_idx, rtmp, vlen_enc);
1519 }
1520 // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1)
1521 vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? vlen_enc : Assembler::AVX_256bit);
1522 // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2)
1523 vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc);
1524 // DST_VEC = DST_VEC OR TEMP_PERM_VEC
1525 vpor(dst, dst, temp_dst, vlen_enc);
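  // Advance past the consumed gather indices (4 bytes each: 8 for byte, 4 for short elements)
  // and decrement the remaining element count by the slice size.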
1526 addptr(idx_base, 32 >> (type2aelembytes(elem_ty) - 1));
1527 subl(length, 8 >> (type2aelembytes(elem_ty) - 1));
1528 jcc(Assembler::notEqual, GATHER8_LOOP);
1529 }
1530
1531 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1532 switch(typ) {
1533 case T_INT:
1534 vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1535 break;
1536 case T_FLOAT:
1537 vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1538 break;
1539 case T_LONG:
1540 vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1541 break;
1542 case T_DOUBLE:
1543 vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1544 break;
1545 default:
1546 assert(false,"Should not reach here.");
1547 break;
1548 }
1549 }
1550
1551 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1552 switch(typ) {
1553 case T_INT:
1554 evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1555 break;
1556 case T_FLOAT:
1557 evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1558 break;
1559 case T_LONG:
1560 evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1561 break;
1562 case T_DOUBLE:
1563 evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1564 break;
1565 default:
1566 assert(false,"Should not reach here.");
1567 break;
1568 }
1569 }
1570
1571 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1572 switch(typ) {
1573 case T_INT:
1574 evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1575 break;
1576 case T_FLOAT:
1577 evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1578 break;
1579 case T_LONG:
1580 evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1581 break;
1582 case T_DOUBLE:
1583 evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1584 break;
1585 default:
1586 assert(false,"Should not reach here.");
1587 break;
1588 }
1589 }
1590
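// Expand a boolean vector (one byte per element, 0 or 1) into a vector mask with
// all bits of each true lane set: 0 - src yields 0 or -1 per byte, which is then
// sign-extended to the element width of elem_bt.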
1591 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
1592 if (vlen_in_bytes <= 16) {
1593 pxor (dst, dst);
1594 psubb(dst, src);
1595 switch (elem_bt) {
1596 case T_BYTE: /* nothing to do */ break;
1597 case T_SHORT: pmovsxbw(dst, dst); break;
1598 case T_INT: pmovsxbd(dst, dst); break;
1599 case T_FLOAT: pmovsxbd(dst, dst); break;
1600 case T_LONG: pmovsxbq(dst, dst); break;
1601 case T_DOUBLE: pmovsxbq(dst, dst); break;
1602
1603 default: assert(false, "%s", type2name(elem_bt));
1604 }
1605 } else {
1606 assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
1607 int vlen_enc = vector_length_encoding(vlen_in_bytes);
1608
1609 vpxor (dst, dst, dst, vlen_enc);
1610 vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
1611
1612 switch (elem_bt) {
1613 case T_BYTE: /* nothing to do */ break;
1614 case T_SHORT: vpmovsxbw(dst, dst, vlen_enc); break;
1615 case T_INT: vpmovsxbd(dst, dst, vlen_enc); break;
1616 case T_FLOAT: vpmovsxbd(dst, dst, vlen_enc); break;
1617 case T_LONG: vpmovsxbq(dst, dst, vlen_enc); break;
1618 case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1619
1620 default: assert(false, "%s", type2name(elem_bt));
1621 }
1622 }
1623 }
1624
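// As above, but producing an opmask register. When 'novlbwdq' is set (suggesting
// the relevant AVX-512 VL/BW/DQ subfeatures are unavailable), the booleans are
// widened to dwords and compared against a stub-provided pattern; otherwise the
// negated bytes are converted directly with evpmovb2m.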
1625 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
1626 if (novlbwdq) {
1627 vpmovsxbd(xtmp, src, vlen_enc);
1628 evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
1629 Assembler::eq, true, vlen_enc, noreg);
1630 } else {
1631 vpxor(xtmp, xtmp, xtmp, vlen_enc);
1632 vpsubb(xtmp, xtmp, src, vlen_enc);
1633 evpmovb2m(dst, xtmp, vlen_enc);
1634 }
1635 }
1636
1637 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, Address src, int vlen_in_bytes) {
1638 if (is_integral_type(bt)) {
1639 switch (vlen_in_bytes) {
1640 case 4: movdl(dst, src); break;
1641 case 8: movq(dst, src); break;
1642 case 16: movdqu(dst, src); break;
1643 case 32: vmovdqu(dst, src); break;
1644 case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
1645 default: ShouldNotReachHere();
1646 }
1647 } else {
1648 switch (vlen_in_bytes) {
1649 case 4: movflt(dst, src); break;
1650 case 8: movdbl(dst, src); break;
1651 case 16: movups(dst, src); break;
1652 case 32: vmovups(dst, src, Assembler::AVX_256bit); break;
1653 case 64: vmovups(dst, src, Assembler::AVX_512bit); break;
1654 default: ShouldNotReachHere();
1655 }
1656 }
1657 }
1658
1659 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
1660 assert(rscratch != noreg || always_reachable(src), "missing");
1661
1662 if (reachable(src)) {
1663 load_vector(bt, dst, as_Address(src), vlen_in_bytes);
1664 } else {
1665 lea(rscratch, src);
1666 load_vector(bt, dst, Address(rscratch, 0), vlen_in_bytes);
1667 }
1668 }
1669
1670 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
1671 int vlen_enc = vector_length_encoding(vlen);
1672 if (VM_Version::supports_avx()) {
1673 if (bt == T_LONG) {
1674 if (VM_Version::supports_avx2()) {
1675 vpbroadcastq(dst, src, vlen_enc);
1676 } else {
1677 vmovddup(dst, src, vlen_enc);
1678 }
1679 } else if (bt == T_DOUBLE) {
1680 if (vlen_enc != Assembler::AVX_128bit) {
1681 vbroadcastsd(dst, src, vlen_enc, noreg);
1682 } else {
1683 vmovddup(dst, src, vlen_enc);
1684 }
1685 } else {
1686 if (VM_Version::supports_avx2() && is_integral_type(bt)) {
1687 vpbroadcastd(dst, src, vlen_enc);
1688 } else {
1689 vbroadcastss(dst, src, vlen_enc);
1690 }
1691 }
1692 } else if (VM_Version::supports_sse3()) {
1693 movddup(dst, src);
1694 } else {
1695 load_vector(bt, dst, src, vlen);
1696 }
1697 }
1698
1699 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
1700 // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64.
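  // For example, T_SHORT starts at exact_log2(2) << 6 = 64; floating-point types
  // follow the integral tables, so T_FLOAT lands at 128 + 128 = 256.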
1701 int offset = exact_log2(type2aelembytes(bt)) << 6;
1702 if (is_floating_point_type(bt)) {
1703 offset += 128;
1704 }
1705 ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset);
1706 load_vector(T_BYTE, dst, addr, vlen_in_bytes);
1707 }
1708
1709 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
1710
1711 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1712 int vector_len = Assembler::AVX_128bit;
1713
1714 switch (opcode) {
1715 case Op_AndReductionV: pand(dst, src); break;
1716 case Op_OrReductionV: por (dst, src); break;
1717 case Op_XorReductionV: pxor(dst, src); break;
1718 case Op_MinReductionV:
1719 switch (typ) {
1720 case T_BYTE: pminsb(dst, src); break;
1721 case T_SHORT: pminsw(dst, src); break;
1722 case T_INT: pminsd(dst, src); break;
1723 case T_LONG: assert(UseAVX > 2, "required");
1724 vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1725 default: assert(false, "wrong type");
1726 }
1727 break;
1728 case Op_MaxReductionV:
1729 switch (typ) {
1730 case T_BYTE: pmaxsb(dst, src); break;
1731 case T_SHORT: pmaxsw(dst, src); break;
1732 case T_INT: pmaxsd(dst, src); break;
1733 case T_LONG: assert(UseAVX > 2, "required");
1734 vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
1735 default: assert(false, "wrong type");
1736 }
1737 break;
1738 case Op_AddReductionVF: addss(dst, src); break;
1739 case Op_AddReductionVD: addsd(dst, src); break;
1740 case Op_AddReductionVI:
1741 switch (typ) {
1742 case T_BYTE: paddb(dst, src); break;
1743 case T_SHORT: paddw(dst, src); break;
1744 case T_INT: paddd(dst, src); break;
1745 default: assert(false, "wrong type");
1746 }
1747 break;
1748 case Op_AddReductionVL: paddq(dst, src); break;
1749 case Op_MulReductionVF: mulss(dst, src); break;
1750 case Op_MulReductionVD: mulsd(dst, src); break;
1751 case Op_MulReductionVI:
1752 switch (typ) {
1753 case T_SHORT: pmullw(dst, src); break;
1754 case T_INT: pmulld(dst, src); break;
1755 default: assert(false, "wrong type");
1756 }
1757 break;
1758 case Op_MulReductionVL: assert(UseAVX > 2, "required");
1759 evpmullq(dst, dst, src, vector_len); break;
1760 default: assert(false, "wrong opcode");
1761 }
1762 }
1763
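// Unordered FP reductions may combine lanes in any order, so the packed
// (lane-parallel) forms are used here instead of the strictly ordered scalar
// accumulation in reduce_operation_128 above.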
1764 void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1765 switch (opcode) {
1766 case Op_AddReductionVF: addps(dst, src); break;
1767 case Op_AddReductionVD: addpd(dst, src); break;
1768 case Op_MulReductionVF: mulps(dst, src); break;
1769 case Op_MulReductionVD: mulpd(dst, src); break;
1770 default: assert(false, "%s", NodeClassNames[opcode]);
1771 }
1772 }
1773
1774 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
1775 int vector_len = Assembler::AVX_256bit;
1776
1777 switch (opcode) {
1778 case Op_AndReductionV: vpand(dst, src1, src2, vector_len); break;
1779 case Op_OrReductionV: vpor (dst, src1, src2, vector_len); break;
1780 case Op_XorReductionV: vpxor(dst, src1, src2, vector_len); break;
1781 case Op_MinReductionV:
1782 switch (typ) {
1783 case T_BYTE: vpminsb(dst, src1, src2, vector_len); break;
1784 case T_SHORT: vpminsw(dst, src1, src2, vector_len); break;
1785 case T_INT: vpminsd(dst, src1, src2, vector_len); break;
1786 case T_LONG: assert(UseAVX > 2, "required");
1787 vpminsq(dst, src1, src2, vector_len); break;
1788 default: assert(false, "wrong type");
1789 }
1790 break;
1791 case Op_MaxReductionV:
1792 switch (typ) {
1793 case T_BYTE: vpmaxsb(dst, src1, src2, vector_len); break;
1794 case T_SHORT: vpmaxsw(dst, src1, src2, vector_len); break;
1795 case T_INT: vpmaxsd(dst, src1, src2, vector_len); break;
1796 case T_LONG: assert(UseAVX > 2, "required");
1797 vpmaxsq(dst, src1, src2, vector_len); break;
1798 default: assert(false, "wrong type");
1799 }
1800 break;
1801 case Op_AddReductionVI:
1802 switch (typ) {
1803 case T_BYTE: vpaddb(dst, src1, src2, vector_len); break;
1804 case T_SHORT: vpaddw(dst, src1, src2, vector_len); break;
1805 case T_INT: vpaddd(dst, src1, src2, vector_len); break;
1806 default: assert(false, "wrong type");
1807 }
1808 break;
1809 case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
1810 case Op_MulReductionVI:
1811 switch (typ) {
1812 case T_SHORT: vpmullw(dst, src1, src2, vector_len); break;
1813 case T_INT: vpmulld(dst, src1, src2, vector_len); break;
1814 default: assert(false, "wrong type");
1815 }
1816 break;
1817 case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break;
1818 default: assert(false, "wrong opcode");
1819 }
1820 }
1821
1822 void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
1823 int vector_len = Assembler::AVX_256bit;
1824
1825 switch (opcode) {
1826 case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break;
1827 case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break;
1828 case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break;
1829 case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break;
1830 default: assert(false, "%s", NodeClassNames[opcode]);
1831 }
1832 }
1833
1834 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1835 XMMRegister dst, XMMRegister src,
1836 XMMRegister vtmp1, XMMRegister vtmp2) {
1837 switch (opcode) {
1838 case Op_AddReductionVF:
1839 case Op_MulReductionVF:
1840 reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1841 break;
1842
1843 case Op_AddReductionVD:
1844 case Op_MulReductionVD:
1845 reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1846 break;
1847
1848 default: assert(false, "wrong opcode");
1849 }
1850 }
1851
1852 void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen,
1853 XMMRegister dst, XMMRegister src,
1854 XMMRegister vtmp1, XMMRegister vtmp2) {
1855 switch (opcode) {
1856 case Op_AddReductionVF:
1857 case Op_MulReductionVF:
1858 unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1859 break;
1860
1861 case Op_AddReductionVD:
1862 case Op_MulReductionVD:
1863 unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1864 break;
1865
1866 default: assert(false, "%s", NodeClassNames[opcode]);
1867 }
1868 }
1869
1870 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1871 Register dst, Register src1, XMMRegister src2,
1872 XMMRegister vtmp1, XMMRegister vtmp2) {
1873 switch (vlen) {
1874 case 8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1875 case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1876 case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1877 case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1878
1879 default: assert(false, "wrong vector length");
1880 }
1881 }
1882
1883 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
1884 Register dst, Register src1, XMMRegister src2,
1885 XMMRegister vtmp1, XMMRegister vtmp2) {
1886 switch (vlen) {
1887 case 8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1888 case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1889 case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1890 case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1891
1892 default: assert(false, "wrong vector length");
1893 }
1894 }
1895
1896 void C2_MacroAssembler::reduceS(int opcode, int vlen,
1897 Register dst, Register src1, XMMRegister src2,
1898 XMMRegister vtmp1, XMMRegister vtmp2) {
1899 switch (vlen) {
1900 case 4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1901 case 8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1902 case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1903 case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1904
1905 default: assert(false, "wrong vector length");
1906 }
1907 }
1908
1909 void C2_MacroAssembler::reduceI(int opcode, int vlen,
1910 Register dst, Register src1, XMMRegister src2,
1911 XMMRegister vtmp1, XMMRegister vtmp2) {
1912 switch (vlen) {
1913 case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1914 case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1915 case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1916 case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1917
1918 default: assert(false, "wrong vector length");
1919 }
1920 }
1921
1922 void C2_MacroAssembler::reduceL(int opcode, int vlen,
1923 Register dst, Register src1, XMMRegister src2,
1924 XMMRegister vtmp1, XMMRegister vtmp2) {
1925 switch (vlen) {
1926 case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1927 case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1928 case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1929
1930 default: assert(false, "wrong vector length");
1931 }
1932 }
1933
1934 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1935 switch (vlen) {
1936 case 2:
1937 assert(vtmp2 == xnoreg, "");
1938 reduce2F(opcode, dst, src, vtmp1);
1939 break;
1940 case 4:
1941 assert(vtmp2 == xnoreg, "");
1942 reduce4F(opcode, dst, src, vtmp1);
1943 break;
1944 case 8:
1945 reduce8F(opcode, dst, src, vtmp1, vtmp2);
1946 break;
1947 case 16:
1948 reduce16F(opcode, dst, src, vtmp1, vtmp2);
1949 break;
1950 default: assert(false, "wrong vector length");
1951 }
1952 }
1953
1954 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1955 switch (vlen) {
1956 case 2:
1957 assert(vtmp2 == xnoreg, "");
1958 reduce2D(opcode, dst, src, vtmp1);
1959 break;
1960 case 4:
1961 reduce4D(opcode, dst, src, vtmp1, vtmp2);
1962 break;
1963 case 8:
1964 reduce8D(opcode, dst, src, vtmp1, vtmp2);
1965 break;
1966 default: assert(false, "wrong vector length");
1967 }
1968 }
1969
1970 void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1971 switch (vlen) {
1972 case 2:
1973 assert(vtmp1 == xnoreg, "");
1974 assert(vtmp2 == xnoreg, "");
1975 unorderedReduce2F(opcode, dst, src);
1976 break;
1977 case 4:
1978 assert(vtmp2 == xnoreg, "");
1979 unorderedReduce4F(opcode, dst, src, vtmp1);
1980 break;
1981 case 8:
1982 unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2);
1983 break;
1984 case 16:
1985 unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2);
1986 break;
1987 default: assert(false, "wrong vector length");
1988 }
1989 }
1990
1991 void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1992 switch (vlen) {
1993 case 2:
1994 assert(vtmp1 == xnoreg, "");
1995 assert(vtmp2 == xnoreg, "");
1996 unorderedReduce2D(opcode, dst, src);
1997 break;
1998 case 4:
1999 assert(vtmp2 == xnoreg, "");
2000 unorderedReduce4D(opcode, dst, src, vtmp1);
2001 break;
2002 case 8:
2003 unorderedReduce8D(opcode, dst, src, vtmp1, vtmp2);
2004 break;
2005 default: assert(false, "wrong vector length");
2006 }
2007 }
2008
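// The integer reduction helpers below fold the vector in halves down to a single
// element, combine it with the scalar accumulator passed in src1, and leave the
// result in the general-purpose register dst. For Op_AddReductionVI the
// horizontal-add forms (phaddd/phaddw) are used to shorten the sequence.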
2009 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2010 if (opcode == Op_AddReductionVI) {
2011 if (vtmp1 != src2) {
2012 movdqu(vtmp1, src2);
2013 }
2014 phaddd(vtmp1, vtmp1);
2015 } else {
2016 pshufd(vtmp1, src2, 0x1);
2017 reduce_operation_128(T_INT, opcode, vtmp1, src2);
2018 }
2019 movdl(vtmp2, src1);
2020 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2021 movdl(dst, vtmp1);
2022 }
2023
2024 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2025 if (opcode == Op_AddReductionVI) {
2026 if (vtmp1 != src2) {
2027 movdqu(vtmp1, src2);
2028 }
2029 phaddd(vtmp1, src2);
2030 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2031 } else {
2032 pshufd(vtmp2, src2, 0xE);
2033 reduce_operation_128(T_INT, opcode, vtmp2, src2);
2034 reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2035 }
2036 }
2037
2038 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2039 if (opcode == Op_AddReductionVI) {
2040 vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
2041 vextracti128_high(vtmp2, vtmp1);
2042 vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
2043 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2044 } else {
2045 vextracti128_high(vtmp1, src2);
2046 reduce_operation_128(T_INT, opcode, vtmp1, src2);
2047 reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2048 }
2049 }
2050
2051 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2052 vextracti64x4_high(vtmp2, src2);
2053 reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
2054 reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2055 }
2056
2057 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2058 pshufd(vtmp2, src2, 0x1);
2059 reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2060 movdqu(vtmp1, vtmp2);
2061 psrldq(vtmp1, 2);
2062 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2063 movdqu(vtmp2, vtmp1);
2064 psrldq(vtmp2, 1);
2065 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2066 movdl(vtmp2, src1);
2067 pmovsxbd(vtmp1, vtmp1);
2068 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2069 pextrb(dst, vtmp1, 0x0);
2070 movsbl(dst, dst);
2071 }
2072
2073 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2074 pshufd(vtmp1, src2, 0xE);
2075 reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
2076 reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2077 }
2078
2079 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2080 vextracti128_high(vtmp2, src2);
2081 reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2082 reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2083 }
2084
2085 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2086 vextracti64x4_high(vtmp1, src2);
2087 reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
2088 reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2089 }
2090
2091 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2092 pmovsxbw(vtmp2, src2);
2093 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2094 }
2095
2096 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2097 if (UseAVX > 1) {
2098 int vector_len = Assembler::AVX_256bit;
2099 vpmovsxbw(vtmp1, src2, vector_len);
2100 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2101 } else {
2102 pmovsxbw(vtmp2, src2);
2103 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2104 pshufd(vtmp2, src2, 0x1);
2105 pmovsxbw(vtmp2, src2);
2106 reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2107 }
2108 }
2109
2110 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2111 if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
2112 int vector_len = Assembler::AVX_512bit;
2113 vpmovsxbw(vtmp1, src2, vector_len);
2114 reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2115 } else {
2116 assert(UseAVX >= 2,"Should not reach here.");
2117 mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
2118 vextracti128_high(vtmp2, src2);
2119 mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2120 }
2121 }
2122
2123 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2124 mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
2125 vextracti64x4_high(vtmp2, src2);
2126 mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2127 }
2128
2129 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2130 if (opcode == Op_AddReductionVI) {
2131 if (vtmp1 != src2) {
2132 movdqu(vtmp1, src2);
2133 }
2134 phaddw(vtmp1, vtmp1);
2135 phaddw(vtmp1, vtmp1);
2136 } else {
2137 pshufd(vtmp2, src2, 0x1);
2138 reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2139 movdqu(vtmp1, vtmp2);
2140 psrldq(vtmp1, 2);
2141 reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
2142 }
2143 movdl(vtmp2, src1);
2144 pmovsxwd(vtmp1, vtmp1);
2145 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2146 pextrw(dst, vtmp1, 0x0);
2147 movswl(dst, dst);
2148 }
2149
2150 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2151 if (opcode == Op_AddReductionVI) {
2152 if (vtmp1 != src2) {
2153 movdqu(vtmp1, src2);
2154 }
2155 phaddw(vtmp1, src2);
2156 } else {
2157 pshufd(vtmp1, src2, 0xE);
2158 reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
2159 }
2160 reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2161 }
2162
2163 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2164 if (opcode == Op_AddReductionVI) {
2165 int vector_len = Assembler::AVX_256bit;
2166 vphaddw(vtmp2, src2, src2, vector_len);
2167 vpermq(vtmp2, vtmp2, 0xD8, vector_len);
2168 } else {
2169 vextracti128_high(vtmp2, src2);
2170 reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2171 }
2172 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2173 }
2174
2175 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2176 int vector_len = Assembler::AVX_256bit;
2177 vextracti64x4_high(vtmp1, src2);
2178 reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2179 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2180 }
2181
2182 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2183 pshufd(vtmp2, src2, 0xE);
2184 reduce_operation_128(T_LONG, opcode, vtmp2, src2);
2185 movdq(vtmp1, src1);
2186 reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
2187 movdq(dst, vtmp1);
2188 }
2189
2190 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2191 vextracti128_high(vtmp1, src2);
2192 reduce_operation_128(T_LONG, opcode, vtmp1, src2);
2193 reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2194 }
2195
2196 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2197 vextracti64x4_high(vtmp2, src2);
2198 reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
2199 reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2200 }
2201
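// Build an opmask register with the low 'len' bits set: bzhiq zeroes all bits of
// the all-ones value at positions >= len.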
2202 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
2203 mov64(temp, -1L);
2204 bzhiq(temp, temp, len);
2205 kmovql(dst, temp);
2206 }
2207
2208 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2209 reduce_operation_128(T_FLOAT, opcode, dst, src);
2210 pshufd(vtmp, src, 0x1);
2211 reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2212 }
2213
2214 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2215 reduce2F(opcode, dst, src, vtmp);
2216 pshufd(vtmp, src, 0x2);
2217 reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2218 pshufd(vtmp, src, 0x3);
2219 reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2220 }
2221
2222 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2223 reduce4F(opcode, dst, src, vtmp2);
2224 vextractf128_high(vtmp2, src);
2225 reduce4F(opcode, dst, vtmp2, vtmp1);
2226 }
2227
2228 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2229 reduce8F(opcode, dst, src, vtmp1, vtmp2);
2230 vextracti64x4_high(vtmp1, src);
2231 reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
2232 }
2233
2234 void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) {
2235 pshufd(dst, src, 0x1);
2236 reduce_operation_128(T_FLOAT, opcode, dst, src);
2237 }
2238
2239 void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2240 pshufd(vtmp, src, 0xE);
2241 unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src);
2242 unorderedReduce2F(opcode, dst, vtmp);
2243 }
2244
2245 void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2246 vextractf128_high(vtmp1, src);
2247 unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src);
2248 unorderedReduce4F(opcode, dst, vtmp1, vtmp2);
2249 }
2250
2251 void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2252 vextractf64x4_high(vtmp2, src);
2253 unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src);
2254 unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2);
2255 }
2256
2257 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2258 reduce_operation_128(T_DOUBLE, opcode, dst, src);
2259 pshufd(vtmp, src, 0xE);
2260 reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
2261 }
2262
2263 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2264 reduce2D(opcode, dst, src, vtmp2);
2265 vextractf128_high(vtmp2, src);
2266 reduce2D(opcode, dst, vtmp2, vtmp1);
2267 }
2268
2269 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2270 reduce4D(opcode, dst, src, vtmp1, vtmp2);
2271 vextracti64x4_high(vtmp1, src);
2272 reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
2273 }
2274
2275 void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) {
2276 pshufd(dst, src, 0xE);
2277 reduce_operation_128(T_DOUBLE, opcode, dst, src);
2278 }
2279
2280 void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2281 vextractf128_high(vtmp, src);
2282 unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src);
2283 unorderedReduce2D(opcode, dst, vtmp);
2284 }
2285
2286 void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2287 vextractf64x4_high(vtmp2, src);
2288 unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src);
2289 unorderedReduce4D(opcode, dst, vtmp2, vtmp1);
2290 }
2291
2292 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
2293 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2294 }
2295
2296 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
2297 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2298 }
2299
2300 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len) {
2301 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2302 }
2303
2304 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
2305 int vec_enc) {
2306 switch(elem_bt) {
2307 case T_INT:
2308 case T_FLOAT:
2309 vmaskmovps(dst, src, mask, vec_enc);
2310 break;
2311 case T_LONG:
2312 case T_DOUBLE:
2313 vmaskmovpd(dst, src, mask, vec_enc);
2314 break;
2315 default:
2316 fatal("Unsupported type %s", type2name(elem_bt));
2317 break;
2318 }
2319 }
2320
2321 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
2322 int vec_enc) {
2323 switch(elem_bt) {
2324 case T_INT:
2325 case T_FLOAT:
2326 vmaskmovps(dst, src, mask, vec_enc);
2327 break;
2328 case T_LONG:
2329 case T_DOUBLE:
2330 vmaskmovpd(dst, src, mask, vec_enc);
2331 break;
2332 default:
2333 fatal("Unsupported type %s", type2name(elem_bt));
2334 break;
2335 }
2336 }
2337
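// Min/max reduction over float lanes: each iteration folds the upper half of the
// currently active width onto the lower half (256/128-bit extracts, then in-lane
// permutes) and combines the halves with vminmax_fp, which implements Java's
// Math.min/max semantics (NaN and signed-zero handling). On AVX10.2 the dedicated
// minmax form is used instead of the multi-instruction fallback.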
2338 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
2339 XMMRegister dst, XMMRegister src,
2340 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2341 XMMRegister xmm_0, XMMRegister xmm_1) {
2342 const int permconst[] = {1, 14};
2343 XMMRegister wsrc = src;
2344 XMMRegister wdst = xmm_0;
2345 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2346
2347 int vlen_enc = Assembler::AVX_128bit;
2348 if (vlen == 16) {
2349 vlen_enc = Assembler::AVX_256bit;
2350 }
2351
2352 for (int i = log2(vlen) - 1; i >=0; i--) {
2353 if (i == 0 && !is_dst_valid) {
2354 wdst = dst;
2355 }
2356 if (i == 3) {
2357 vextracti64x4_high(wtmp, wsrc);
2358 } else if (i == 2) {
2359 vextracti128_high(wtmp, wsrc);
2360 } else { // i = [0,1]
2361 vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
2362 }
2363
2364 if (VM_Version::supports_avx10_2()) {
2365 vminmax_fp(opcode, T_FLOAT, wdst, k0, wtmp, wsrc, vlen_enc);
2366 } else {
2367 vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2368 }
2369 wsrc = wdst;
2370 vlen_enc = Assembler::AVX_128bit;
2371 }
2372 if (is_dst_valid) {
2373 if (VM_Version::supports_avx10_2()) {
2374 vminmax_fp(opcode, T_FLOAT, dst, k0, wdst, dst, Assembler::AVX_128bit);
2375 } else {
2376 vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2377 }
2378 }
2379 }
2380
2381 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
2382 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2383 XMMRegister xmm_0, XMMRegister xmm_1) {
2384 XMMRegister wsrc = src;
2385 XMMRegister wdst = xmm_0;
2386 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2387 int vlen_enc = Assembler::AVX_128bit;
2388 if (vlen == 8) {
2389 vlen_enc = Assembler::AVX_256bit;
2390 }
2391 for (int i = log2(vlen) - 1; i >=0; i--) {
2392 if (i == 0 && !is_dst_valid) {
2393 wdst = dst;
2394 }
2395 if (i == 1) {
2396 vextracti128_high(wtmp, wsrc);
2397 } else if (i == 2) {
2398 vextracti64x4_high(wtmp, wsrc);
2399 } else {
2400 assert(i == 0, "%d", i);
2401 vpermilpd(wtmp, wsrc, 1, vlen_enc);
2402 }
2403
2404 if (VM_Version::supports_avx10_2()) {
2405 vminmax_fp(opcode, T_DOUBLE, wdst, k0, wtmp, wsrc, vlen_enc);
2406 } else {
2407 vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2408 }
2409
2410 wsrc = wdst;
2411 vlen_enc = Assembler::AVX_128bit;
2412 }
2413
2414 if (is_dst_valid) {
2415 if (VM_Version::supports_avx10_2()) {
2416 vminmax_fp(opcode, T_DOUBLE, dst, k0, wdst, dst, Assembler::AVX_128bit);
2417 } else {
2418 vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2419 }
2420 }
2421 }
2422
2423 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2424 switch (bt) {
2425 case T_BYTE: pextrb(dst, src, idx); break;
2426 case T_SHORT: pextrw(dst, src, idx); break;
2427 case T_INT: pextrd(dst, src, idx); break;
2428 case T_LONG: pextrq(dst, src, idx); break;
2429
2430 default:
2431 assert(false,"Should not reach here.");
2432 break;
2433 }
2434 }
2435
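// Returns the register holding the 128-bit lane that contains 'elemindex':
// lane 0 is returned as 'src' directly, higher lanes are first extracted into 'dst'.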
2436 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2437 int esize = type2aelembytes(typ);
2438 int elem_per_lane = 16/esize;
2439 int lane = elemindex / elem_per_lane;
2440 int eindex = elemindex % elem_per_lane;
2441
2442 if (lane >= 2) {
2443 assert(UseAVX > 2, "required");
2444 vextractf32x4(dst, src, lane & 3);
2445 return dst;
2446 } else if (lane > 0) {
2447 assert(UseAVX > 0, "required");
2448 vextractf128(dst, src, lane);
2449 return dst;
2450 } else {
2451 return src;
2452 }
2453 }
2454
2455 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) {
2456 if (typ == T_BYTE) {
2457 movsbl(dst, dst);
2458 } else if (typ == T_SHORT) {
2459 movswl(dst, dst);
2460 }
2461 }
2462
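// Extract an integral element into a general-purpose register and sign-extend it
// to 32/64 bits. 'src' presumably already holds the 128-bit lane selected by
// get_lane() above, since only the index within the lane is used here.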
2463 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2464 int esize = type2aelembytes(typ);
2465 int elem_per_lane = 16/esize;
2466 int eindex = elemindex % elem_per_lane;
2467 assert(is_integral_type(typ),"required");
2468
2469 if (eindex == 0) {
2470 if (typ == T_LONG) {
2471 movq(dst, src);
2472 } else {
2473 movdl(dst, src);
2474 movsxl(typ, dst);
2475 }
2476 } else {
2477 extract(typ, dst, src, eindex);
2478 movsxl(typ, dst);
2479 }
2480 }
2481
2482 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
2483 int esize = type2aelembytes(typ);
2484 int elem_per_lane = 16/esize;
2485 int eindex = elemindex % elem_per_lane;
2486 assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
2487
2488 if (eindex == 0) {
2489 movq(dst, src);
2490 } else {
2491 if (typ == T_FLOAT) {
2492 if (UseAVX == 0) {
2493 movdqu(dst, src);
2494 shufps(dst, dst, eindex);
2495 } else {
2496 vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2497 }
2498 } else {
2499 if (UseAVX == 0) {
2500 movdqu(dst, src);
2501 psrldq(dst, eindex*esize);
2502 } else {
2503 vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2504 }
2505 movq(dst, dst);
2506 }
2507 }
2508 // Zero upper bits
2509 if (typ == T_FLOAT) {
2510 if (UseAVX == 0) {
2511 assert(vtmp != xnoreg, "required.");
2512 movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
2513 pand(dst, vtmp);
2514 } else {
2515 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
2516 }
2517 }
2518 }
2519
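// Masked vector compare dispatched on element width; note that T_FLOAT and
// T_DOUBLE map to the signed integer compares of the same width, i.e. the raw
// bit patterns are compared.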
2520 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2521 switch(typ) {
2522 case T_BYTE:
2523 case T_BOOLEAN:
2524 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2525 break;
2526 case T_SHORT:
2527 case T_CHAR:
2528 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2529 break;
2530 case T_INT:
2531 case T_FLOAT:
2532 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2533 break;
2534 case T_LONG:
2535 case T_DOUBLE:
2536 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2537 break;
2538 default:
2539 assert(false,"Should not reach here.");
2540 break;
2541 }
2542 }
2543
2544 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
2545 assert(rscratch != noreg || always_reachable(src2), "missing");
2546
2547 switch(typ) {
2548 case T_BOOLEAN:
2549 case T_BYTE:
2550 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2551 break;
2552 case T_CHAR:
2553 case T_SHORT:
2554 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2555 break;
2556 case T_INT:
2557 case T_FLOAT:
2558 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2559 break;
2560 case T_LONG:
2561 case T_DOUBLE:
2562 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2563 break;
2564 default:
2565 assert(false,"Should not reach here.");
2566 break;
2567 }
2568 }
2569
2570 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2571 switch(typ) {
2572 case T_BYTE:
2573 evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2574 break;
2575 case T_SHORT:
2576 evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2577 break;
2578 case T_INT:
2579 case T_FLOAT:
2580 evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2581 break;
2582 case T_LONG:
2583 case T_DOUBLE:
2584 evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2585 break;
2586 default:
2587 assert(false,"Should not reach here.");
2588 break;
2589 }
2590 }
2591
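// Flag-setting vector test of src1 against src2 (at most 32 bytes). Vectors
// shorter than 16 bytes are widened by duplicating the lower part of src1 so
// that ptest/vtestps sees a full register.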
2592 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) {
2593 assert(vlen_in_bytes <= 32, "");
2594 int esize = type2aelembytes(bt);
2595 if (vlen_in_bytes == 32) {
2596 assert(vtmp == xnoreg, "required.");
2597 if (esize >= 4) {
2598 vtestps(src1, src2, AVX_256bit);
2599 } else {
2600 vptest(src1, src2, AVX_256bit);
2601 }
2602 return;
2603 }
2604 if (vlen_in_bytes < 16) {
2605 // Duplicate the lower part to fill the whole register;
2606 // there is no need to do so for src2.
2607 assert(vtmp != xnoreg, "required");
2608 int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04;
2609 pshufd(vtmp, src1, shuffle_imm);
2610 } else {
2611 assert(vtmp == xnoreg, "required");
2612 vtmp = src1;
2613 }
2614 if (esize >= 4 && VM_Version::supports_avx()) {
2615 vtestps(vtmp, src2, AVX_128bit);
2616 } else {
2617 ptest(vtmp, src2);
2618 }
2619 }
2620
2621 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
2622 #ifdef ASSERT
2623 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2624 bool is_bw_supported = VM_Version::supports_avx512bw();
2625 if (is_bw && !is_bw_supported) {
2626 assert(vlen_enc != Assembler::AVX_512bit, "required");
2627 assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
2628 "XMM register should be 0-15");
2629 }
2630 #endif // ASSERT
2631 switch (elem_bt) {
2632 case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
2633 case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
2634 case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
2635 case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
2636 case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
2637 case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
2638 default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2639 }
2640 }
2641
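// Broadcast a general-purpose register value to all vector lanes. With AVX-512
// (plus BW for byte/short elements and VL for sub-512-bit vectors) the GPR-source
// evpbroadcast* forms are used directly; otherwise the value is first moved into
// an XMM register and broadcast from there.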
2642 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
2643 assert(UseAVX >= 2, "required");
2644 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2645 bool is_vl = vlen_enc != Assembler::AVX_512bit;
2646 if ((UseAVX > 2) &&
2647 (!is_bw || VM_Version::supports_avx512bw()) &&
2648 (!is_vl || VM_Version::supports_avx512vl())) {
2649 switch (elem_bt) {
2650 case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
2651 case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
2652 case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
2653 case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
2654 default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2655 }
2656 } else {
2657 assert(vlen_enc != Assembler::AVX_512bit, "required");
2658 assert((dst->encoding() < 16),"XMM register should be 0-15");
2659 switch (elem_bt) {
2660 case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
2661 case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
2662 case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
2663 case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
2664 case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
2665 case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
2666 default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2667 }
2668 }
2669 }
2670
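// Sign-extend byte vector elements to the requested element type, converting the
// widened integers to float/double where the target type is floating point.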
2671 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
2672 switch (to_elem_bt) {
2673 case T_SHORT:
2674 vpmovsxbw(dst, src, vlen_enc);
2675 break;
2676 case T_INT:
2677 vpmovsxbd(dst, src, vlen_enc);
2678 break;
2679 case T_FLOAT:
2680 vpmovsxbd(dst, src, vlen_enc);
2681 vcvtdq2ps(dst, dst, vlen_enc);
2682 break;
2683 case T_LONG:
2684 vpmovsxbq(dst, src, vlen_enc);
2685 break;
2686 case T_DOUBLE: {
2687 int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
2688 vpmovsxbd(dst, src, mid_vlen_enc);
2689 vcvtdq2pd(dst, dst, vlen_enc);
2690 break;
2691 }
2692 default:
2693 fatal("Unsupported type %s", type2name(to_elem_bt));
2694 break;
2695 }
2696 }
2697
2698 //-------------------------------------------------------------------------------------------
2699
2700 // IndexOf for constant substrings with size >= 8 chars
2701 // which don't need to be loaded through the stack.
2702 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2703 Register cnt1, Register cnt2,
2704 int int_cnt2, Register result,
2705 XMMRegister vec, Register tmp,
2706 int ae) {
2707 ShortBranchVerifier sbv(this);
2708 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2709 assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2710
2711 // This method uses the pcmpestri instruction with bound registers
2712 // inputs:
2713 // xmm - substring
2714 // rax - substring length (elements count)
2715 // mem - scanned string
2716 // rdx - string length (elements count)
2717 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2718 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2719 // outputs:
2720 // rcx - matched index in string
2721 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2722 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2723 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2724 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2725 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
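  // scale1 addresses the scanned string (str1/result), scale2 the substring
  // (str2); for UL the substring is Latin-1 (byte elements) while the scanned
  // string is UTF-16.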
2726
2727 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2728 RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2729 MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2730
2731 // Note, inline_string_indexOf() generates checks:
2732 // if (substr.count > string.count) return -1;
2733 // if (substr.count == 0) return 0;
2734 assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
2735
2736 // Load substring.
2737 if (ae == StrIntrinsicNode::UL) {
2738 pmovzxbw(vec, Address(str2, 0));
2739 } else {
2740 movdqu(vec, Address(str2, 0));
2741 }
2742 movl(cnt2, int_cnt2);
2743 movptr(result, str1); // string addr
2744
2745 if (int_cnt2 > stride) {
2746 jmpb(SCAN_TO_SUBSTR);
2747
2748 // Reload substr for rescan; this code
2749 // is executed only for large substrings (> 8 chars).
2750 bind(RELOAD_SUBSTR);
2751 if (ae == StrIntrinsicNode::UL) {
2752 pmovzxbw(vec, Address(str2, 0));
2753 } else {
2754 movdqu(vec, Address(str2, 0));
2755 }
2756 negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2757
2758 bind(RELOAD_STR);
2759 // We came here after the beginning of the substring was
2760 // matched but the rest of it was not, so we need to search
2761 // again. Start from the next element after the previous match.
2762
2763 // cnt2 is the number of remaining substring elements and
2764 // cnt1 is the number of remaining string elements when the compare failed.
2765 // Restored cnt1 = cnt1 - cnt2 + int_cnt2
2766 subl(cnt1, cnt2);
2767 addl(cnt1, int_cnt2);
2768 movl(cnt2, int_cnt2); // Now restore cnt2
2769
2770 decrementl(cnt1); // Shift to next element
2771 cmpl(cnt1, cnt2);
2772 jcc(Assembler::negative, RET_NOT_FOUND); // Left less than substring
2773
2774 addptr(result, (1<<scale1));
2775
2776 } // (int_cnt2 > 8)
2777
2778 // Scan string for start of substr in 16-byte vectors
2779 bind(SCAN_TO_SUBSTR);
2780 pcmpestri(vec, Address(result, 0), mode);
2781 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1
2782 subl(cnt1, stride);
2783 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2784 cmpl(cnt1, cnt2);
2785 jccb(Assembler::negative, RET_NOT_FOUND); // Left less than substring
2786 addptr(result, 16);
2787 jmpb(SCAN_TO_SUBSTR);
2788
2789 // Found a potential substr
2790 bind(FOUND_CANDIDATE);
2791 // Matched whole vector if first element matched (tmp(rcx) == 0).
2792 if (int_cnt2 == stride) {
2793 jccb(Assembler::overflow, RET_FOUND); // OF == 1
2794 } else { // int_cnt2 > 8
2795 jccb(Assembler::overflow, FOUND_SUBSTR);
2796 }
2797 // After pcmpestri tmp(rcx) contains matched element index
2798 // Compute start addr of substr
2799 lea(result, Address(result, tmp, scale1));
2800
2801 // Make sure string is still long enough
2802 subl(cnt1, tmp);
2803 cmpl(cnt1, cnt2);
2804 if (int_cnt2 == stride) {
2805 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2806 } else { // int_cnt2 > 8
2807 jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
2808 }
2809 // Left less than substring.
2810
2811 bind(RET_NOT_FOUND);
2812 movl(result, -1);
2813 jmp(EXIT);
2814
2815 if (int_cnt2 > stride) {
2816 // This code is optimized for the case when the whole substring
2817 // is matched if its head is matched.
2818 bind(MATCH_SUBSTR_HEAD);
2819 pcmpestri(vec, Address(result, 0), mode);
2820 // Reload only the string if it does not match
2821 jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
2822
2823 Label CONT_SCAN_SUBSTR;
2824 // Compare the rest of substring (> 8 chars).
2825 bind(FOUND_SUBSTR);
2826 // First 8 chars are already matched.
2827 negptr(cnt2);
2828 addptr(cnt2, stride);
2829
2830 bind(SCAN_SUBSTR);
2831 subl(cnt1, stride);
2832 cmpl(cnt2, -stride); // Do not read beyond substring
2833 jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
2834 // Back-up strings to avoid reading beyond substring:
2835 // cnt1 = cnt1 - cnt2 + 8
2836 addl(cnt1, cnt2); // cnt2 is negative
2837 addl(cnt1, stride);
2838 movl(cnt2, stride); negptr(cnt2);
2839 bind(CONT_SCAN_SUBSTR);
2840 if (int_cnt2 < (int)G) {
2841 int tail_off1 = int_cnt2<<scale1;
2842 int tail_off2 = int_cnt2<<scale2;
2843 if (ae == StrIntrinsicNode::UL) {
2844 pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
2845 } else {
2846 movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
2847 }
2848 pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
2849 } else {
2850 // calculate index in register to avoid integer overflow (int_cnt2*2)
2851 movl(tmp, int_cnt2);
2852 addptr(tmp, cnt2);
2853 if (ae == StrIntrinsicNode::UL) {
2854 pmovzxbw(vec, Address(str2, tmp, scale2, 0));
2855 } else {
2856 movdqu(vec, Address(str2, tmp, scale2, 0));
2857 }
2858 pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
2859 }
2860 // Need to reload the string pointers if we did not match the whole vector
2861 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2862 addptr(cnt2, stride);
2863 jcc(Assembler::negative, SCAN_SUBSTR);
2864 // Fall through if found full substring
2865
2866 } // (int_cnt2 > 8)
2867
2868 bind(RET_FOUND);
2869 // Found result if we matched full small substring.
2870 // Compute substr offset
2871 subptr(result, str1);
2872 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2873 shrl(result, 1); // index
2874 }
2875 bind(EXIT);
2876
2877 } // string_indexofC8
2878
2879 // Small strings are loaded through the stack if they cross a page boundary.
2880 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
2881 Register cnt1, Register cnt2,
2882 int int_cnt2, Register result,
2883 XMMRegister vec, Register tmp,
2884 int ae) {
2885 ShortBranchVerifier sbv(this);
2886 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2887 assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2888
2889 //
2890 // int_cnt2 is the length of a small (< 8 chars) constant substring
2891 // or (-1) for a non-constant substring, in which case its length
2892 // is in the cnt2 register.
2893 //
2894 // Note, inline_string_indexOf() generates checks:
2895 // if (substr.count > string.count) return -1;
2896 // if (substr.count == 0) return 0;
2897 //
2898 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2899 assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
2900 // This method uses the pcmpestri instruction with bound registers
2901 // inputs:
2902 // xmm - substring
2903 // rax - substring length (elements count)
2904 // mem - scanned string
2905 // rdx - string length (elements count)
2906 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2907 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2908 // outputs:
2909 // rcx - matched index in string
2910 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2911 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2912 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2913 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2914
2915 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
2916 RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
2917 FOUND_CANDIDATE;
2918
2919 { //========================================================
2920 // We don't know where these strings are located
2921 // and we can't read beyond them. Load them through the stack.
2922 Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
2923
2924 movptr(tmp, rsp); // save old SP
2925
2926 if (int_cnt2 > 0) { // small (< 8 chars) constant substring
2927 if (int_cnt2 == (1>>scale2)) { // One byte
2928 assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
2929 load_unsigned_byte(result, Address(str2, 0));
2930 movdl(vec, result); // move 32 bits
2931 } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes
2932 // Not enough header space in 32-bit VM: 12+3 = 15.
2933 movl(result, Address(str2, -1));
2934 shrl(result, 8);
2935 movdl(vec, result); // move 32 bits
2936 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char
2937 load_unsigned_short(result, Address(str2, 0));
2938 movdl(vec, result); // move 32 bits
2939 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
2940 movdl(vec, Address(str2, 0)); // move 32 bits
2941 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
2942 movq(vec, Address(str2, 0)); // move 64 bits
2943 } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7})
2944 // Array header size is 12 bytes in 32-bit VM
2945 // + 6 bytes for 3 chars == 18 bytes,
2946 // enough space to load vec and shift.
2947 assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
2948 if (ae == StrIntrinsicNode::UL) {
2949 int tail_off = int_cnt2-8;
2950 pmovzxbw(vec, Address(str2, tail_off));
2951 psrldq(vec, -2*tail_off);
2952 }
2953 else {
2954 int tail_off = int_cnt2*(1<<scale2);
2955 movdqu(vec, Address(str2, tail_off-16));
2956 psrldq(vec, 16-tail_off);
2957 }
2958 }
2959 } else { // not constant substring
2960 cmpl(cnt2, stride);
2961 jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
2962
2963 // We can read beyond the string if str+16 does not cross a page boundary
2964 // since heaps are aligned and mapped by pages.
2965 assert(os::vm_page_size() < (int)G, "default page should be small");
2966 movl(result, str2); // We need only low 32 bits
2967 andl(result, ((int)os::vm_page_size()-1));
2968 cmpl(result, ((int)os::vm_page_size()-16));
2969 jccb(Assembler::belowEqual, CHECK_STR);
2970
2971 // Move small strings to the stack to allow loading 16 bytes into vec.
2972 subptr(rsp, 16);
2973 int stk_offset = wordSize-(1<<scale2);
2974 push(cnt2);
2975
2976 bind(COPY_SUBSTR);
2977 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
2978 load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
2979 movb(Address(rsp, cnt2, scale2, stk_offset), result);
2980 } else if (ae == StrIntrinsicNode::UU) {
2981 load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
2982 movw(Address(rsp, cnt2, scale2, stk_offset), result);
2983 }
2984 decrement(cnt2);
2985 jccb(Assembler::notZero, COPY_SUBSTR);
2986
2987 pop(cnt2);
2988 movptr(str2, rsp); // New substring address
2989 } // non constant
2990
2991 bind(CHECK_STR);
2992 cmpl(cnt1, stride);
2993 jccb(Assembler::aboveEqual, BIG_STRINGS);
2994
2995 // Check cross page boundary.
2996 movl(result, str1); // We need only low 32 bits
2997 andl(result, ((int)os::vm_page_size()-1));
2998 cmpl(result, ((int)os::vm_page_size()-16));
2999 jccb(Assembler::belowEqual, BIG_STRINGS);
3000
3001 subptr(rsp, 16);
3002 int stk_offset = -(1<<scale1);
3003 if (int_cnt2 < 0) { // not constant
3004 push(cnt2);
3005 stk_offset += wordSize;
3006 }
3007 movl(cnt2, cnt1);
3008
3009 bind(COPY_STR);
3010 if (ae == StrIntrinsicNode::LL) {
3011 load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
3012 movb(Address(rsp, cnt2, scale1, stk_offset), result);
3013 } else {
3014 load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
3015 movw(Address(rsp, cnt2, scale1, stk_offset), result);
3016 }
3017 decrement(cnt2);
3018 jccb(Assembler::notZero, COPY_STR);
3019
3020 if (int_cnt2 < 0) { // not constant
3021 pop(cnt2);
3022 }
3023 movptr(str1, rsp); // New string address
3024
3025 bind(BIG_STRINGS);
3026 // Load substring.
3027 if (int_cnt2 < 0) { // -1
3028 if (ae == StrIntrinsicNode::UL) {
3029 pmovzxbw(vec, Address(str2, 0));
3030 } else {
3031 movdqu(vec, Address(str2, 0));
3032 }
3033 push(cnt2); // substr count
3034 push(str2); // substr addr
3035 push(str1); // string addr
3036 } else {
3037 // Small (< 8 chars) constant substrings are loaded already.
3038 movl(cnt2, int_cnt2);
3039 }
3040 push(tmp); // original SP
3041
3042 } // Finished loading
3043
3044 //========================================================
3045 // Start search
3046 //
3047
3048 movptr(result, str1); // string addr
3049
3050 if (int_cnt2 < 0) { // Only for non constant substring
3051 jmpb(SCAN_TO_SUBSTR);
3052
3053 // SP saved at sp+0
3054 // String saved at sp+1*wordSize
3055 // Substr saved at sp+2*wordSize
3056 // Substr count saved at sp+3*wordSize
3057
3058 // Reload substr for rescan, this code
3059 // is executed only for large substrings (> 8 chars)
3060 bind(RELOAD_SUBSTR);
3061 movptr(str2, Address(rsp, 2*wordSize));
3062 movl(cnt2, Address(rsp, 3*wordSize));
3063 if (ae == StrIntrinsicNode::UL) {
3064 pmovzxbw(vec, Address(str2, 0));
3065 } else {
3066 movdqu(vec, Address(str2, 0));
3067 }
3068 // We came here after the beginning of the substring was
3069 // matched but the rest of it was not, so we need to search
3070 // again. Start from the next element after the previous match.
3071 subptr(str1, result); // Restore counter
3072 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3073 shrl(str1, 1);
3074 }
3075 addl(cnt1, str1);
3076 decrementl(cnt1); // Shift to next element
3077 cmpl(cnt1, cnt2);
3078 jcc(Assembler::negative, RET_NOT_FOUND); // Left less than substring
3079
3080 addptr(result, (1<<scale1));
3081 } // non constant
3082
3083 // Scan string for start of substr in 16-byte vectors
3084 bind(SCAN_TO_SUBSTR);
3085 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3086 pcmpestri(vec, Address(result, 0), mode);
3087 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1
3088 subl(cnt1, stride);
3089 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
3090 cmpl(cnt1, cnt2);
3091 jccb(Assembler::negative, RET_NOT_FOUND); // Left less than substring
3092 addptr(result, 16);
3093
3094 bind(ADJUST_STR);
3095 cmpl(cnt1, stride); // Do not read beyond string
3096 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
3097 // Back-up string to avoid reading beyond string.
3098 lea(result, Address(result, cnt1, scale1, -16));
3099 movl(cnt1, stride);
3100 jmpb(SCAN_TO_SUBSTR);
3101
3102 // Found a potential substr
3103 bind(FOUND_CANDIDATE);
3104 // After pcmpestri tmp(rcx) contains matched element index
3105
3106 // Make sure string is still long enough
3107 subl(cnt1, tmp);
3108 cmpl(cnt1, cnt2);
3109 jccb(Assembler::greaterEqual, FOUND_SUBSTR);
3110 // Left less than substring.
3111
3112 bind(RET_NOT_FOUND);
3113 movl(result, -1);
3114 jmp(CLEANUP);
3115
3116 bind(FOUND_SUBSTR);
3117 // Compute start addr of substr
3118 lea(result, Address(result, tmp, scale1));
3119 if (int_cnt2 > 0) { // Constant substring
3120 // Repeat search for small substring (< 8 chars)
3121 // from new point without reloading substring.
3122 // Have to check that we don't read beyond string.
3123 cmpl(tmp, stride-int_cnt2);
3124 jccb(Assembler::greater, ADJUST_STR);
3125 // Fall through if matched whole substring.
3126 } else { // non constant
3127 assert(int_cnt2 == -1, "should be -1 (non-constant substring)");
3128
3129 addl(tmp, cnt2);
3130 // Found result if we matched whole substring.
3131 cmpl(tmp, stride);
3132 jcc(Assembler::lessEqual, RET_FOUND);
3133
3134 // Repeat search for small substring (<= 8 chars)
3135 // from new point 'str1' without reloading substring.
3136 cmpl(cnt2, stride);
3137 // Have to check that we don't read beyond string.
3138 jccb(Assembler::lessEqual, ADJUST_STR);
3139
3140 Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
3141 // Compare the rest of substring (> 8 chars).
3142 movptr(str1, result);
3143
3144 cmpl(tmp, cnt2);
3145 // First 8 chars are already matched.
3146 jccb(Assembler::equal, CHECK_NEXT);
3147
3148 bind(SCAN_SUBSTR);
3149 pcmpestri(vec, Address(str1, 0), mode);
3150 // Need to reload string pointers if we did not match the whole vector
3151 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3152
3153 bind(CHECK_NEXT);
3154 subl(cnt2, stride);
3155 jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
3156 addptr(str1, 16);
3157 if (ae == StrIntrinsicNode::UL) {
3158 addptr(str2, 8);
3159 } else {
3160 addptr(str2, 16);
3161 }
3162 subl(cnt1, stride);
3163 cmpl(cnt2, stride); // Do not read beyond substring
3164 jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
3165 // Back-up strings to avoid reading beyond substring.
3166
3167 if (ae == StrIntrinsicNode::UL) {
3168 lea(str2, Address(str2, cnt2, scale2, -8));
3169 lea(str1, Address(str1, cnt2, scale1, -16));
3170 } else {
3171 lea(str2, Address(str2, cnt2, scale2, -16));
3172 lea(str1, Address(str1, cnt2, scale1, -16));
3173 }
3174 subl(cnt1, cnt2);
3175 movl(cnt2, stride);
3176 addl(cnt1, stride);
3177 bind(CONT_SCAN_SUBSTR);
3178 if (ae == StrIntrinsicNode::UL) {
3179 pmovzxbw(vec, Address(str2, 0));
3180 } else {
3181 movdqu(vec, Address(str2, 0));
3182 }
3183 jmp(SCAN_SUBSTR);
3184
3185 bind(RET_FOUND_LONG);
3186 movptr(str1, Address(rsp, wordSize));
3187 } // non constant
3188
3189 bind(RET_FOUND);
3190 // Compute substr offset
3191 subptr(result, str1);
3192 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3193 shrl(result, 1); // index
3194 }
3195 bind(CLEANUP);
3196 pop(rsp); // restore SP
3197
3198 } // string_indexof
3199
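// Find the first occurrence of the UTF-16 code unit 'ch' in the char sequence
// of length cnt1 starting at str1. Result is the char index, or -1 if the
// character is not present. Roughly equivalent Java pseudocode (illustrative
// sketch only, not library source):
//   for (int i = 0; i < cnt1; i++) {
//     if (str1[i] == ch) return i;
//   }
//   return -1;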
3200 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3201 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3202 ShortBranchVerifier sbv(this);
3203 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3204
3205 int stride = 8;
3206
3207 Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
3208 SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
3209 RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
3210 FOUND_SEQ_CHAR, DONE_LABEL;
3211
3212 movptr(result, str1);
3213 if (UseAVX >= 2) {
3214 cmpl(cnt1, stride);
3215 jcc(Assembler::less, SCAN_TO_CHAR);
3216 cmpl(cnt1, 2*stride);
3217 jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
3218 movdl(vec1, ch);
3219 vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
3220 vpxor(vec2, vec2);
3221 movl(tmp, cnt1);
3222 andl(tmp, 0xFFFFFFF0); //vector count (in chars)
3223 andl(cnt1,0x0000000F); //tail count (in chars)
3224
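// 16 chars (32 bytes) per iteration: vec1 holds 'ch' broadcast to all 16 word
// lanes and vec2 is kept all-zero. vptest(vec2, vec3) sets CF only when vec3
// is entirely zero, so carryClear below means at least one lane of the
// vpcmpeqw result is set, i.e. the character occurs in this chunk.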
3225 bind(SCAN_TO_16_CHAR_LOOP);
3226 vmovdqu(vec3, Address(result, 0));
3227 vpcmpeqw(vec3, vec3, vec1, 1);
3228 vptest(vec2, vec3);
3229 jcc(Assembler::carryClear, FOUND_CHAR);
3230 addptr(result, 32);
3231 subl(tmp, 2*stride);
3232 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
3233 jmp(SCAN_TO_8_CHAR);
3234 bind(SCAN_TO_8_CHAR_INIT);
3235 movdl(vec1, ch);
3236 pshuflw(vec1, vec1, 0x00);
3237 pshufd(vec1, vec1, 0);
3238 pxor(vec2, vec2);
3239 }
3240 bind(SCAN_TO_8_CHAR);
3241 cmpl(cnt1, stride);
3242 jcc(Assembler::less, SCAN_TO_CHAR);
3243 if (UseAVX < 2) {
3244 movdl(vec1, ch);
3245 pshuflw(vec1, vec1, 0x00);
3246 pshufd(vec1, vec1, 0);
3247 pxor(vec2, vec2);
3248 }
3249 movl(tmp, cnt1);
3250 andl(tmp, 0xFFFFFFF8); //vector count (in chars)
3251 andl(cnt1,0x00000007); //tail count (in chars)
3252
3253 bind(SCAN_TO_8_CHAR_LOOP);
3254 movdqu(vec3, Address(result, 0));
3255 pcmpeqw(vec3, vec1);
3256 ptest(vec2, vec3);
3257 jcc(Assembler::carryClear, FOUND_CHAR);
3258 addptr(result, 16);
3259 subl(tmp, stride);
3260 jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
3261 bind(SCAN_TO_CHAR);
3262 testl(cnt1, cnt1);
3263 jcc(Assembler::zero, RET_NOT_FOUND);
3264 bind(SCAN_TO_CHAR_LOOP);
3265 load_unsigned_short(tmp, Address(result, 0));
3266 cmpl(ch, tmp);
3267 jccb(Assembler::equal, FOUND_SEQ_CHAR);
3268 addptr(result, 2);
3269 subl(cnt1, 1);
3270 jccb(Assembler::zero, RET_NOT_FOUND);
3271 jmp(SCAN_TO_CHAR_LOOP);
3272
3273 bind(RET_NOT_FOUND);
3274 movl(result, -1);
3275 jmpb(DONE_LABEL);
3276
3277 bind(FOUND_CHAR);
3278 if (UseAVX >= 2) {
3279 vpmovmskb(tmp, vec3);
3280 } else {
3281 pmovmskb(tmp, vec3);
3282 }
3283 bsfl(ch, tmp);
3284 addptr(result, ch);
3285
3286 bind(FOUND_SEQ_CHAR);
3287 subptr(result, str1);
3288 shrl(result, 1);
3289
3290 bind(DONE_LABEL);
3291 } // string_indexof_char
3292
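// Latin-1 variant of string_indexof_char: find the first occurrence of the
// byte value 'ch' in the byte sequence of length cnt1 starting at str1.
// Result is the byte index, or -1 if not found. Illustrative Java-style
// sketch only:
//   for (int i = 0; i < cnt1; i++) {
//     if (str1[i] == ch) return i;
//   }
//   return -1;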
3293 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3294 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3295 ShortBranchVerifier sbv(this);
3296 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3297
3298 int stride = 16;
3299
3300 Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
3301 SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
3302 RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
3303 FOUND_SEQ_CHAR, DONE_LABEL;
3304
3305 movptr(result, str1);
3306 if (UseAVX >= 2) {
3307 cmpl(cnt1, stride);
3308 jcc(Assembler::less, SCAN_TO_CHAR_INIT);
3309 cmpl(cnt1, stride*2);
3310 jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
3311 movdl(vec1, ch);
3312 vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
3313 vpxor(vec2, vec2);
3314 movl(tmp, cnt1);
3315 andl(tmp, 0xFFFFFFE0); //vector count (in chars)
3316 andl(cnt1,0x0000001F); //tail count (in chars)
3317
3318 bind(SCAN_TO_32_CHAR_LOOP);
3319 vmovdqu(vec3, Address(result, 0));
3320 vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
3321 vptest(vec2, vec3);
3322 jcc(Assembler::carryClear, FOUND_CHAR);
3323 addptr(result, 32);
3324 subl(tmp, stride*2);
3325 jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
3326 jmp(SCAN_TO_16_CHAR);
3327
3328 bind(SCAN_TO_16_CHAR_INIT);
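// Broadcast the search byte to all 16 byte lanes of vec1: pshufb with an
// all-zero shuffle mask (vec2) replicates byte 0 of vec1 into every position.
// vec2 stays zero and doubles as the all-zero operand for ptest below.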
3329 movdl(vec1, ch);
3330 pxor(vec2, vec2);
3331 pshufb(vec1, vec2);
3332 }
3333
3334 bind(SCAN_TO_16_CHAR);
3335 cmpl(cnt1, stride);
3336 jcc(Assembler::less, SCAN_TO_CHAR_INIT); // less than 16 entries left
3337 if (UseAVX < 2) {
3338 movdl(vec1, ch);
3339 pxor(vec2, vec2);
3340 pshufb(vec1, vec2);
3341 }
3342 movl(tmp, cnt1);
3343 andl(tmp, 0xFFFFFFF0); //vector count (in bytes)
3344 andl(cnt1,0x0000000F); //tail count (in bytes)
3345
3346 bind(SCAN_TO_16_CHAR_LOOP);
3347 movdqu(vec3, Address(result, 0));
3348 pcmpeqb(vec3, vec1);
3349 ptest(vec2, vec3);
3350 jcc(Assembler::carryClear, FOUND_CHAR);
3351 addptr(result, 16);
3352 subl(tmp, stride);
3353 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); // last 16 items...
3354
3355 bind(SCAN_TO_CHAR_INIT);
3356 testl(cnt1, cnt1);
3357 jcc(Assembler::zero, RET_NOT_FOUND);
3358 bind(SCAN_TO_CHAR_LOOP);
3359 load_unsigned_byte(tmp, Address(result, 0));
3360 cmpl(ch, tmp);
3361 jccb(Assembler::equal, FOUND_SEQ_CHAR);
3362 addptr(result, 1);
3363 subl(cnt1, 1);
3364 jccb(Assembler::zero, RET_NOT_FOUND);
3365 jmp(SCAN_TO_CHAR_LOOP);
3366
3367 bind(RET_NOT_FOUND);
3368 movl(result, -1);
3369 jmpb(DONE_LABEL);
3370
3371 bind(FOUND_CHAR);
3372 if (UseAVX >= 2) {
3373 vpmovmskb(tmp, vec3);
3374 } else {
3375 pmovmskb(tmp, vec3);
3376 }
3377 bsfl(ch, tmp);
3378 addptr(result, ch);
3379
3380 bind(FOUND_SEQ_CHAR);
3381 subptr(result, str1);
3382
3383 bind(DONE_LABEL);
3384 } // stringL_indexof_char
3385
3386 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
3387 switch (eltype) {
3388 case T_BOOLEAN: return sizeof(jboolean);
3389 case T_BYTE: return sizeof(jbyte);
3390 case T_SHORT: return sizeof(jshort);
3391 case T_CHAR: return sizeof(jchar);
3392 case T_INT: return sizeof(jint);
3393 default:
3394 ShouldNotReachHere();
3395 return -1;
3396 }
3397 }
3398
3399 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
3400 switch (eltype) {
3401 // T_BOOLEAN used as surrogate for unsigned byte
3402 case T_BOOLEAN: movzbl(dst, src); break;
3403 case T_BYTE: movsbl(dst, src); break;
3404 case T_SHORT: movswl(dst, src); break;
3405 case T_CHAR: movzwl(dst, src); break;
3406 case T_INT: movl(dst, src); break;
3407 default:
3408 ShouldNotReachHere();
3409 }
3410 }
3411
3412 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) {
3413 load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3414 }
3415
3416 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) {
3417 load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3418 }
3419
3420 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) {
3421 const int vlen = Assembler::AVX_256bit;
3422 switch (eltype) {
3423 case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT); break;
3424 case T_BYTE: vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT); break;
3425 case T_SHORT: vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3426 case T_CHAR: vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3427 case T_INT:
3428 // do nothing
3429 break;
3430 default:
3431 ShouldNotReachHere();
3432 }
3433 }
3434
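// Computes the standard polynomial hash over the array elements, seeded with
// the incoming value of 'result': for each element e, result = 31*result + e.
// Illustrative Java-style sketch of the scalar semantics (not the library
// source):
//   int h = result;
//   for (int i = 0; i < cnt1; i++) {
//     h = 31 * h + ary1[i];
//   }
//   result = h;
// The vectorized path below consumes 32 elements per iteration in four 8-lane
// accumulators and folds in precomputed powers of 31 at the end.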
3435 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result,
3436 Register index, Register tmp2, Register tmp3, XMMRegister vnext,
3437 XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
3438 XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
3439 XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
3440 BasicType eltype) {
3441 ShortBranchVerifier sbv(this);
3442 assert(UseAVX >= 2, "AVX2 intrinsics are required");
3443 assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3);
3444 assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3);
3445
3446 Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN,
3447 SHORT_UNROLLED_LOOP_EXIT,
3448 UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME,
3449 UNROLLED_VECTOR_LOOP_BEGIN,
3450 END;
3451 switch (eltype) {
3452 case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
3453 case T_CHAR: BLOCK_COMMENT("arrays_hashcode(char) {"); break;
3454 case T_BYTE: BLOCK_COMMENT("arrays_hashcode(byte) {"); break;
3455 case T_SHORT: BLOCK_COMMENT("arrays_hashcode(short) {"); break;
3456 case T_INT: BLOCK_COMMENT("arrays_hashcode(int) {"); break;
3457 default: BLOCK_COMMENT("arrays_hashcode {"); break;
3458 }
3459
3460 // Register aliases ("renaming") for readability of the code
3461 const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
3462 vresult[] = { vresult0, vresult1, vresult2, vresult3 },
3463 vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };
3464
3465 const int elsize = arrays_hashcode_elsize(eltype);
3466
3467 /*
3468 if (cnt1 >= 2) {
3469 if (cnt1 >= 32) {
3470 UNROLLED VECTOR LOOP
3471 }
3472 UNROLLED SCALAR LOOP
3473 }
3474 SINGLE SCALAR
3475 */
3476
3477 cmpl(cnt1, 32);
3478 jcc(Assembler::less, SHORT_UNROLLED_BEGIN);
3479
3480 // cnt1 >= 32 && generate_vectorized_loop
3481 xorl(index, index);
3482
3483 // vresult = IntVector.zero(I256);
3484 for (int idx = 0; idx < 4; idx++) {
3485 vpxor(vresult[idx], vresult[idx]);
3486 }
3487 // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
3488 Register bound = tmp2;
3489 Register next = tmp3;
3490 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
3491 movl(next, Address(tmp2, 0));
3492 movdl(vnext, next);
3493 vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);
3494
3495 // index = 0;
3496 // bound = cnt1 & ~(32 - 1);
3497 movl(bound, cnt1);
3498 andl(bound, ~(32 - 1));
3499 // for (; index < bound; index += 32) {
3500 bind(UNROLLED_VECTOR_LOOP_BEGIN);
3501 // result *= next;
3502 imull(result, next);
3503 // Loop fission to front-load the cost of fetching from memory; OOO execution
3504 // can then hopefully do a better job of prefetching.
3505 for (int idx = 0; idx < 4; idx++) {
3506 arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
3507 }
3508 // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
3509 for (int idx = 0; idx < 4; idx++) {
3510 vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
3511 arrays_hashcode_elvcast(vtmp[idx], eltype);
3512 vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
3513 }
3514 // index += 32;
3515 addl(index, 32);
3516 // index < bound;
3517 cmpl(index, bound);
3518 jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
3519 // }
3520
3521 lea(ary1, Address(ary1, bound, Address::times(elsize)));
3522 subl(cnt1, bound);
3523 // release bound
3524
3525 // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
3526 for (int idx = 0; idx < 4; idx++) {
3527 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint))));
3528 arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT);
3529 vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
3530 }
3531 // result += vresult.reduceLanes(ADD);
3532 for (int idx = 0; idx < 4; idx++) {
3533 reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
3534 }
3535
3536 // } else if (cnt1 < 32) {
3537
3538 bind(SHORT_UNROLLED_BEGIN);
3539 // int i = 1;
3540 movl(index, 1);
3541 cmpl(index, cnt1);
3542 jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);
3543
3544 // for (; i < cnt1 ; i += 2) {
3545 bind(SHORT_UNROLLED_LOOP_BEGIN);
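// Two unrolled iterations of h = 31*h + a[i]:
//   h = 961*h + 31*a[i-1] + a[i]      (961 == 31*31)
// where 31*x is computed as (x << 5) - x to avoid a second multiply.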
3546 movl(tmp3, 961);
3547 imull(result, tmp3);
3548 arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3549 movl(tmp3, tmp2);
3550 shll(tmp3, 5);
3551 subl(tmp3, tmp2);
3552 addl(result, tmp3);
3553 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype);
3554 addl(result, tmp3);
3555 addl(index, 2);
3556 cmpl(index, cnt1);
3557 jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN);
3558
3559 // }
3560 // if (i >= cnt1) {
3561 bind(SHORT_UNROLLED_LOOP_EXIT);
3562 jccb(Assembler::greater, END);
3563 movl(tmp2, result);
3564 shll(result, 5);
3565 subl(result, tmp2);
3566 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3567 addl(result, tmp3);
3568 // }
3569 bind(END);
3570
3571 BLOCK_COMMENT("} // arrays_hashcode");
3572
3573 } // arrays_hashcode
3574
3575 // helper function for string_compare
3576 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3577 Address::ScaleFactor scale, Address::ScaleFactor scale1,
3578 Address::ScaleFactor scale2, Register index, int ae) {
3579 if (ae == StrIntrinsicNode::LL) {
3580 load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3581 load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3582 } else if (ae == StrIntrinsicNode::UU) {
3583 load_unsigned_short(elem1, Address(str1, index, scale, 0));
3584 load_unsigned_short(elem2, Address(str2, index, scale, 0));
3585 } else {
3586 load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3587 load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3588 }
3589 }
3590
3591 // Compare strings, used for char[] and byte[].
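// The result follows String.compareTo semantics: the difference between the
// first pair of elements that differ, or the difference of the lengths (in
// chars) when one string is a prefix of the other. Illustrative sketch:
//   int min = Math.min(cnt1, cnt2);
//   for (int i = 0; i < min; i++) {
//     if (str1[i] != str2[i]) return str1[i] - str2[i];
//   }
//   return cnt1 - cnt2;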
3592 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3593 Register cnt1, Register cnt2, Register result,
3594 XMMRegister vec1, int ae, KRegister mask) {
3595 ShortBranchVerifier sbv(this);
3596 Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
3597 Label COMPARE_WIDE_VECTORS_LOOP_FAILED; // used only AVX3
3598 int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3599 int stride2x2 = 0x40;
3600 Address::ScaleFactor scale = Address::no_scale;
3601 Address::ScaleFactor scale1 = Address::no_scale;
3602 Address::ScaleFactor scale2 = Address::no_scale;
3603
3604 if (ae != StrIntrinsicNode::LL) {
3605 stride2x2 = 0x20;
3606 }
3607
3608 if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3609 shrl(cnt2, 1);
3610 }
3611 // Compute the minimum of the string lengths and push the
3612 // difference of the string lengths onto the stack.
3613 // The minimum is selected with a conditional move.
3614 movl(result, cnt1);
3615 subl(cnt1, cnt2);
3616 push(cnt1);
3617 cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2)
3618
3619 // Is the minimum length zero?
3620 testl(cnt2, cnt2);
3621 jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3622 if (ae == StrIntrinsicNode::LL) {
3623 // Load first bytes
3624 load_unsigned_byte(result, Address(str1, 0)); // result = str1[0]
3625 load_unsigned_byte(cnt1, Address(str2, 0)); // cnt1 = str2[0]
3626 } else if (ae == StrIntrinsicNode::UU) {
3627 // Load first characters
3628 load_unsigned_short(result, Address(str1, 0));
3629 load_unsigned_short(cnt1, Address(str2, 0));
3630 } else {
3631 load_unsigned_byte(result, Address(str1, 0));
3632 load_unsigned_short(cnt1, Address(str2, 0));
3633 }
3634 subl(result, cnt1);
3635 jcc(Assembler::notZero, POP_LABEL);
3636
3637 if (ae == StrIntrinsicNode::UU) {
3638 // Divide length by 2 to get number of chars
3639 shrl(cnt2, 1);
3640 }
3641 cmpl(cnt2, 1);
3642 jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3643
3644 // Check if the strings start at the same location and setup scale and stride
3645 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3646 cmpptr(str1, str2);
3647 jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3648 if (ae == StrIntrinsicNode::LL) {
3649 scale = Address::times_1;
3650 stride = 16;
3651 } else {
3652 scale = Address::times_2;
3653 stride = 8;
3654 }
3655 } else {
3656 scale1 = Address::times_1;
3657 scale2 = Address::times_2;
3658 // scale not used
3659 stride = 8;
3660 }
3661
3662 if (UseAVX >= 2 && UseSSE42Intrinsics) {
3663 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3664 Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3665 Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3666 Label COMPARE_TAIL_LONG;
3667 Label COMPARE_WIDE_VECTORS_LOOP_AVX3; // used only AVX3
3668
3669 int pcmpmask = 0x19;
3670 if (ae == StrIntrinsicNode::LL) {
3671 pcmpmask &= ~0x01;
3672 }
3673
3674 // Set up to compare 16-char (32-byte) vectors,
3675 // starting from the first character again because it has an aligned address.
3676 if (ae == StrIntrinsicNode::LL) {
3677 stride2 = 32;
3678 } else {
3679 stride2 = 16;
3680 }
3681 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3682 adr_stride = stride << scale;
3683 } else {
3684 adr_stride1 = 8; //stride << scale1;
3685 adr_stride2 = 16; //stride << scale2;
3686 }
3687
3688 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3689 // rax and rdx are used by pcmpestri as element counters
3690 movl(result, cnt2);
3691 andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count
3692 jcc(Assembler::zero, COMPARE_TAIL_LONG);
3693
3694 // Fast path: compare the first two 8-char vectors.
3695 bind(COMPARE_16_CHARS);
3696 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3697 movdqu(vec1, Address(str1, 0));
3698 } else {
3699 pmovzxbw(vec1, Address(str1, 0));
3700 }
3701 pcmpestri(vec1, Address(str2, 0), pcmpmask);
3702 jccb(Assembler::below, COMPARE_INDEX_CHAR);
3703
3704 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3705 movdqu(vec1, Address(str1, adr_stride));
3706 pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3707 } else {
3708 pmovzxbw(vec1, Address(str1, adr_stride1));
3709 pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3710 }
3711 jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3712 addl(cnt1, stride);
3713
3714 // Compare the characters at index in cnt1
3715 bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3716 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3717 subl(result, cnt2);
3718 jmp(POP_LABEL);
3719
3720 // Setup the registers to start vector comparison loop
3721 bind(COMPARE_WIDE_VECTORS);
3722 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3723 lea(str1, Address(str1, result, scale));
3724 lea(str2, Address(str2, result, scale));
3725 } else {
3726 lea(str1, Address(str1, result, scale1));
3727 lea(str2, Address(str2, result, scale2));
3728 }
3729 subl(result, stride2);
3730 subl(cnt2, stride2);
3731 jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3732 negptr(result);
3733
3734 // In a loop, compare 16 chars (32 bytes) at once using vpxor+vptest
3735 bind(COMPARE_WIDE_VECTORS_LOOP);
3736
3737 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3738 cmpl(cnt2, stride2x2);
3739 jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3740 testl(cnt2, stride2x2-1); // cnt2 holds the vector count
3741 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // means we cannot subtract by 0x40
3742
3743 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3744 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3745 evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
3746 evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3747 } else {
3748 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
3749 evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3750 }
3751 kortestql(mask, mask);
3752 jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare
3753 addptr(result, stride2x2); // update since we already compared at this addr
3754 subl(cnt2, stride2x2); // and sub the size too
3755 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3756
3757 vpxor(vec1, vec1);
3758 jmpb(COMPARE_WIDE_TAIL);
3759 }//if (VM_Version::supports_avx512vlbw())
3760
3761 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3762 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3763 vmovdqu(vec1, Address(str1, result, scale));
3764 vpxor(vec1, Address(str2, result, scale));
3765 } else {
3766 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3767 vpxor(vec1, Address(str2, result, scale2));
3768 }
3769 vptest(vec1, vec1);
3770 jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3771 addptr(result, stride2);
3772 subl(cnt2, stride2);
3773 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3774 // clean upper bits of YMM registers
3775 vpxor(vec1, vec1);
3776
3777 // compare wide vectors tail
3778 bind(COMPARE_WIDE_TAIL);
3779 testptr(result, result);
3780 jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3781
3782 movl(result, stride2);
3783 movl(cnt2, result);
3784 negptr(result);
3785 jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3786
3787 // Identify the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
3788 bind(VECTOR_NOT_EQUAL);
3789 // clean upper bits of YMM registers
3790 vpxor(vec1, vec1);
3791 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3792 lea(str1, Address(str1, result, scale));
3793 lea(str2, Address(str2, result, scale));
3794 } else {
3795 lea(str1, Address(str1, result, scale1));
3796 lea(str2, Address(str2, result, scale2));
3797 }
3798 jmp(COMPARE_16_CHARS);
3799
3800 // Compare tail chars, length between 1 and 15 chars
3801 bind(COMPARE_TAIL_LONG);
3802 movl(cnt2, result);
3803 cmpl(cnt2, stride);
3804 jcc(Assembler::less, COMPARE_SMALL_STR);
3805
3806 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3807 movdqu(vec1, Address(str1, 0));
3808 } else {
3809 pmovzxbw(vec1, Address(str1, 0));
3810 }
3811 pcmpestri(vec1, Address(str2, 0), pcmpmask);
3812 jcc(Assembler::below, COMPARE_INDEX_CHAR);
3813 subptr(cnt2, stride);
3814 jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3815 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3816 lea(str1, Address(str1, result, scale));
3817 lea(str2, Address(str2, result, scale));
3818 } else {
3819 lea(str1, Address(str1, result, scale1));
3820 lea(str2, Address(str2, result, scale2));
3821 }
3822 negptr(cnt2);
3823 jmpb(WHILE_HEAD_LABEL);
3824
3825 bind(COMPARE_SMALL_STR);
3826 } else if (UseSSE42Intrinsics) {
3827 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3828 int pcmpmask = 0x19;
3829 // Set up to compare 8-char (16-byte) vectors,
3830 // starting from the first character again because it has an aligned address.
3831 movl(result, cnt2);
3832 andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count
3833 if (ae == StrIntrinsicNode::LL) {
3834 pcmpmask &= ~0x01;
3835 }
3836 jcc(Assembler::zero, COMPARE_TAIL);
3837 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3838 lea(str1, Address(str1, result, scale));
3839 lea(str2, Address(str2, result, scale));
3840 } else {
3841 lea(str1, Address(str1, result, scale1));
3842 lea(str2, Address(str2, result, scale2));
3843 }
3844 negptr(result);
3845
3846 // pcmpestri
3847 // inputs:
3848 // vec1 - substring
3849 // rax - negative string length (elements count)
3850 // mem - scanned string
3851 // rdx - string length (elements count)
3852 // pcmpmask - cmp mode: 11000 (string compare with negated result)
3853 // + 00 (unsigned bytes) or + 01 (unsigned shorts)
3854 // outputs:
3855 // rcx - first mismatched element index
3856 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3857
3858 bind(COMPARE_WIDE_VECTORS);
3859 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3860 movdqu(vec1, Address(str1, result, scale));
3861 pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3862 } else {
3863 pmovzxbw(vec1, Address(str1, result, scale1));
3864 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3865 }
3866 // After pcmpestri cnt1(rcx) contains mismatched element index
3867
3868 jccb(Assembler::below, VECTOR_NOT_EQUAL); // CF==1
3869 addptr(result, stride);
3870 subptr(cnt2, stride);
3871 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3872
3873 // compare wide vectors tail
3874 testptr(result, result);
3875 jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3876
3877 movl(cnt2, stride);
3878 movl(result, stride);
3879 negptr(result);
3880 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3881 movdqu(vec1, Address(str1, result, scale));
3882 pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3883 } else {
3884 pmovzxbw(vec1, Address(str1, result, scale1));
3885 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3886 }
3887 jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
3888
3889 // Mismatched characters in the vectors
3890 bind(VECTOR_NOT_EQUAL);
3891 addptr(cnt1, result);
3892 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3893 subl(result, cnt2);
3894 jmpb(POP_LABEL);
3895
3896 bind(COMPARE_TAIL); // limit is zero
3897 movl(cnt2, result);
3898 // Fallthru to tail compare
3899 }
3900 // Shift str2 and str1 to the end of the arrays, negate min
3901 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3902 lea(str1, Address(str1, cnt2, scale));
3903 lea(str2, Address(str2, cnt2, scale));
3904 } else {
3905 lea(str1, Address(str1, cnt2, scale1));
3906 lea(str2, Address(str2, cnt2, scale2));
3907 }
3908 decrementl(cnt2); // first character was compared already
3909 negptr(cnt2);
3910
3911 // Compare the rest of the elements
3912 bind(WHILE_HEAD_LABEL);
3913 load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
3914 subl(result, cnt1);
3915 jccb(Assembler::notZero, POP_LABEL);
3916 increment(cnt2);
3917 jccb(Assembler::notZero, WHILE_HEAD_LABEL);
3918
3919 // Strings are equal up to min length. Return the length difference.
3920 bind(LENGTH_DIFF_LABEL);
3921 pop(result);
3922 if (ae == StrIntrinsicNode::UU) {
3923 // Divide diff by 2 to get number of chars
3924 sarl(result, 1);
3925 }
3926 jmpb(DONE_LABEL);
3927
3928 if (VM_Version::supports_avx512vlbw()) {
3929
3930 bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
3931
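// 'mask' has a 1 for every byte position that compared equal, so the inverted
// mask has a 1 at each mismatch; bsf then yields the offset of the first
// mismatching byte in the 64-byte block (halved below for 2-byte elements).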
3932 kmovql(cnt1, mask);
3933 notq(cnt1);
3934 bsfq(cnt2, cnt1);
3935 if (ae != StrIntrinsicNode::LL) {
3936 // Divide diff by 2 to get number of chars
3937 sarl(cnt2, 1);
3938 }
3939 addq(result, cnt2);
3940 if (ae == StrIntrinsicNode::LL) {
3941 load_unsigned_byte(cnt1, Address(str2, result));
3942 load_unsigned_byte(result, Address(str1, result));
3943 } else if (ae == StrIntrinsicNode::UU) {
3944 load_unsigned_short(cnt1, Address(str2, result, scale));
3945 load_unsigned_short(result, Address(str1, result, scale));
3946 } else {
3947 load_unsigned_short(cnt1, Address(str2, result, scale2));
3948 load_unsigned_byte(result, Address(str1, result, scale1));
3949 }
3950 subl(result, cnt1);
3951 jmpb(POP_LABEL);
3952 }//if (VM_Version::supports_avx512vlbw())
3953
3954 // Discard the stored length difference
3955 bind(POP_LABEL);
3956 pop(cnt1);
3957
3958 // That's it
3959 bind(DONE_LABEL);
3960 if(ae == StrIntrinsicNode::UL) {
3961 negl(result);
3962 }
3963
3964 }
3965
3966 // Search for a non-ASCII character (negative byte value) in a byte array;
3967 // return the index of the first such character, otherwise the length
3968 // of the array segment searched.
3969 // ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
3970 // @IntrinsicCandidate
3971 // public static int countPositives(byte[] ba, int off, int len) {
3972 // for (int i = off; i < off + len; i++) {
3973 // if (ba[i] < 0) {
3974 // return i - off;
3975 // }
3976 // }
3977 // return len;
3978 // }
3979 void C2_MacroAssembler::count_positives(Register ary1, Register len,
3980 Register result, Register tmp1,
3981 XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
3982 // rsi: byte array
3983 // rcx: len
3984 // rax: result
3985 ShortBranchVerifier sbv(this);
3986 assert_different_registers(ary1, len, result, tmp1);
3987 assert_different_registers(vec1, vec2);
3988 Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
3989
3990 movl(result, len); // copy
3991 // len == 0
3992 testl(len, len);
3993 jcc(Assembler::zero, DONE);
3994
3995 if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
3996 VM_Version::supports_avx512vlbw() &&
3997 VM_Version::supports_bmi2()) {
3998
3999 Label test_64_loop, test_tail, BREAK_LOOP;
4000 movl(tmp1, len);
4001 vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
4002
4003 andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F
4004 andl(len, 0xffffffc0); // vector count (in chars)
4005 jccb(Assembler::zero, test_tail);
4006
4007 lea(ary1, Address(ary1, len, Address::times_1));
4008 negptr(len);
4009
4010 bind(test_64_loop);
4011 // Check whether our 64 elements of size byte contain negatives
4012 evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
4013 kortestql(mask1, mask1);
4014 jcc(Assembler::notZero, BREAK_LOOP);
4015
4016 addptr(len, 64);
4017 jccb(Assembler::notZero, test_64_loop);
4018
4019 bind(test_tail);
4020 // bail out when there is nothing to be done
4021 testl(tmp1, -1);
4022 jcc(Assembler::zero, DONE);
4023
4024
4025 // check the tail for absence of negatives
4026 // ~(~0 << len) applied up to two times (for 32-bit scenario)
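// Example (assuming tmp1 == 5): ~(~0 << 5) == 0b11111, i.e. a k-mask that
// selects only the 5 remaining tail bytes for the masked compare below.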
4027 {
4028 Register tmp3_aliased = len;
4029 mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
4030 shlxq(tmp3_aliased, tmp3_aliased, tmp1);
4031 notq(tmp3_aliased);
4032 kmovql(mask2, tmp3_aliased);
4033 }
4034
4035 evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
4036 ktestq(mask1, mask2);
4037 jcc(Assembler::zero, DONE);
4038
4039 // do a full check for negative bytes in the tail
4040 movl(len, tmp1); // tmp1 holds low 6-bit from original len;
4041 // ary1 already pointing to the right place
4042 jmpb(TAIL_START);
4043
4044 bind(BREAK_LOOP);
4045 // At least one byte in the last 64 byte block was negative.
4046 // Set up to look at the last 64 bytes as if they were a tail
4047 lea(ary1, Address(ary1, len, Address::times_1));
4048 addptr(result, len);
4049 // Ignore the very last byte: if all others are positive,
4050 // it must be negative, so we can skip right to the 2+1 byte
4051 // end comparison at this point
4052 orl(result, 63);
4053 movl(len, 63);
4054 // Fallthru to tail compare
4055 } else {
4056
4057 if (UseAVX >= 2) {
4058 // With AVX2, use 32-byte vector compare
4059 Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4060
4061 // Compare 32-byte vectors
4062 testl(len, 0xffffffe0); // vector count (in bytes)
4063 jccb(Assembler::zero, TAIL_START);
4064
4065 andl(len, 0xffffffe0);
4066 lea(ary1, Address(ary1, len, Address::times_1));
4067 negptr(len);
4068
4069 movl(tmp1, 0x80808080); // create mask to test for negative bytes (sign bit set) in vector
4070 movdl(vec2, tmp1);
4071 vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
4072
4073 bind(COMPARE_WIDE_VECTORS);
4074 vmovdqu(vec1, Address(ary1, len, Address::times_1));
4075 vptest(vec1, vec2);
4076 jccb(Assembler::notZero, BREAK_LOOP);
4077 addptr(len, 32);
4078 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4079
4080 testl(result, 0x0000001f); // any bytes remaining?
4081 jcc(Assembler::zero, DONE);
4082
4083 // Quick test using the already prepared vector mask
4084 movl(len, result);
4085 andl(len, 0x0000001f);
4086 vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
4087 vptest(vec1, vec2);
4088 jcc(Assembler::zero, DONE);
4089 // There are zeros, jump to the tail to determine exactly where
4090 jmpb(TAIL_START);
4091
4092 bind(BREAK_LOOP);
4093 // At least one byte in the last 32-byte vector is negative.
4094 // Set up to look at the last 32 bytes as if they were a tail
4095 lea(ary1, Address(ary1, len, Address::times_1));
4096 addptr(result, len);
4097 // Ignore the very last byte: if all others are positive,
4098 // it must be negative, so we can skip right to the 2+1 byte
4099 // end comparison at this point
4100 orl(result, 31);
4101 movl(len, 31);
4102 // Fallthru to tail compare
4103 } else if (UseSSE42Intrinsics) {
4104 // With SSE4.2, use double quad vector compare
4105 Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4106
4107 // Compare 16-byte vectors
4108 testl(len, 0xfffffff0); // vector count (in bytes)
4109 jcc(Assembler::zero, TAIL_START);
4110
4111 andl(len, 0xfffffff0);
4112 lea(ary1, Address(ary1, len, Address::times_1));
4113 negptr(len);
4114
4115 movl(tmp1, 0x80808080);
4116 movdl(vec2, tmp1);
4117 pshufd(vec2, vec2, 0);
4118
4119 bind(COMPARE_WIDE_VECTORS);
4120 movdqu(vec1, Address(ary1, len, Address::times_1));
4121 ptest(vec1, vec2);
4122 jccb(Assembler::notZero, BREAK_LOOP);
4123 addptr(len, 16);
4124 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4125
4126 testl(result, 0x0000000f); // len is zero, any bytes remaining?
4127 jcc(Assembler::zero, DONE);
4128
4129 // Quick test using the already prepared vector mask
4130 movl(len, result);
4131 andl(len, 0x0000000f); // tail count (in bytes)
4132 movdqu(vec1, Address(ary1, len, Address::times_1, -16));
4133 ptest(vec1, vec2);
4134 jcc(Assembler::zero, DONE);
4135 jmpb(TAIL_START);
4136
4137 bind(BREAK_LOOP);
4138 // At least one byte in the last 16-byte vector is negative.
4139 // Set up and look at the last 16 bytes as if they were a tail
4140 lea(ary1, Address(ary1, len, Address::times_1));
4141 addptr(result, len);
4142 // Ignore the very last byte: if all others are positive,
4143 // it must be negative, so we can skip right to the 2+1 byte
4144 // end comparison at this point
4145 orl(result, 15);
4146 movl(len, 15);
4147 // Fallthru to tail compare
4148 }
4149 }
4150
4151 bind(TAIL_START);
4152 // Compare 4-byte vectors
4153 andl(len, 0xfffffffc); // vector count (in bytes)
4154 jccb(Assembler::zero, COMPARE_CHAR);
4155
4156 lea(ary1, Address(ary1, len, Address::times_1));
4157 negptr(len);
4158
4159 bind(COMPARE_VECTORS);
4160 movl(tmp1, Address(ary1, len, Address::times_1));
4161 andl(tmp1, 0x80808080);
4162 jccb(Assembler::notZero, TAIL_ADJUST);
4163 addptr(len, 4);
4164 jccb(Assembler::notZero, COMPARE_VECTORS);
4165
4166 // Compare trailing char (final 2-3 bytes), if any
4167 bind(COMPARE_CHAR);
4168
4169 testl(result, 0x2); // tail char
4170 jccb(Assembler::zero, COMPARE_BYTE);
4171 load_unsigned_short(tmp1, Address(ary1, 0));
4172 andl(tmp1, 0x00008080);
4173 jccb(Assembler::notZero, CHAR_ADJUST);
4174 lea(ary1, Address(ary1, 2));
4175
4176 bind(COMPARE_BYTE);
4177 testl(result, 0x1); // tail byte
4178 jccb(Assembler::zero, DONE);
4179 load_unsigned_byte(tmp1, Address(ary1, 0));
4180 testl(tmp1, 0x00000080);
4181 jccb(Assembler::zero, DONE);
4182 subptr(result, 1);
4183 jmpb(DONE);
4184
4185 bind(TAIL_ADJUST);
4186 // there are negative bits in the last 4 byte block.
4187 // Adjust result and check the next three bytes
4188 addptr(result, len);
4189 orl(result, 3);
4190 lea(ary1, Address(ary1, len, Address::times_1));
4191 jmpb(COMPARE_CHAR);
4192
4193 bind(CHAR_ADJUST);
4194 // We are looking at a char + optional byte tail, and found that one
4195 // of the bytes in the char is negative. Adjust the result, check the
4196 // first byte and readjust if needed.
4197 andl(result, 0xfffffffc);
4198 testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
4199 jccb(Assembler::notZero, DONE);
4200 addptr(result, 1);
4201
4202 // That's it
4203 bind(DONE);
4204 if (UseAVX >= 2) {
4205 // clean upper bits of YMM registers
4206 vpxor(vec1, vec1);
4207 vpxor(vec2, vec2);
4208 }
4209 }
4210
4211 // Compare char[] or byte[] arrays, or substrings of them, aligned to 4 bytes.
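// Returns 1 in 'result' if the contents are equal and 0 otherwise.
// Illustrative sketch of the array case (is_array_equ, no expansion):
//   if (ary1 == ary2) return 1;
//   if (ary1 == null || ary2 == null || ary1.length != ary2.length) return 0;
//   for (int i = 0; i < length_in_bytes; i++) {
//     if (ary1[i] != ary2[i]) return 0;
//   }
//   return 1;
// With expand_ary2, each byte of ary2 is zero-extended to 16 bits and compared
// against the corresponding char element of ary1.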
4212 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
4213 Register limit, Register result, Register chr,
4214 XMMRegister vec1, XMMRegister vec2, bool is_char,
4215 KRegister mask, bool expand_ary2) {
4216 // for expand_ary2, limit is the (smaller) size of the second array.
4217 ShortBranchVerifier sbv(this);
4218 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
4219
4220 assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)),
4221 "Expansion only implemented for AVX2");
4222
4223 int length_offset = arrayOopDesc::length_offset_in_bytes();
4224 int base_offset = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
4225
4226 Address::ScaleFactor scaleFactor = expand_ary2 ? Address::times_2 : Address::times_1;
4227 int scaleIncr = expand_ary2 ? 8 : 16;
4228
4229 if (is_array_equ) {
4230 // Check the input args
4231 cmpoop(ary1, ary2);
4232 jcc(Assembler::equal, TRUE_LABEL);
4233
4234 // Need additional checks for arrays_equals.
4235 testptr(ary1, ary1);
4236 jcc(Assembler::zero, FALSE_LABEL);
4237 testptr(ary2, ary2);
4238 jcc(Assembler::zero, FALSE_LABEL);
4239
4240 // Check the lengths
4241 movl(limit, Address(ary1, length_offset));
4242 cmpl(limit, Address(ary2, length_offset));
4243 jcc(Assembler::notEqual, FALSE_LABEL);
4244 }
4245
4246 // count == 0
4247 testl(limit, limit);
4248 jcc(Assembler::zero, TRUE_LABEL);
4249
4250 if (is_array_equ) {
4251 // Load array address
4252 lea(ary1, Address(ary1, base_offset));
4253 lea(ary2, Address(ary2, base_offset));
4254 }
4255
4256 if (is_array_equ && is_char) {
4257 // arrays_equals when used for char[].
4258 shll(limit, 1); // byte count != 0
4259 }
4260 movl(result, limit); // copy
4261
4262 if (UseAVX >= 2) {
4263 // With AVX2, use 32-byte vector compare
4264 Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16;
4265
4266 // Compare 32-byte vectors
4267 if (expand_ary2) {
4268 andl(result, 0x0000000f); // tail count (in bytes)
4269 andl(limit, 0xfffffff0); // vector count (in bytes)
4270 jcc(Assembler::zero, COMPARE_TAIL);
4271 } else {
4272 andl(result, 0x0000001f); // tail count (in bytes)
4273 andl(limit, 0xffffffe0); // vector count (in bytes)
4274 jcc(Assembler::zero, COMPARE_TAIL_16);
4275 }
4276
4277 lea(ary1, Address(ary1, limit, scaleFactor));
4278 lea(ary2, Address(ary2, limit, Address::times_1));
4279 negptr(limit);
4280
4281 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
4282 Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
4283
4284 cmpl(limit, -64);
4285 jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
4286
4287 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
4288
4289 evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
4290 evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
4291 kortestql(mask, mask);
4292 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare
4293 addptr(limit, 64); // update since we already compared at this addr
4294 cmpl(limit, -64);
4295 jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
4296
4297 // At this point we may still need to compare -limit+result bytes.
4298 // We could execute the next two instructions and just continue via the non-wide path:
4299 // cmpl(limit, 0);
4300 // jcc(Assembler::equal, COMPARE_TAIL); // true
4301 // But since we stopped at the points ary{1,2}+limit which are
4302 // not farther than 64 bytes from the ends of arrays ary{1,2}+result
4303 // (|limit| <= 32 and result < 32),
4304 // we may just compare the last 64 bytes.
4305 //
4306 addptr(result, -64); // it is safe because we just came from this area
4307 evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
4308 evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
4309 kortestql(mask, mask);
4310 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare
4311
4312 jmp(TRUE_LABEL);
4313
4314 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
4315
4316 }//if (VM_Version::supports_avx512vlbw())
4317
4318 bind(COMPARE_WIDE_VECTORS);
4319 vmovdqu(vec1, Address(ary1, limit, scaleFactor));
4320 if (expand_ary2) {
4321 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit);
4322 } else {
4323 vmovdqu(vec2, Address(ary2, limit, Address::times_1));
4324 }
4325 vpxor(vec1, vec2);
4326
4327 vptest(vec1, vec1);
4328 jcc(Assembler::notZero, FALSE_LABEL);
4329 addptr(limit, scaleIncr * 2);
4330 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4331
4332 testl(result, result);
4333 jcc(Assembler::zero, TRUE_LABEL);
4334
4335 vmovdqu(vec1, Address(ary1, result, scaleFactor, -32));
4336 if (expand_ary2) {
4337 vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit);
4338 } else {
4339 vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
4340 }
4341 vpxor(vec1, vec2);
4342
4343 vptest(vec1, vec1);
4344 jcc(Assembler::notZero, FALSE_LABEL);
4345 jmp(TRUE_LABEL);
4346
4347 bind(COMPARE_TAIL_16); // limit is zero
4348 movl(limit, result);
4349
4350 // Compare 16-byte chunks
4351 andl(result, 0x0000000f); // tail count (in bytes)
4352 andl(limit, 0xfffffff0); // vector count (in bytes)
4353 jcc(Assembler::zero, COMPARE_TAIL);
4354
4355 lea(ary1, Address(ary1, limit, scaleFactor));
4356 lea(ary2, Address(ary2, limit, Address::times_1));
4357 negptr(limit);
4358
4359 bind(COMPARE_WIDE_VECTORS_16);
4360 movdqu(vec1, Address(ary1, limit, scaleFactor));
4361 if (expand_ary2) {
4362 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit);
4363 } else {
4364 movdqu(vec2, Address(ary2, limit, Address::times_1));
4365 }
4366 pxor(vec1, vec2);
4367
4368 ptest(vec1, vec1);
4369 jcc(Assembler::notZero, FALSE_LABEL);
4370 addptr(limit, scaleIncr);
4371 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16);
4372
4373 bind(COMPARE_TAIL); // limit is zero
4374 movl(limit, result);
4375 // Fallthru to tail compare
4376 } else if (UseSSE42Intrinsics) {
4377 // With SSE4.2, use double quad vector compare
4378 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4379
4380 // Compare 16-byte vectors
4381 andl(result, 0x0000000f); // tail count (in bytes)
4382 andl(limit, 0xfffffff0); // vector count (in bytes)
4383 jcc(Assembler::zero, COMPARE_TAIL);
4384
4385 lea(ary1, Address(ary1, limit, Address::times_1));
4386 lea(ary2, Address(ary2, limit, Address::times_1));
4387 negptr(limit);
4388
4389 bind(COMPARE_WIDE_VECTORS);
4390 movdqu(vec1, Address(ary1, limit, Address::times_1));
4391 movdqu(vec2, Address(ary2, limit, Address::times_1));
4392 pxor(vec1, vec2);
4393
4394 ptest(vec1, vec1);
4395 jcc(Assembler::notZero, FALSE_LABEL);
4396 addptr(limit, 16);
4397 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4398
4399 testl(result, result);
4400 jcc(Assembler::zero, TRUE_LABEL);
4401
4402 movdqu(vec1, Address(ary1, result, Address::times_1, -16));
4403 movdqu(vec2, Address(ary2, result, Address::times_1, -16));
4404 pxor(vec1, vec2);
4405
4406 ptest(vec1, vec1);
4407 jccb(Assembler::notZero, FALSE_LABEL);
4408 jmpb(TRUE_LABEL);
4409
4410 bind(COMPARE_TAIL); // limit is zero
4411 movl(limit, result);
4412 // Fallthru to tail compare
4413 }
4414
4415 // Compare 4-byte vectors
4416 if (expand_ary2) {
4417 testl(result, result);
4418 jccb(Assembler::zero, TRUE_LABEL);
4419 } else {
4420 andl(limit, 0xfffffffc); // vector count (in bytes)
4421 jccb(Assembler::zero, COMPARE_CHAR);
4422 }
4423
4424 lea(ary1, Address(ary1, limit, scaleFactor));
4425 lea(ary2, Address(ary2, limit, Address::times_1));
4426 negptr(limit);
4427
4428 bind(COMPARE_VECTORS);
4429 if (expand_ary2) {
4430 // There are no "vector" operations for bytes to shorts
4431 movzbl(chr, Address(ary2, limit, Address::times_1));
4432 cmpw(Address(ary1, limit, Address::times_2), chr);
4433 jccb(Assembler::notEqual, FALSE_LABEL);
4434 addptr(limit, 1);
4435 jcc(Assembler::notZero, COMPARE_VECTORS);
4436 jmp(TRUE_LABEL);
4437 } else {
4438 movl(chr, Address(ary1, limit, Address::times_1));
4439 cmpl(chr, Address(ary2, limit, Address::times_1));
4440 jccb(Assembler::notEqual, FALSE_LABEL);
4441 addptr(limit, 4);
4442 jcc(Assembler::notZero, COMPARE_VECTORS);
4443 }
4444
4445 // Compare trailing char (final 2 bytes), if any
4446 bind(COMPARE_CHAR);
4447 testl(result, 0x2); // tail char
4448 jccb(Assembler::zero, COMPARE_BYTE);
4449 load_unsigned_short(chr, Address(ary1, 0));
4450 load_unsigned_short(limit, Address(ary2, 0));
4451 cmpl(chr, limit);
4452 jccb(Assembler::notEqual, FALSE_LABEL);
4453
4454 if (is_array_equ && is_char) {
4455 bind(COMPARE_BYTE);
4456 } else {
4457 lea(ary1, Address(ary1, 2));
4458 lea(ary2, Address(ary2, 2));
4459
4460 bind(COMPARE_BYTE);
4461 testl(result, 0x1); // tail byte
4462 jccb(Assembler::zero, TRUE_LABEL);
4463 load_unsigned_byte(chr, Address(ary1, 0));
4464 load_unsigned_byte(limit, Address(ary2, 0));
4465 cmpl(chr, limit);
4466 jccb(Assembler::notEqual, FALSE_LABEL);
4467 }
4468 bind(TRUE_LABEL);
4469 movl(result, 1); // return true
4470 jmpb(DONE);
4471
4472 bind(FALSE_LABEL);
4473 xorl(result, result); // return false
4474
4475 // That's it
4476 bind(DONE);
4477 if (UseAVX >= 2) {
4478 // clean upper bits of YMM registers
4479 vpxor(vec1, vec1);
4480 vpxor(vec2, vec2);
4481 }
4482 }
4483
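// Slow path for convertF2I below: spill the source value to the stack, call
// the matching fixup routine (f2i/f2l/d2i/d2l), and pop the corrected result
// into dst. Only reached when the fast cvtt* conversion produced the sentinel
// value checked in convertF2I.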
4484 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) {
4485 #define __ masm.
4486 Register dst = stub.data<0>();
4487 XMMRegister src = stub.data<1>();
4488 address target = stub.data<2>();
4489 __ bind(stub.entry());
4490 __ subptr(rsp, 8);
4491 __ movdbl(Address(rsp), src);
4492 __ call(RuntimeAddress(target));
4493 // APX REX2 encoding for pop(dst) increases the stub size by 1 byte.
4494 __ pop(dst);
4495 __ jmp(stub.continuation());
4496 #undef __
4497 }
4498
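// Convert float/double to int/long with Java semantics. cvttss2si/cvttsd2si
// return the "integer indefinite" value (0x80000000, or 0x8000000000000000 for
// 64-bit results) for NaN and out-of-range inputs, so the fast result is
// compared against that sentinel and, on a match, the slow-path stub calls the
// fixup routine that produces the required Java result (0 for NaN, saturation
// to MIN_VALUE/MAX_VALUE otherwise).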
4499 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) {
4500 assert(dst_bt == T_INT || dst_bt == T_LONG, "");
4501 assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, "");
4502
4503 address slowpath_target;
4504 if (dst_bt == T_INT) {
4505 if (src_bt == T_FLOAT) {
4506 cvttss2sil(dst, src);
4507 cmpl(dst, 0x80000000);
4508 slowpath_target = StubRoutines::x86::f2i_fixup();
4509 } else {
4510 cvttsd2sil(dst, src);
4511 cmpl(dst, 0x80000000);
4512 slowpath_target = StubRoutines::x86::d2i_fixup();
4513 }
4514 } else {
4515 if (src_bt == T_FLOAT) {
4516 cvttss2siq(dst, src);
4517 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4518 slowpath_target = StubRoutines::x86::f2l_fixup();
4519 } else {
4520 cvttsd2siq(dst, src);
4521 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4522 slowpath_target = StubRoutines::x86::d2l_fixup();
4523 }
4524 }
4525
4526 // Using the APX extended general purpose registers increases the instruction encoding size by 1 byte.
4527 int max_size = 23 + (UseAPX ? 1 : 0);
4528 auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, max_size, convertF2I_slowpath);
4529 jcc(Assembler::equal, stub->entry());
4530 bind(stub->continuation());
4531 }
4532
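// Dispatch helper for AVX-512 masked (predicated) vector shift/rotate with an
// immediate count: 'mask' selects the active lanes and 'merge' chooses between
// merge-masking and zero-masking for the inactive lanes.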
4533 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4534 XMMRegister src1, int imm8, bool merge, int vlen_enc) {
4535 switch(ideal_opc) {
4536 case Op_LShiftVS:
4537 Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
4538 case Op_LShiftVI:
4539 Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
4540 case Op_LShiftVL:
4541 Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
4542 case Op_RShiftVS:
4543 Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
4544 case Op_RShiftVI:
4545 Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
4546 case Op_RShiftVL:
4547 Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
4548 case Op_URShiftVS:
4549 Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
4550 case Op_URShiftVI:
4551 Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
4552 case Op_URShiftVL:
4553 Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
4554 case Op_RotateRightV:
4555 evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4556 case Op_RotateLeftV:
4557 evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4558 default:
4559 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]);
4560 break;
4561 }
4562 }
4563
4564 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4565 XMMRegister src2, bool is_unsigned, bool merge, int vlen_enc) {
4566 if (is_unsigned) {
4567 evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4568 } else {
4569 evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4570 }
4571 }
4572
4573 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4574 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4575 switch (elem_bt) {
4576 case T_BYTE:
4577 if (ideal_opc == Op_SaturatingAddV) {
4578 evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4579 } else {
4580 assert(ideal_opc == Op_SaturatingSubV, "");
4581 evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4582 }
4583 break;
4584 case T_SHORT:
4585 if (ideal_opc == Op_SaturatingAddV) {
4586 evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4587 } else {
4588 assert(ideal_opc == Op_SaturatingSubV, "");
4589 evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4590 }
4591 break;
4592 default:
4593 fatal("Unsupported type %s", type2name(elem_bt));
4594 break;
4595 }
4596 }
4597
4598 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4599 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4600 switch (elem_bt) {
4601 case T_BYTE:
4602 if (ideal_opc == Op_SaturatingAddV) {
4603 evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4604 } else {
4605 assert(ideal_opc == Op_SaturatingSubV, "");
4606 evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4607 }
4608 break;
4609 case T_SHORT:
4610 if (ideal_opc == Op_SaturatingAddV) {
4611 evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4612 } else {
4613 assert(ideal_opc == Op_SaturatingSubV, "");
4614 evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4615 }
4616 break;
4617 default:
4618 fatal("Unsupported type %s", type2name(elem_bt));
4619 break;
4620 }
4621 }
4622
4623 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4624 Address src2, bool is_unsigned, bool merge, int vlen_enc) {
4625 if (is_unsigned) {
4626 evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4627 } else {
4628 evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4629 }
4630 }
4631
4632 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4633 XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4634 switch (elem_bt) {
4635 case T_BYTE:
4636 if (ideal_opc == Op_SaturatingAddV) {
4637 evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4638 } else {
4639 assert(ideal_opc == Op_SaturatingSubV, "");
4640 evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4641 }
4642 break;
4643 case T_SHORT:
4644 if (ideal_opc == Op_SaturatingAddV) {
4645 evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4646 } else {
4647 assert(ideal_opc == Op_SaturatingSubV, "");
4648 evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4649 }
4650 break;
4651 default:
4652 fatal("Unsupported type %s", type2name(elem_bt));
4653 break;
4654 }
4655 }
4656
4657 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4658 XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4659 switch (elem_bt) {
4660 case T_BYTE:
4661 if (ideal_opc == Op_SaturatingAddV) {
4662 evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4663 } else {
4664 assert(ideal_opc == Op_SaturatingSubV, "");
4665 evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4666 }
4667 break;
4668 case T_SHORT:
4669 if (ideal_opc == Op_SaturatingAddV) {
4670 evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4671 } else {
4672 assert(ideal_opc == Op_SaturatingSubV, "");
4673 evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4674 }
4675 break;
4676 default:
4677 fatal("Unsupported type %s", type2name(elem_bt));
4678 break;
4679 }
4680 }
4681
4682 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4683 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
4684 bool is_varshift) {
4685 switch (ideal_opc) {
4686 case Op_AddVB:
4687 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4688 case Op_AddVS:
4689 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4690 case Op_AddVI:
4691 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4692 case Op_AddVL:
4693 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4694 case Op_AddVF:
4695 evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4696 case Op_AddVD:
4697 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4698 case Op_SubVB:
4699 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4700 case Op_SubVS:
4701 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4702 case Op_SubVI:
4703 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4704 case Op_SubVL:
4705 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4706 case Op_SubVF:
4707 evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4708 case Op_SubVD:
4709 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4710 case Op_MulVS:
4711 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4712 case Op_MulVI:
4713 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4714 case Op_MulVL:
4715 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4716 case Op_MulVF:
4717 evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4718 case Op_MulVD:
4719 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4720 case Op_DivVF:
4721 evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4722 case Op_DivVD:
4723 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4724 case Op_SqrtVF:
4725 evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
4726 case Op_SqrtVD:
4727 evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
4728 case Op_AbsVB:
4729 evpabsb(dst, mask, src2, merge, vlen_enc); break;
4730 case Op_AbsVS:
4731 evpabsw(dst, mask, src2, merge, vlen_enc); break;
4732 case Op_AbsVI:
4733 evpabsd(dst, mask, src2, merge, vlen_enc); break;
4734 case Op_AbsVL:
4735 evpabsq(dst, mask, src2, merge, vlen_enc); break;
4736 case Op_FmaVF:
4737 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4738 case Op_FmaVD:
4739 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4740 case Op_VectorRearrange:
4741 evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
4742 case Op_LShiftVS:
4743 evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4744 case Op_LShiftVI:
4745 evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4746 case Op_LShiftVL:
4747 evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4748 case Op_RShiftVS:
4749 evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4750 case Op_RShiftVI:
4751 evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4752 case Op_RShiftVL:
4753 evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4754 case Op_URShiftVS:
4755 evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4756 case Op_URShiftVI:
4757 evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4758 case Op_URShiftVL:
4759 evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4760 case Op_RotateLeftV:
4761 evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4762 case Op_RotateRightV:
4763 evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4764 case Op_MaxV:
4765 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4766 case Op_MinV:
4767 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4768 case Op_UMinV:
4769 evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4770 case Op_UMaxV:
4771 evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4772 case Op_XorV:
4773 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4774 case Op_OrV:
4775 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4776 case Op_AndV:
4777 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4778 default:
4779 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]);
4780 break;
4781 }
4782 }
4783
4784 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4785 XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4786 switch (ideal_opc) {
4787 case Op_AddVB:
4788 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4789 case Op_AddVS:
4790 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4791 case Op_AddVI:
4792 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4793 case Op_AddVL:
4794 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4795 case Op_AddVF:
4796 evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4797 case Op_AddVD:
4798 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4799 case Op_SubVB:
4800 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4801 case Op_SubVS:
4802 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4803 case Op_SubVI:
4804 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4805 case Op_SubVL:
4806 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4807 case Op_SubVF:
4808 evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4809 case Op_SubVD:
4810 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4811 case Op_MulVS:
4812 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4813 case Op_MulVI:
4814 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4815 case Op_MulVL:
4816 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4817 case Op_MulVF:
4818 evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4819 case Op_MulVD:
4820 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4821 case Op_DivVF:
4822 evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4823 case Op_DivVD:
4824 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4825 case Op_FmaVF:
4826 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4827 case Op_FmaVD:
4828 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4829 case Op_MaxV:
4830 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4831 case Op_MinV:
4832 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4833 case Op_UMaxV:
4834 evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4835 case Op_UMinV:
4836 evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4837 case Op_XorV:
4838 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4839 case Op_OrV:
4840 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4841 case Op_AndV:
4842 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4843 default:
4844 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]);
4845 break;
4846 }
4847 }
4848
4849 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
4850 KRegister src1, KRegister src2) {
4851 BasicType etype = T_ILLEGAL;
4852 switch(mask_len) {
4853 case 2:
4854 case 4:
4855 case 8: etype = T_BYTE; break;
4856 case 16: etype = T_SHORT; break;
4857 case 32: etype = T_INT; break;
4858 case 64: etype = T_LONG; break;
4859 default: fatal("Unsupported type"); break;
4860 }
4861 assert(etype != T_ILLEGAL, "");
4862 switch(ideal_opc) {
4863 case Op_AndVMask:
4864 kand(etype, dst, src1, src2); break;
4865 case Op_OrVMask:
4866 kor(etype, dst, src1, src2); break;
4867 case Op_XorVMask:
4868 kxor(etype, dst, src1, src2); break;
4869 default:
4870 fatal("Unsupported masked operation"); break;
4871 }
4872 }
4873
4874 /*
4875 * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the cast operation.
4876 * If src is NaN, the result is 0.
4877 * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE,
4878 * the result is equal to the value of Integer.MIN_VALUE.
4879 * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE,
4880 * the result is equal to the value of Integer.MAX_VALUE.
4881 */
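// Illustrative scalar sketch of the fix-up semantics described above (not the emitted code):
//   if (src != src)                    return 0;                  // NaN
//   else if (src <= Integer.MIN_VALUE) return Integer.MIN_VALUE;  // includes -Inf
//   else if (src >= Integer.MAX_VALUE) return Integer.MAX_VALUE;  // includes +Inf
//   else                               return (int) src;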
4882 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4883 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
4884 Register rscratch, AddressLiteral float_sign_flip,
4885 int vec_enc) {
4886 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4887 Label done;
4888 vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
4889 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
4890 vptest(xtmp2, xtmp2, vec_enc);
4891 jccb(Assembler::equal, done);
4892
4893 vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
4894 vpxor(xtmp1, xtmp1, xtmp4, vec_enc);
4895
4896 vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
4897 vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
4898 vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);
4899
4900 // Recompute the mask for the remaining special values.
4901 vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
4902 // Extract SRC values corresponding to TRUE mask lanes.
4903 vpand(xtmp4, xtmp2, src, vec_enc);
4904 // Flip the mask bits so that the MSB of the MASK lanes corresponding to +ve special
4905 // values is set.
4906 vpxor(xtmp3, xtmp2, xtmp4, vec_enc);
4907
4908 vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
4909 bind(done);
4910 }
4911
4912 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4913 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4914 Register rscratch, AddressLiteral float_sign_flip,
4915 int vec_enc) {
4916 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4917 Label done;
4918 evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
4919 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4920 kortestwl(ktmp1, ktmp1);
4921 jccb(Assembler::equal, done);
4922
4923 vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4924 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4925 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4926
4927 kxorwl(ktmp1, ktmp1, ktmp2);
4928 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4929 vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4930 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4931 bind(done);
4932 }
4933
4934 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4935 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4936 Register rscratch, AddressLiteral double_sign_flip,
4937 int vec_enc) {
4938 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4939
4940 Label done;
4941 evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4942 Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
4943 kortestwl(ktmp1, ktmp1);
4944 jccb(Assembler::equal, done);
4945
4946 vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4947 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4948 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
4949
4950 kxorwl(ktmp1, ktmp1, ktmp2);
4951 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4952 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4953 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
4954 bind(done);
4955 }
4956
4957 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4958 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4959 Register rscratch, AddressLiteral float_sign_flip,
4960 int vec_enc) {
4961 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4962 Label done;
4963 evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
4964 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4965 kortestwl(ktmp1, ktmp1);
4966 jccb(Assembler::equal, done);
4967
4968 vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4969 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4970 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4971
4972 kxorwl(ktmp1, ktmp1, ktmp2);
4973 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4974 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4975 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4976 bind(done);
4977 }
4978
4979 /*
4980 * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the cast operation.
4981 * If src is NaN, the result is 0.
4982 * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
4983 * the result is equal to the value of Long.MIN_VALUE.
4984 * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
4985 * the result is equal to the value of Long.MAX_VALUE.
4986 */
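// Illustrative scalar sketch of the fix-up semantics described above (not the emitted code):
//   if (src != src)                 return 0L;               // NaN
//   else if (src <= Long.MIN_VALUE) return Long.MIN_VALUE;   // includes -Inf
//   else if (src >= Long.MAX_VALUE) return Long.MAX_VALUE;   // includes +Inf
//   else                            return (long) src;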
4987 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4988 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4989 Register rscratch, AddressLiteral double_sign_flip,
4990 int vec_enc) {
4991 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4992
4993 Label done;
4994 evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4995 evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
4996 kortestwl(ktmp1, ktmp1);
4997 jccb(Assembler::equal, done);
4998
4999 vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5000 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5001 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
5002
5003 kxorwl(ktmp1, ktmp1, ktmp2);
5004 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5005 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5006 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
5007 bind(done);
5008 }
5009
5010 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
5011 XMMRegister xtmp, int index, int vec_enc) {
5012 assert(vec_enc < Assembler::AVX_512bit, "");
5013 if (vec_enc == Assembler::AVX_256bit) {
5014 vextractf128_high(xtmp, src);
5015 vshufps(dst, src, xtmp, index, vec_enc);
5016 } else {
5017 vshufps(dst, src, zero, index, vec_enc);
5018 }
5019 }
5020
5021 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
5022 XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
5023 AddressLiteral float_sign_flip, int src_vec_enc) {
5024 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5025
5026 Label done;
5027 // Compare the destination lanes with float_sign_flip
5028 // value to get mask for all special values.
5029 movdqu(xtmp1, float_sign_flip, rscratch);
5030 vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
5031 ptest(xtmp2, xtmp2);
5032 jccb(Assembler::equal, done);
5033
5034 // Flip float_sign_flip to get max integer value.
5035 vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
5036 pxor(xtmp1, xtmp4);
5037
5038 // Set destination lanes corresponding to unordered source lanes to zero.
5039 vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
5040 vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);
5041
5042 // Shuffle the mask vector and pack the lower double word from each quadword lane.
5043 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5044 vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);
5045
5046 // Recompute the mask for the remaining special values.
5047 pxor(xtmp2, xtmp3);
5048 // Extract mask corresponding to non-negative source lanes.
5049 vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);
5050
5051 // Shuffle the mask vector and pack the lower double word from each quadword lane.
5052 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5053 pand(xtmp3, xtmp2);
5054
5055 // Replace destination lanes holding special value(0x80000000) with max int
5056 // if corresponding source lane holds a +ve value.
5057 vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
5058 bind(done);
5059 }
5060
5061
5062 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
5063 XMMRegister xtmp, Register rscratch, int vec_enc) {
5064 switch(to_elem_bt) {
5065 case T_SHORT:
5066 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
5067 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
5068 vpackusdw(dst, dst, zero, vec_enc);
5069 if (vec_enc == Assembler::AVX_256bit) {
5070 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5071 }
5072 break;
5073 case T_BYTE:
5074 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
5075 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
5076 vpackusdw(dst, dst, zero, vec_enc);
5077 if (vec_enc == Assembler::AVX_256bit) {
5078 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5079 }
5080 vpackuswb(dst, dst, zero, vec_enc);
5081 break;
5082 default: assert(false, "Unexpected basic type for target of vector cast int to subword: %s", type2name(to_elem_bt));
5083 }
5084 }
5085
5086 /*
5087 * Algorithm for vector D2L and F2I conversions (used when AVX 10.2 is not supported):
5088 * a) Perform the vector D2L/F2I cast.
5089 * b) Choose the fast path if no result vector lane contains the value 0x80000000;
5090 * that value signifies that the source could be one of the special floating point
5091 * values (NaN, -Inf, Inf, Max, -Min).
5092 * c) Set the destination lane to zero if the source lane is NaN.
5093 * d) Replace 0x80000000 with MaxInt if the source lane contains a +ve value.
5094 */
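// Note: vcvttps2dq/vcvttpd2dq return the integer indefinite value 0x80000000 for NaN and
// out-of-range source lanes, which is why that bit pattern is used as the sentinel above.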
5095
5096 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5097 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
5098 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5099 int to_elem_sz = type2aelembytes(to_elem_bt);
5100 assert(to_elem_sz <= 4, "");
5101 vcvttps2dq(dst, src, vec_enc);
5102 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc);
5103 if (to_elem_sz < 4) {
5104 vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
5105 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc);
5106 }
5107 }
5108
5109 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5110 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
5111 Register rscratch, int vec_enc) {
5112 int to_elem_sz = type2aelembytes(to_elem_bt);
5113 assert(to_elem_sz <= 4, "");
5114 vcvttps2dq(dst, src, vec_enc);
5115 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc);
5116 switch(to_elem_bt) {
5117 case T_INT:
5118 break;
5119 case T_SHORT:
5120 evpmovdw(dst, dst, vec_enc);
5121 break;
5122 case T_BYTE:
5123 evpmovdb(dst, dst, vec_enc);
5124 break;
5125 default: assert(false, "Unexpected basic type for target of vector castF2X EVEX: %s", type2name(to_elem_bt));
5126 }
5127 }
5128
5129 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
5130 KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
5131 Register rscratch, int vec_enc) {
5132 evcvttps2qq(dst, src, vec_enc);
5133 vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc);
5134 }
5135
5136 // Handling for downcasting from double to integer or sub-word types on AVX2.
5137 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5138 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5,
5139 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5140 int to_elem_sz = type2aelembytes(to_elem_bt);
5141 assert(to_elem_sz < 8, "");
5142 vcvttpd2dq(dst, src, vec_enc);
5143 vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch,
5144 float_sign_flip, vec_enc);
5145 if (to_elem_sz < 4) {
5146 // xtmp4 holds all zero lanes.
5147 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit);
5148 }
5149 }
5150
5151 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src,
5152 XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1,
5153 KRegister ktmp2, AddressLiteral sign_flip,
5154 Register rscratch, int vec_enc) {
5155 if (VM_Version::supports_avx512dq()) {
5156 evcvttpd2qq(dst, src, vec_enc);
5157 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5158 switch(to_elem_bt) {
5159 case T_LONG:
5160 break;
5161 case T_INT:
5162 evpmovsqd(dst, dst, vec_enc);
5163 break;
5164 case T_SHORT:
5165 evpmovsqd(dst, dst, vec_enc);
5166 evpmovdw(dst, dst, vec_enc);
5167 break;
5168 case T_BYTE:
5169 evpmovsqd(dst, dst, vec_enc);
5170 evpmovdb(dst, dst, vec_enc);
5171 break;
5172 default: assert(false, "Unexpected basic type for target of vector castD2X AVX512DQ EVEX: %s", type2name(to_elem_bt));
5173 }
5174 } else {
5175 assert(type2aelembytes(to_elem_bt) <= 4, "");
5176 vcvttpd2dq(dst, src, vec_enc);
5177 vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5178 switch(to_elem_bt) {
5179 case T_INT:
5180 break;
5181 case T_SHORT:
5182 evpmovdw(dst, dst, vec_enc);
5183 break;
5184 case T_BYTE:
5185 evpmovdb(dst, dst, vec_enc);
5186 break;
5187 default: assert(false, "Unexpected basic type for target of vector castD2X EVEX: %s", type2name(to_elem_bt));
5188 }
5189 }
5190 }
5191
5192 void C2_MacroAssembler::vector_castF2X_avx10(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5193 switch(to_elem_bt) {
5194 case T_LONG:
5195 evcvttps2qqs(dst, src, vec_enc);
5196 break;
5197 case T_INT:
5198 evcvttps2dqs(dst, src, vec_enc);
5199 break;
5200 case T_SHORT:
5201 evcvttps2dqs(dst, src, vec_enc);
5202 evpmovdw(dst, dst, vec_enc);
5203 break;
5204 case T_BYTE:
5205 evcvttps2dqs(dst, src, vec_enc);
5206 evpmovdb(dst, dst, vec_enc);
5207 break;
5208 default: assert(false, "Unexpected basic type for target of vector castF2X AVX10 (reg src): %s", type2name(to_elem_bt));
5209 }
5210 }
5211
5212 void C2_MacroAssembler::vector_castF2X_avx10(BasicType to_elem_bt, XMMRegister dst, Address src, int vec_enc) {
5213 switch(to_elem_bt) {
5214 case T_LONG:
5215 evcvttps2qqs(dst, src, vec_enc);
5216 break;
5217 case T_INT:
5218 evcvttps2dqs(dst, src, vec_enc);
5219 break;
5220 case T_SHORT:
5221 evcvttps2dqs(dst, src, vec_enc);
5222 evpmovdw(dst, dst, vec_enc);
5223 break;
5224 case T_BYTE:
5225 evcvttps2dqs(dst, src, vec_enc);
5226 evpmovdb(dst, dst, vec_enc);
5227 break;
5228 default: assert(false, "Unexpected basic type for target of vector castF2X AVX10 (mem src): %s", type2name(to_elem_bt));
5229 }
5230 }
5231
5232 void C2_MacroAssembler::vector_castD2X_avx10(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5233 switch(to_elem_bt) {
5234 case T_LONG:
5235 evcvttpd2qqs(dst, src, vec_enc);
5236 break;
5237 case T_INT:
5238 evcvttpd2dqs(dst, src, vec_enc);
5239 break;
5240 case T_SHORT:
5241 evcvttpd2dqs(dst, src, vec_enc);
5242 evpmovdw(dst, dst, vec_enc);
5243 break;
5244 case T_BYTE:
5245 evcvttpd2dqs(dst, src, vec_enc);
5246 evpmovdb(dst, dst, vec_enc);
5247 break;
5248 default: assert(false, "Unexpected basic type for target of vector castD2X AVX10 (reg src): %s", type2name(to_elem_bt));
5249 }
5250 }
5251
5252 void C2_MacroAssembler::vector_castD2X_avx10(BasicType to_elem_bt, XMMRegister dst, Address src, int vec_enc) {
5253 switch(to_elem_bt) {
5254 case T_LONG:
5255 evcvttpd2qqs(dst, src, vec_enc);
5256 break;
5257 case T_INT:
5258 evcvttpd2dqs(dst, src, vec_enc);
5259 break;
5260 case T_SHORT:
5261 evcvttpd2dqs(dst, src, vec_enc);
5262 evpmovdw(dst, dst, vec_enc);
5263 break;
5264 case T_BYTE:
5265 evcvttpd2dqs(dst, src, vec_enc);
5266 evpmovdb(dst, dst, vec_enc);
5267 break;
5268 default: assert(false, "Unexpected basic type for target of vector castD2X AVX10 (mem src): %s", type2name(to_elem_bt));
5269 }
5270 }
5271
5272 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
5273 AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5274 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
5275 // Perform the floor(val+0.5) operation under the influence of MXCSR.RC mode round-towards -inf,
5276 // and restore the original MXCSR.RC mode after that.
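// For example, assuming MXCSR.RC is round-towards -inf for the add and the convert:
// 2.5 -> floor(2.5 + 0.5) = 3 and -2.5 -> floor(-2.5 + 0.5) = -2, matching Math.round semantics.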
5277 ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5278
5279 mov64(tmp, julong_cast(0.5L));
5280 evpbroadcastq(xtmp1, tmp, vec_enc);
5281 vaddpd(xtmp1, src , xtmp1, vec_enc);
5282 evcvtpd2qq(dst, xtmp1, vec_enc);
5283 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5284 double_sign_flip, vec_enc);
5285
5286 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5287 }
5288
5289 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
5290 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5291 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
5292 // Perform the floor(val+0.5) operation under the influence of MXCSR.RC mode round-towards -inf,
5293 // and restore the original MXCSR.RC mode after that.
5294 ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5295
5296 movl(tmp, jint_cast(0.5));
5297 movq(xtmp1, tmp);
5298 vbroadcastss(xtmp1, xtmp1, vec_enc);
5299 vaddps(xtmp1, src , xtmp1, vec_enc);
5300 vcvtps2dq(dst, xtmp1, vec_enc);
5301 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5302 float_sign_flip, vec_enc);
5303
5304 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5305 }
5306
5307 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
5308 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5309 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
5310 // Perform the floor(val+0.5) operation under the influence of MXCSR.RC mode round-towards -inf,
5311 // and restore the original MXCSR.RC mode after that.
5312 ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5313
5314 movl(tmp, jint_cast(0.5));
5315 movq(xtmp1, tmp);
5316 vbroadcastss(xtmp1, xtmp1, vec_enc);
5317 vaddps(xtmp1, src , xtmp1, vec_enc);
5318 vcvtps2dq(dst, xtmp1, vec_enc);
5319 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc);
5320
5321 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5322 }
5323
5324 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5325 BasicType from_elem_bt, BasicType to_elem_bt) {
5326 switch (from_elem_bt) {
5327 case T_BYTE:
5328 switch (to_elem_bt) {
5329 case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
5330 case T_INT: vpmovzxbd(dst, src, vlen_enc); break;
5331 case T_LONG: vpmovzxbq(dst, src, vlen_enc); break;
5332 default: ShouldNotReachHere();
5333 }
5334 break;
5335 case T_SHORT:
5336 switch (to_elem_bt) {
5337 case T_INT: vpmovzxwd(dst, src, vlen_enc); break;
5338 case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
5339 default: ShouldNotReachHere();
5340 }
5341 break;
5342 case T_INT:
5343 assert(to_elem_bt == T_LONG, "");
5344 vpmovzxdq(dst, src, vlen_enc);
5345 break;
5346 default:
5347 ShouldNotReachHere();
5348 }
5349 }
5350
5351 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5352 BasicType from_elem_bt, BasicType to_elem_bt) {
5353 switch (from_elem_bt) {
5354 case T_BYTE:
5355 switch (to_elem_bt) {
5356 case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break;
5357 case T_INT: vpmovsxbd(dst, src, vlen_enc); break;
5358 case T_LONG: vpmovsxbq(dst, src, vlen_enc); break;
5359 default: ShouldNotReachHere();
5360 }
5361 break;
5362 case T_SHORT:
5363 switch (to_elem_bt) {
5364 case T_INT: vpmovsxwd(dst, src, vlen_enc); break;
5365 case T_LONG: vpmovsxwq(dst, src, vlen_enc); break;
5366 default: ShouldNotReachHere();
5367 }
5368 break;
5369 case T_INT:
5370 assert(to_elem_bt == T_LONG, "");
5371 vpmovsxdq(dst, src, vlen_enc);
5372 break;
5373 default:
5374 ShouldNotReachHere();
5375 }
5376 }
5377
5378 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src,
5379 BasicType dst_bt, BasicType src_bt, int vlen) {
5380 int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen);
5381 assert(vlen_enc != AVX_512bit, "");
5382
5383 int dst_bt_size = type2aelembytes(dst_bt);
5384 int src_bt_size = type2aelembytes(src_bt);
5385 if (dst_bt_size > src_bt_size) {
5386 switch (dst_bt_size / src_bt_size) {
5387 case 2: vpmovsxbw(dst, src, vlen_enc); break;
5388 case 4: vpmovsxbd(dst, src, vlen_enc); break;
5389 case 8: vpmovsxbq(dst, src, vlen_enc); break;
5390 default: ShouldNotReachHere();
5391 }
5392 } else {
5393 assert(dst_bt_size < src_bt_size, "");
5394 switch (src_bt_size / dst_bt_size) {
5395 case 2: {
5396 if (vlen_enc == AVX_128bit) {
5397 vpacksswb(dst, src, src, vlen_enc);
5398 } else {
5399 vpacksswb(dst, src, src, vlen_enc);
5400 vpermq(dst, dst, 0x08, vlen_enc);
5401 }
5402 break;
5403 }
5404 case 4: {
5405 if (vlen_enc == AVX_128bit) {
5406 vpackssdw(dst, src, src, vlen_enc);
5407 vpacksswb(dst, dst, dst, vlen_enc);
5408 } else {
5409 vpackssdw(dst, src, src, vlen_enc);
5410 vpermq(dst, dst, 0x08, vlen_enc);
5411 vpacksswb(dst, dst, dst, AVX_128bit);
5412 }
5413 break;
5414 }
5415 case 8: {
5416 if (vlen_enc == AVX_128bit) {
5417 vpshufd(dst, src, 0x08, vlen_enc);
5418 vpackssdw(dst, dst, dst, vlen_enc);
5419 vpacksswb(dst, dst, dst, vlen_enc);
5420 } else {
5421 vpshufd(dst, src, 0x08, vlen_enc);
5422 vpermq(dst, dst, 0x08, vlen_enc);
5423 vpackssdw(dst, dst, dst, AVX_128bit);
5424 vpacksswb(dst, dst, dst, AVX_128bit);
5425 }
5426 break;
5427 }
5428 default: ShouldNotReachHere();
5429 }
5430 }
5431 }
5432
5433 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
5434 bool merge, BasicType bt, int vlen_enc) {
5435 if (bt == T_INT) {
5436 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5437 } else {
5438 assert(bt == T_LONG, "");
5439 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5440 }
5441 }
5442
5443 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
5444 bool merge, BasicType bt, int vlen_enc) {
5445 if (bt == T_INT) {
5446 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5447 } else {
5448 assert(bt == T_LONG, "");
5449 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5450 }
5451 }
5452
5453 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
5454 Register rtmp2, XMMRegister xtmp, int mask_len,
5455 int vec_enc) {
5456 int index = 0;
5457 int vindex = 0;
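// PDEP with the 0x0101010101010101 mask deposits the low 8 bits of src into the least
// significant bit of each byte, producing eight 0x00/0x01 byte lanes from a bit-per-lane mask.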
5458 mov64(rtmp1, 0x0101010101010101L);
5459 pdepq(rtmp1, src, rtmp1);
5460 if (mask_len > 8) {
5461 movq(rtmp2, src);
5462 vpxor(xtmp, xtmp, xtmp, vec_enc);
5463 movq(xtmp, rtmp1);
5464 }
5465 movq(dst, rtmp1);
5466
5467 mask_len -= 8;
5468 while (mask_len > 0) {
5469 assert ((mask_len & 0x7) == 0, "mask must be multiple of 8");
5470 index++;
5471 if ((index % 2) == 0) {
5472 pxor(xtmp, xtmp);
5473 }
5474 mov64(rtmp1, 0x0101010101010101L);
5475 shrq(rtmp2, 8);
5476 pdepq(rtmp1, rtmp2, rtmp1);
5477 pinsrq(xtmp, rtmp1, index % 2);
5478 vindex = index / 2;
5479 if (vindex) {
5480 // Write the entire 16 byte vector only when both 64 bit
5481 // lanes have been updated, to save redundant instructions.
5482 if (index % 2) {
5483 vinsertf128(dst, dst, xtmp, vindex);
5484 }
5485 } else {
5486 vmovdqu(dst, xtmp);
5487 }
5488 mask_len -= 8;
5489 }
5490 }
5491
5492 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
5493 switch(opc) {
5494 case Op_VectorMaskTrueCount:
5495 popcntq(dst, tmp);
5496 break;
5497 case Op_VectorMaskLastTrue:
5498 if (VM_Version::supports_lzcnt()) {
5499 lzcntq(tmp, tmp);
5500 movl(dst, 63);
5501 subl(dst, tmp);
5502 } else {
5503 movl(dst, -1);
5504 bsrq(tmp, tmp);
5505 cmov32(Assembler::notZero, dst, tmp);
5506 }
5507 break;
5508 case Op_VectorMaskFirstTrue:
5509 if (VM_Version::supports_bmi1()) {
5510 if (masklen < 32) {
5511 orl(tmp, 1 << masklen);
5512 tzcntl(dst, tmp);
5513 } else if (masklen == 32) {
5514 tzcntl(dst, tmp);
5515 } else {
5516 assert(masklen == 64, "");
5517 tzcntq(dst, tmp);
5518 }
5519 } else {
5520 if (masklen < 32) {
5521 orl(tmp, 1 << masklen);
5522 bsfl(dst, tmp);
5523 } else {
5524 assert(masklen == 32 || masklen == 64, "");
5525 movl(dst, masklen);
5526 if (masklen == 32) {
5527 bsfl(tmp, tmp);
5528 } else {
5529 bsfq(tmp, tmp);
5530 }
5531 cmov32(Assembler::notZero, dst, tmp);
5532 }
5533 }
5534 break;
5535 case Op_VectorMaskToLong:
5536 assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
5537 break;
5538 default: assert(false, "Unhandled mask operation");
5539 }
5540 }
5541
5542 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
5543 int masklen, int masksize, int vec_enc) {
5544 assert(VM_Version::supports_popcnt(), "");
5545
5546 if(VM_Version::supports_avx512bw()) {
5547 kmovql(tmp, mask);
5548 } else {
5549 assert(masklen <= 16, "");
5550 kmovwl(tmp, mask);
5551 }
5552
5553 // A mask generated by partial vector comparison/replicate/mask manipulation
5554 // operations needs to be clipped.
5555 if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
5556 andq(tmp, (1 << masklen) - 1);
5557 }
5558
5559 vector_mask_operation_helper(opc, dst, tmp, masklen);
5560 }
5561
5562 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
5563 Register tmp, int masklen, BasicType bt, int vec_enc) {
5564 assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) ||
5565 (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), "");
5566 assert(VM_Version::supports_popcnt(), "");
5567
5568 bool need_clip = false;
5569 switch(bt) {
5570 case T_BOOLEAN:
5571 // While masks of other types contain 0 or -1 per lane, boolean masks contain lane values of 0 or 1.
5572 vpxor(xtmp, xtmp, xtmp, vec_enc);
5573 vpsubb(xtmp, xtmp, mask, vec_enc);
5574 vpmovmskb(tmp, xtmp, vec_enc);
5575 need_clip = masklen < 16;
5576 break;
5577 case T_BYTE:
5578 vpmovmskb(tmp, mask, vec_enc);
5579 need_clip = masklen < 16;
5580 break;
5581 case T_SHORT:
5582 vpacksswb(xtmp, mask, mask, vec_enc);
5583 if (masklen >= 16) {
5584 vpermpd(xtmp, xtmp, 8, vec_enc);
5585 }
5586 vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
5587 need_clip = masklen < 16;
5588 break;
5589 case T_INT:
5590 case T_FLOAT:
5591 vmovmskps(tmp, mask, vec_enc);
5592 need_clip = masklen < 4;
5593 break;
5594 case T_LONG:
5595 case T_DOUBLE:
5596 vmovmskpd(tmp, mask, vec_enc);
5597 need_clip = masklen < 2;
5598 break;
5599 default: assert(false, "Unhandled type, %s", type2name(bt));
5600 }
5601
5602 // A mask generated by partial vector comparison/replicate/mask manipulation
5603 // operations needs to be clipped.
5604 if (need_clip && opc != Op_VectorMaskFirstTrue) {
5605 // need_clip implies masklen < 32
5606 andq(tmp, (1 << masklen) - 1);
5607 }
5608
5609 vector_mask_operation_helper(opc, dst, tmp, masklen);
5610 }
5611
5612 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
5613 Register rtmp2, int mask_len) {
5614 kmov(rtmp1, src);
5615 andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
5616 mov64(rtmp2, -1L);
5617 pextq(rtmp2, rtmp2, rtmp1);
5618 kmov(dst, rtmp2);
5619 }
5620
5621 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src,
5622 XMMRegister mask, Register rtmp, Register rscratch,
5623 XMMRegister permv, XMMRegister xtmp, BasicType bt,
5624 int vec_enc) {
5625 assert(type2aelembytes(bt) >= 4, "");
5626 assert(opcode == Op_CompressV || opcode == Op_ExpandV, "");
5627 address compress_perm_table = nullptr;
5628 address expand_perm_table = nullptr;
5629 if (type2aelembytes(bt) == 8) {
5630 compress_perm_table = StubRoutines::x86::compress_perm_table64();
5631 expand_perm_table = StubRoutines::x86::expand_perm_table64();
5632 vmovmskpd(rtmp, mask, vec_enc);
5633 } else {
5634 compress_perm_table = StubRoutines::x86::compress_perm_table32();
5635 expand_perm_table = StubRoutines::x86::expand_perm_table32();
5636 vmovmskps(rtmp, mask, vec_enc);
5637 }
5638 shlq(rtmp, 5); // for 32 byte permute row.
5639 if (opcode == Op_CompressV) {
5640 lea(rscratch, ExternalAddress(compress_perm_table));
5641 } else {
5642 lea(rscratch, ExternalAddress(expand_perm_table));
5643 }
5644 addptr(rtmp, rscratch);
5645 vmovdqu(permv, Address(rtmp));
5646 vpermps(dst, permv, src, Assembler::AVX_256bit);
5647 vpxor(xtmp, xtmp, xtmp, vec_enc);
5648 // Blend the result with the zero vector using the permute mask: each column entry
5649 // in a permute table row contains either a valid permute index or a -1 (default)
5650 // value, so the row can also serve as a blending mask after
5651 // compressing/expanding the source vector lanes.
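// Illustrative example (assuming the table layout described above): for Op_CompressV on int
// lanes with mask 0b0101, the selected row would look like [0, 2, -1, -1, ...]; the -1 entries
// have their sign bit set, so those destination lanes are zeroed by the blend below.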
5652 vblendvps(dst, dst, xtmp, permv, vec_enc, true, permv);
5653 }
5654
5655 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
5656 bool merge, BasicType bt, int vec_enc) {
5657 if (opcode == Op_CompressV) {
5658 switch(bt) {
5659 case T_BYTE:
5660 evpcompressb(dst, mask, src, merge, vec_enc);
5661 break;
5662 case T_CHAR:
5663 case T_SHORT:
5664 evpcompressw(dst, mask, src, merge, vec_enc);
5665 break;
5666 case T_INT:
5667 evpcompressd(dst, mask, src, merge, vec_enc);
5668 break;
5669 case T_FLOAT:
5670 evcompressps(dst, mask, src, merge, vec_enc);
5671 break;
5672 case T_LONG:
5673 evpcompressq(dst, mask, src, merge, vec_enc);
5674 break;
5675 case T_DOUBLE:
5676 evcompresspd(dst, mask, src, merge, vec_enc);
5677 break;
5678 default:
5679 fatal("Unsupported type %s", type2name(bt));
5680 break;
5681 }
5682 } else {
5683 assert(opcode == Op_ExpandV, "");
5684 switch(bt) {
5685 case T_BYTE:
5686 evpexpandb(dst, mask, src, merge, vec_enc);
5687 break;
5688 case T_CHAR:
5689 case T_SHORT:
5690 evpexpandw(dst, mask, src, merge, vec_enc);
5691 break;
5692 case T_INT:
5693 evpexpandd(dst, mask, src, merge, vec_enc);
5694 break;
5695 case T_FLOAT:
5696 evexpandps(dst, mask, src, merge, vec_enc);
5697 break;
5698 case T_LONG:
5699 evpexpandq(dst, mask, src, merge, vec_enc);
5700 break;
5701 case T_DOUBLE:
5702 evexpandpd(dst, mask, src, merge, vec_enc);
5703 break;
5704 default:
5705 fatal("Unsupported type %s", type2name(bt));
5706 break;
5707 }
5708 }
5709 }
5710
5711 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5712 KRegister ktmp1, int vec_enc) {
5713 if (opcode == Op_SignumVD) {
5714 vsubpd(dst, zero, one, vec_enc);
5715 // if src < 0 ? -1 : 1
5716 evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5717 evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
5718 // if src == NaN, -0.0 or 0.0 return src.
5719 evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5720 evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
5721 } else {
5722 assert(opcode == Op_SignumVF, "");
5723 vsubps(dst, zero, one, vec_enc);
5724 // if src < 0 ? -1 : 1
5725 evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5726 evblendmps(dst, ktmp1, one, dst, true, vec_enc);
5727 // if src == NaN, -0.0 or 0.0 return src.
5728 evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5729 evblendmps(dst, ktmp1, dst, src, true, vec_enc);
5730 }
5731 }
5732
5733 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5734 XMMRegister xtmp1, int vec_enc) {
5735 if (opcode == Op_SignumVD) {
5736 vsubpd(dst, zero, one, vec_enc);
5737 // if src < 0 ? -1 : 1
5738 vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1);
5739 // if src == NaN, -0.0 or 0.0 return src.
5740 vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5741 vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5742 } else {
5743 assert(opcode == Op_SignumVF, "");
5744 vsubps(dst, zero, one, vec_enc);
5745 // if src < 0 ? -1 : 1
5746 vblendvps(dst, one, dst, src, vec_enc, true, xtmp1);
5747 // if src == NaN, -0.0 or 0.0 return src.
5748 vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5749 vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5750 }
5751 }
5752
5753 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
5754 if (VM_Version::supports_avx512bw()) {
5755 if (mask_len > 32) {
5756 kmovql(dst, src);
5757 } else {
5758 kmovdl(dst, src);
5759 if (mask_len != 32) {
5760 kshiftrdl(dst, dst, 32 - mask_len);
5761 }
5762 }
5763 } else {
5764 assert(mask_len <= 16, "");
5765 kmovwl(dst, src);
5766 if (mask_len != 16) {
5767 kshiftrwl(dst, dst, 16 - mask_len);
5768 }
5769 }
5770 }
5771
5772 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5773 int lane_size = type2aelembytes(bt);
5774 if ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5775 (is_subword_type(bt) && VM_Version::supports_avx512vlbw())) {
5776 movptr(rtmp, imm32);
5777 switch(lane_size) {
5778 case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5779 case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5780 case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5781 case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
5782 fatal("Unsupported lane size %d", lane_size);
5783 break;
5784 }
5785 } else {
5786 movptr(rtmp, imm32);
5787 movq(dst, rtmp);
5788 switch(lane_size) {
5789 case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5790 case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5791 case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5792 case 8 : vpbroadcastq(dst, dst, vec_enc); break;
5793 fatal("Unsupported lane size %d", lane_size);
5794 break;
5795 }
5796 }
5797 }
5798
5799 //
5800 // The following is a lookup-table-based popcount computation algorithm:
5801 // Index Bit set count
5802 // [ 0000 -> 0,
5803 // 0001 -> 1,
5804 // 0010 -> 1,
5805 // 0011 -> 2,
5806 // 0100 -> 1,
5807 // 0101 -> 2,
5808 // 0110 -> 2,
5809 // 0111 -> 3,
5810 // 1000 -> 1,
5811 // 1001 -> 2,
5812 // 1010 -> 2,
5813 // 1011 -> 3,
5814 // 1100 -> 2,
5815 // 1101 -> 3,
5816 // 1110 -> 3,
// 1111 -> 4 ]
5817 // a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
5818 // shuffle indices for lookup table access.
5819 // b. Right shift each byte of vector lane by 4 positions.
5820 // c. Count the number of 1s in the 4 MSB bits of each byte. These bits are used as
5821 // shuffle indices for lookup table access.
5822 // d. Add the bitset count of upper and lower 4 bits of each byte.
5823 // e. Unpack double words to quad words and compute sum of absolute difference of bitset
5824 // count of all the bytes of a quadword.
5825 // f. Perform step e. for upper 128bit vector lane.
5826 // g. Pack the bitset count of quadwords back to double word.
5827 // h. Unpacking and packing operations are not needed for 64bit vector lane.
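// Worked example for a single byte, src = 0xB5 (0b10110101, popcount = 5):
// lut[0x5] = 2 for the lower nibble, lut[0xB] = 3 for the upper nibble, 2 + 3 = 5.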
5828
5829 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5830 XMMRegister xtmp2, Register rtmp, int vec_enc) {
5831 assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
5832 vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
5833 vpsrlw(dst, src, 4, vec_enc);
5834 vpand(dst, dst, xtmp1, vec_enc);
5835 vpand(xtmp1, src, xtmp1, vec_enc);
5836 vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
5837 vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);
5838 vpshufb(dst, xtmp2, dst, vec_enc);
5839 vpaddb(dst, dst, xtmp1, vec_enc);
5840 }
5841
5842 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5843 XMMRegister xtmp2, Register rtmp, int vec_enc) {
5844 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5845 // Following code is as per steps e,f,g and h of above algorithm.
5846 vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5847 vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
5848 vpsadbw(dst, dst, xtmp2, vec_enc);
5849 vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
5850 vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
5851 vpackuswb(dst, xtmp1, dst, vec_enc);
5852 }
5853
5854 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5855 XMMRegister xtmp2, Register rtmp, int vec_enc) {
5856 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5857 // Add the popcount of upper and lower bytes of word.
5858 vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
5859 vpsrlw(dst, xtmp1, 8, vec_enc);
5860 vpand(xtmp1, xtmp1, xtmp2, vec_enc);
5861 vpaddw(dst, dst, xtmp1, vec_enc);
5862 }
5863
5864 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5865 XMMRegister xtmp2, Register rtmp, int vec_enc) {
5866 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5867 vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5868 vpsadbw(dst, xtmp1, xtmp2, vec_enc);
5869 }
5870
5871 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5872 XMMRegister xtmp2, Register rtmp, int vec_enc) {
5873 switch(bt) {
5874 case T_LONG:
5875 vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5876 break;
5877 case T_INT:
5878 vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5879 break;
5880 case T_CHAR:
5881 case T_SHORT:
5882 vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5883 break;
5884 case T_BYTE:
5885 case T_BOOLEAN:
5886 vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5887 break;
5888 default:
5889 fatal("Unsupported type %s", type2name(bt));
5890 break;
5891 }
5892 }
5893
5894 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5895 KRegister mask, bool merge, int vec_enc) {
5896 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5897 switch(bt) {
5898 case T_LONG:
5899 assert(VM_Version::supports_avx512_vpopcntdq(), "");
5900 evpopcntq(dst, mask, src, merge, vec_enc);
5901 break;
5902 case T_INT:
5903 assert(VM_Version::supports_avx512_vpopcntdq(), "");
5904 evpopcntd(dst, mask, src, merge, vec_enc);
5905 break;
5906 case T_CHAR:
5907 case T_SHORT:
5908 assert(VM_Version::supports_avx512_bitalg(), "");
5909 evpopcntw(dst, mask, src, merge, vec_enc);
5910 break;
5911 case T_BYTE:
5912 case T_BOOLEAN:
5913 assert(VM_Version::supports_avx512_bitalg(), "");
5914 evpopcntb(dst, mask, src, merge, vec_enc);
5915 break;
5916 default:
5917 fatal("Unsupported type %s", type2name(bt));
5918 break;
5919 }
5920 }
5921
5922 // The bit reversal algorithm first reverses the bits of each byte and then performs
5923 // a byte level reversal for multi-byte primitive types (short/int/long).
5924 // The algorithm performs a lookup table access to get the reversed bit sequence
5925 // corresponding to a 4 bit value. The reversed bit sequence of a byte is then
5926 // obtained by swapping the reversed bit sequences of its upper and lower
5927 // nibbles.
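// Worked example for a single byte, src = 0x35 (0b00110101):
// rev(lower nibble 0101) = 1010, rev(upper nibble 0011) = 1100,
// reversed byte = (1010 << 4) | 1100 = 0b10101100 = 0xAC.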
5928 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5929 XMMRegister xtmp2, Register rtmp, int vec_enc) {
5930 if (VM_Version::supports_avx512vlbw()) {
5931
5932 // Get the reverse bit sequence of lower nibble of each byte.
5933 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
5934 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5935 evpandq(dst, xtmp2, src, vec_enc);
5936 vpshufb(dst, xtmp1, dst, vec_enc);
5937 vpsllq(dst, dst, 4, vec_enc);
5938
5939 // Get the reverse bit sequence of upper nibble of each byte.
5940 vpandn(xtmp2, xtmp2, src, vec_enc);
5941 vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5942 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5943
5944 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
5945 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
5946 evporq(xtmp2, dst, xtmp2, vec_enc);
5947 vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5948
5949 } else if(vec_enc == Assembler::AVX_512bit) {
5950 // Shift based bit reversal.
5951 assert(bt == T_LONG || bt == T_INT, "");
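// Illustrative example of the shift based reversal for one byte, src = 0x35 (0b00110101):
// swap nibbles       -> 0b01010011
// swap 2-bit pairs   -> 0b01011100
// swap adjacent bits -> 0b10101100 = 0xAC, the bit-reversed value of 0x35.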
5952
5953 // Swap lower and upper nibble of each byte.
5954 vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);
5955
5956 // Swap two least and most significant bits of each nibble.
5957 vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);
5958
5959 // Swap adjacent pair of bits.
5960 evmovdqul(xtmp1, k0, dst, true, vec_enc);
5961 vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);
5962
5963 evmovdqul(xtmp1, k0, dst, true, vec_enc);
5964 vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
5965 } else {
5966 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
5967 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5968
5969 // Get the reverse bit sequence of lower nibble of each byte.
5970 vpand(dst, xtmp2, src, vec_enc);
5971 vpshufb(dst, xtmp1, dst, vec_enc);
5972 vpsllq(dst, dst, 4, vec_enc);
5973
5974 // Get the reverse bit sequence of upper nibble of each byte.
5975 vpandn(xtmp2, xtmp2, src, vec_enc);
5976 vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5977 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5978
5979 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
5980 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
5981 vpor(xtmp2, dst, xtmp2, vec_enc);
5982 vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5983 }
5984 }
5985
5986 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
5987 XMMRegister xtmp, Register rscratch) {
5988 assert(VM_Version::supports_gfni(), "");
5989 assert(rscratch != noreg || always_reachable(mask), "missing");
5990
5991 // Galois field instruction based bit reversal, following the algorithm described at
5992 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
5993 vpbroadcastq(xtmp, mask, vec_enc, rscratch);
5994 vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
5995 vector_reverse_byte(bt, dst, xtmp, vec_enc);
5996 }
5997
5998 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
5999 XMMRegister xtmp1, Register rtmp, int vec_enc) {
6000 vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
6001 evpandq(dst, xtmp1, src, vec_enc);
6002 vpsllq(dst, dst, nbits, vec_enc);
6003 vpandn(xtmp1, xtmp1, src, vec_enc);
6004 vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
6005 evporq(dst, dst, xtmp1, vec_enc);
6006 }
6007
6008 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6009 XMMRegister xtmp2, Register rtmp, int vec_enc) {
6010 // Shift based bit reversal.
6011 assert(VM_Version::supports_evex(), "");
6012 switch(bt) {
6013 case T_LONG:
6014 // Swap upper and lower double word of each quad word.
6015 evprorq(xtmp1, k0, src, 32, true, vec_enc);
6016 evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
6017 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
6018 break;
6019 case T_INT:
6020 // Swap upper and lower word of each double word.
6021 evprord(xtmp1, k0, src, 16, true, vec_enc);
6022 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
6023 break;
6024 case T_CHAR:
6025 case T_SHORT:
6026 // Swap upper and lower byte of each word.
6027 vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
6028 break;
6029 case T_BYTE:
6030 evmovdquq(dst, k0, src, true, vec_enc);
6031 break;
6032 default:
6033 fatal("Unsupported type %s", type2name(bt));
6034 break;
6035 }
6036 }
6037
6038 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
6039 if (bt == T_BYTE) {
6040 if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
6041 evmovdquq(dst, k0, src, true, vec_enc);
6042 } else {
6043 vmovdqu(dst, src);
6044 }
6045 return;
6046 }
6047 // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
6048 // pre-computed shuffle indices.
6049 switch(bt) {
6050 case T_LONG:
6051 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
6052 break;
6053 case T_INT:
6054 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
6055 break;
6056 case T_CHAR:
6057 case T_SHORT:
6058 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
6059 break;
6060 default:
6061 fatal("Unsupported type %s", type2name(bt));
6062 break;
6063 }
6064 vpshufb(dst, src, dst, vec_enc);
6065 }
6066
6067 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6068 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6069 KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
6070 assert(is_integral_type(bt), "");
6071 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
6072 assert(VM_Version::supports_avx512cd(), "");
6073 switch(bt) {
6074 case T_LONG:
6075 evplzcntq(dst, ktmp, src, merge, vec_enc);
6076 break;
6077 case T_INT:
6078 evplzcntd(dst, ktmp, src, merge, vec_enc);
6079 break;
6080 case T_SHORT:
6081 vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
6082 vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
6083 evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
6084 vpunpckhwd(dst, xtmp1, src, vec_enc);
6085 evplzcntd(dst, ktmp, dst, merge, vec_enc);
6086 vpackusdw(dst, xtmp2, dst, vec_enc);
6087 break;
6088 case T_BYTE:
6089 // T1 = Compute leading zero counts of 4 LSB bits of each byte by
6090 // accessing the lookup table.
6091 // T2 = Compute leading zero counts of 4 MSB bits of each byte by
6092 // accessing the lookup table.
6093 // Add T1 to T2 if 4 MSB bits of byte are all zeros.
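// For example, src byte = 0x0A (assuming the lookup table maps a nibble to its 4-bit leading
// zero count): lut[MSB nibble 0x0] = 4 and lut[LSB nibble 0xA] = 0; the MSB nibble is zero,
// so the counts are added and lzcnt = 4 + 0 = 4.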
6094 assert(VM_Version::supports_avx512bw(), "");
6095 evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
6096 vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
6097 vpand(xtmp2, dst, src, vec_enc);
6098 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
6099 vpsrlw(xtmp3, src, 4, vec_enc);
6100 vpand(xtmp3, dst, xtmp3, vec_enc);
6101 vpshufb(dst, xtmp1, xtmp3, vec_enc);
6102 vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6103 evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
6104 evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
6105 break;
6106 default:
6107 fatal("Unsupported type %s", type2name(bt));
6108 break;
6109 }
6110 }
6111
6112 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6113 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6114 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
6115 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
6116 // T1 = Compute leading zero counts of 4 LSB bits of each byte by
6117 // accessing the lookup table.
6118 vpand(dst, xtmp2, src, vec_enc);
6119 vpshufb(dst, xtmp1, dst, vec_enc);
6120 // T2 = Compute leading zero counts of 4 MSB bits of each byte by
6121 // accessing the lookup table.
6122 vpsrlw(xtmp3, src, 4, vec_enc);
6123 vpand(xtmp3, xtmp2, xtmp3, vec_enc);
6124 vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
6125 // Add T1 to T2 if 4 MSB bits of byte are all zeros.
6126 vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6127 vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
6128 vpaddb(dst, dst, xtmp2, vec_enc);
6129 vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
6130 }
6131
6132 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6133 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6134 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6135 // Add zero counts of lower byte and upper byte of a word if
6136 // upper byte holds a zero value.
6137 vpsrlw(xtmp3, src, 8, vec_enc);
6138 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
6139 vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
6140 vpsllw(xtmp2, dst, 8, vec_enc);
6141 vpaddw(xtmp2, xtmp2, dst, vec_enc);
6142 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6143 vpsrlw(dst, dst, 8, vec_enc);
6144 }
6145
6146 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6147 XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
6148 // Since the IEEE 754 floating point format represents the mantissa in normalized 1.x form,
6149 // the biased exponent can be used to compute the leading zero count as per the
6150 // following formula:
6151 // LZCNT = 31 - (biased_exp - 127)
6152 // Special handling has been introduced for Zero, Max_Int and -ve source values.
6153
6154 // Broadcast 0xFF
6155 vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);
6156 vpsrld(xtmp1, xtmp1, 24, vec_enc);
6157
  // Remove the bit to the right of the highest set bit, ensuring that the conversion to float
  // cannot round up to a higher power of 2 (which would have a larger exponent than the input).
  // This transformation is valid because only the highest set bit contributes to the leading
  // zero count.
6161 vpsrld(xtmp2, src, 1, vec_enc);
6162 vpandn(xtmp3, xtmp2, src, vec_enc);
6163
6164 // Extract biased exponent.
6165 vcvtdq2ps(dst, xtmp3, vec_enc);
6166 vpsrld(dst, dst, 23, vec_enc);
6167 vpand(dst, dst, xtmp1, vec_enc);
6168
6169 // Broadcast 127.
6170 vpsrld(xtmp1, xtmp1, 1, vec_enc);
6171 // Exponent = biased_exp - 127
6172 vpsubd(dst, dst, xtmp1, vec_enc);
6173
6174 // Exponent_plus_one = Exponent + 1
6175 vpsrld(xtmp3, xtmp1, 6, vec_enc);
6176 vpaddd(dst, dst, xtmp3, vec_enc);
6177
  // Replace a negative exponent with zero; the exponent is negative when the
  // source lane contains a zero value.
6180 vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
6181 vblendvps(dst, dst, xtmp2, dst, vec_enc);
6182
6183 // Rematerialize broadcast 32.
6184 vpslld(xtmp1, xtmp3, 5, vec_enc);
6185 // Exponent is 32 if corresponding source lane contains max_int value.
6186 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
6187 // LZCNT = 32 - exponent_plus_one
6188 vpsubd(dst, xtmp1, dst, vec_enc);
6189
  // Replace LZCNT with the value 1 if the corresponding source lane
  // contains the max_int value.
6192 vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc);
6193
6194 // Replace biased_exp with 0 if source lane value is less than zero.
6195 vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
6196 vblendvps(dst, dst, xtmp2, src, vec_enc);
6197 }
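
// For reference, a scalar sketch of the exponent-based computation above (illustrative only,
// not emitted code; assumes 32-bit int lanes, IEEE 754 float and <cstring> for memcpy):
//
//   static inline int clz32_via_float(uint32_t x) {
//     if ((int32_t)x < 0) return 0;            // negative lanes are handled by the final sign-based blend
//     if (x == 0)         return 32;           // zero lanes: the negative exponent is clamped to 0 above
//     uint32_t t = x & ~(x >> 1);              // clear the bit below the MSB so the conversion to float
//                                              // cannot round up to the next power of two
//     float f = (float)t;                      // vcvtdq2ps treats each lane as a signed int
//     uint32_t bits;
//     memcpy(&bits, &f, sizeof(bits));
//     int biased_exp = (bits >> 23) & 0xFF;    // extract the biased exponent
//     return 31 - (biased_exp - 127);          // LZCNT = 31 - unbiased exponent
//   }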
6198
6199 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6200 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6201 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6202 // Add zero counts of lower word and upper word of a double word if
6203 // upper word holds a zero value.
6204 vpsrld(xtmp3, src, 16, vec_enc);
6205 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
6206 vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc);
6207 vpslld(xtmp2, dst, 16, vec_enc);
6208 vpaddd(xtmp2, xtmp2, dst, vec_enc);
6209 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6210 vpsrld(dst, dst, 16, vec_enc);
6211 // Add zero counts of lower doubleword and upper doubleword of a
6212 // quadword if upper doubleword holds a zero value.
6213 vpsrlq(xtmp3, src, 32, vec_enc);
6214 vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc);
6215 vpsllq(xtmp2, dst, 32, vec_enc);
6216 vpaddq(xtmp2, xtmp2, dst, vec_enc);
6217 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6218 vpsrlq(dst, dst, 32, vec_enc);
6219 }
6220
6221 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
6222 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6223 Register rtmp, int vec_enc) {
6224 assert(is_integral_type(bt), "unexpected type");
6225 assert(vec_enc < Assembler::AVX_512bit, "");
6226 switch(bt) {
6227 case T_LONG:
6228 vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6229 break;
6230 case T_INT:
6231 vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
6232 break;
6233 case T_SHORT:
6234 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6235 break;
6236 case T_BYTE:
6237 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6238 break;
6239 default:
6240 fatal("Unsupported type %s", type2name(bt));
6241 break;
6242 }
6243 }
6244
6245 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
6246 switch(bt) {
6247 case T_BYTE:
6248 vpsubb(dst, src1, src2, vec_enc);
6249 break;
6250 case T_SHORT:
6251 vpsubw(dst, src1, src2, vec_enc);
6252 break;
6253 case T_INT:
6254 vpsubd(dst, src1, src2, vec_enc);
6255 break;
6256 case T_LONG:
6257 vpsubq(dst, src1, src2, vec_enc);
6258 break;
6259 default:
6260 fatal("Unsupported type %s", type2name(bt));
6261 break;
6262 }
6263 }
6264
// Trailing zero count computation is based on the leading zero count operation as per the
// following equation. All AVX3 targets support the AVX512CD feature, which offers a
// direct vector instruction to compute the leading zero count.
// CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
6269 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6270 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6271 XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
6272 assert(is_integral_type(bt), "");
6273 // xtmp = -1
6274 vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
6275 // xtmp = xtmp + src
6276 vpadd(bt, xtmp4, xtmp4, src, vec_enc);
6277 // xtmp = xtmp & ~src
6278 vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
6279 vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
6280 vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
6281 vpsub(bt, dst, xtmp4, dst, vec_enc);
6282 }
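
// For reference, a scalar sketch of the equation above (illustrative only, not emitted code;
// clz32 stands in for the per-lane vplzcnt operation):
//
//   static inline int ctz32_via_clz(uint32_t x) {
//     uint32_t m = (x - 1) & ~x;    // sets exactly the bits below the lowest set bit of x
//     return 32 - clz32(m);         // e.g. x = 0b1000 -> m = 0b0111, clz = 29, ctz = 3
//   }                               //      x = 0      -> m = 0xFFFFFFFF, clz = 0, ctz = 32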
6283
// Trailing zero count computation for AVX2 targets is based on the popcount operation as per the following equation:
// CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
6286 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6287 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6288 assert(is_integral_type(bt), "");
6289 // xtmp = 0
  vpxor(xtmp3, xtmp3, xtmp3, vec_enc);
6291 // xtmp = 0 - src
6292 vpsub(bt, xtmp3, xtmp3, src, vec_enc);
6293 // xtmp = xtmp | src
6294 vpor(xtmp3, xtmp3, src, vec_enc);
6295 vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
6296 vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
6297 vpsub(bt, dst, xtmp1, dst, vec_enc);
6298 }
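
// For reference, a scalar sketch of the equation above (illustrative only, not emitted code;
// popcount32 stands in for the vector popcount computed above):
//
//   static inline int ctz32_via_popcount(uint32_t x) {
//     uint32_t m = x | (0u - x);    // sets the lowest set bit of x and every bit above it
//     return 32 - popcount32(m);    // e.g. x = 0b1000 -> m = 0xFFFFFFF8, popcount = 29, ctz = 3
//   }                               //      x = 0      -> m = 0, popcount = 0, ctz = 32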
6299
6300 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
6301 Label done;
6302 Label neg_divisor_fastpath;
6303 cmpl(divisor, 0);
6304 jccb(Assembler::less, neg_divisor_fastpath);
6305 xorl(rdx, rdx);
6306 divl(divisor);
6307 jmpb(done);
6308 bind(neg_divisor_fastpath);
6309 // Fastpath for divisor < 0:
6310 // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6311 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
6312 movl(rdx, rax);
6313 subl(rdx, divisor);
6314 if (VM_Version::supports_bmi1()) {
6315 andnl(rax, rdx, rax);
6316 } else {
6317 notl(rdx);
6318 andl(rax, rdx);
6319 }
6320 shrl(rax, 31);
6321 bind(done);
6322 }
6323
6324 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
6325 Label done;
6326 Label neg_divisor_fastpath;
6327 cmpl(divisor, 0);
6328 jccb(Assembler::less, neg_divisor_fastpath);
6329 xorl(rdx, rdx);
6330 divl(divisor);
6331 jmpb(done);
6332 bind(neg_divisor_fastpath);
6333 // Fastpath when divisor < 0:
6334 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6335 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6336 movl(rdx, rax);
6337 subl(rax, divisor);
6338 if (VM_Version::supports_bmi1()) {
6339 andnl(rax, rax, rdx);
6340 } else {
6341 notl(rax);
6342 andl(rax, rdx);
6343 }
6344 sarl(rax, 31);
6345 andl(rax, divisor);
6346 subl(rdx, rax);
6347 bind(done);
6348 }
6349
6350 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
6351 Label done;
6352 Label neg_divisor_fastpath;
6353
6354 cmpl(divisor, 0);
6355 jccb(Assembler::less, neg_divisor_fastpath);
6356 xorl(rdx, rdx);
6357 divl(divisor);
6358 jmpb(done);
6359 bind(neg_divisor_fastpath);
6360 // Fastpath for divisor < 0:
6361 // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6362 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6363 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6364 // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6365 movl(rdx, rax);
6366 subl(rax, divisor);
6367 if (VM_Version::supports_bmi1()) {
6368 andnl(rax, rax, rdx);
6369 } else {
6370 notl(rax);
6371 andl(rax, rdx);
6372 }
6373 movl(tmp, rax);
6374 shrl(rax, 31); // quotient
6375 sarl(tmp, 31);
6376 andl(tmp, divisor);
6377 subl(rdx, tmp); // remainder
6378 bind(done);
6379 }
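
// For reference, a scalar sketch of the negative-divisor fast path shared by udivI, umodI and
// udivmodI above (illustrative only, not emitted code). When the divisor has its sign bit set,
// its unsigned value is at least 2^31, so the unsigned quotient can only be 0 or 1:
//
//   static inline uint32_t udiv_neg_divisor(uint32_t dividend, uint32_t divisor) {
//     // quotient = (dividend & ~(dividend - divisor)) >>> 31
//     return (dividend & ~(dividend - divisor)) >> 31;
//   }
//
//   static inline uint32_t umod_neg_divisor(uint32_t dividend, uint32_t divisor) {
//     uint32_t q = udiv_neg_divisor(dividend, divisor);   // 0 or 1
//     return dividend - (q ? divisor : 0);                // remainder = dividend - q * divisor
//   }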
6380
6381 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
6382 XMMRegister xtmp2, Register rtmp) {
  if (VM_Version::supports_gfni()) {
    // Bit reversal using a Galois field affine instruction, based on the following algorithm:
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6386 mov64(rtmp, 0x8040201008040201L);
6387 movq(xtmp1, src);
6388 movq(xtmp2, rtmp);
6389 gf2p8affineqb(xtmp1, xtmp2, 0);
6390 movq(dst, xtmp1);
6391 } else {
6392 // Swap even and odd numbered bits.
6393 movl(rtmp, src);
6394 andl(rtmp, 0x55555555);
6395 shll(rtmp, 1);
6396 movl(dst, src);
6397 andl(dst, 0xAAAAAAAA);
6398 shrl(dst, 1);
6399 orl(dst, rtmp);
6400
6401 // Swap LSB and MSB 2 bits of each nibble.
6402 movl(rtmp, dst);
6403 andl(rtmp, 0x33333333);
6404 shll(rtmp, 2);
6405 andl(dst, 0xCCCCCCCC);
6406 shrl(dst, 2);
6407 orl(dst, rtmp);
6408
6409 // Swap LSB and MSB 4 bits of each byte.
6410 movl(rtmp, dst);
6411 andl(rtmp, 0x0F0F0F0F);
6412 shll(rtmp, 4);
6413 andl(dst, 0xF0F0F0F0);
6414 shrl(dst, 4);
6415 orl(dst, rtmp);
6416 }
6417 bswapl(dst);
6418 }
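
// For reference, a scalar sketch of the non-GFNI path above (illustrative only, not emitted code):
//
//   static inline uint32_t reverse_bits32(uint32_t x) {
//     x = ((x & 0x55555555u) << 1) | ((x & 0xAAAAAAAAu) >> 1);   // swap even and odd bits
//     x = ((x & 0x33333333u) << 2) | ((x & 0xCCCCCCCCu) >> 2);   // swap 2-bit pairs within each nibble
//     x = ((x & 0x0F0F0F0Fu) << 4) | ((x & 0xF0F0F0F0u) >> 4);   // swap nibbles within each byte
//     // Finally reverse the byte order, as done by bswapl.
//     return (x << 24) | ((x & 0xFF00u) << 8) | ((x >> 8) & 0xFF00u) | (x >> 24);
//   }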
6419
6420 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
6421 XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
  if (VM_Version::supports_gfni()) {
    // Bit reversal using a Galois field affine instruction, based on the following algorithm:
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6425 mov64(rtmp1, 0x8040201008040201L);
6426 movq(xtmp1, src);
6427 movq(xtmp2, rtmp1);
6428 gf2p8affineqb(xtmp1, xtmp2, 0);
6429 movq(dst, xtmp1);
6430 } else {
6431 // Swap even and odd numbered bits.
6432 movq(rtmp1, src);
6433 mov64(rtmp2, 0x5555555555555555L);
6434 andq(rtmp1, rtmp2);
6435 shlq(rtmp1, 1);
6436 movq(dst, src);
6437 notq(rtmp2);
6438 andq(dst, rtmp2);
6439 shrq(dst, 1);
6440 orq(dst, rtmp1);
6441
6442 // Swap LSB and MSB 2 bits of each nibble.
6443 movq(rtmp1, dst);
6444 mov64(rtmp2, 0x3333333333333333L);
6445 andq(rtmp1, rtmp2);
6446 shlq(rtmp1, 2);
6447 notq(rtmp2);
6448 andq(dst, rtmp2);
6449 shrq(dst, 2);
6450 orq(dst, rtmp1);
6451
6452 // Swap LSB and MSB 4 bits of each byte.
6453 movq(rtmp1, dst);
6454 mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
6455 andq(rtmp1, rtmp2);
6456 shlq(rtmp1, 4);
6457 notq(rtmp2);
6458 andq(dst, rtmp2);
6459 shrq(dst, 4);
6460 orq(dst, rtmp1);
6461 }
6462 bswapq(dst);
6463 }
6464
6465 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
6466 Label done;
6467 Label neg_divisor_fastpath;
6468 cmpq(divisor, 0);
6469 jccb(Assembler::less, neg_divisor_fastpath);
6470 xorl(rdx, rdx);
6471 divq(divisor);
6472 jmpb(done);
6473 bind(neg_divisor_fastpath);
6474 // Fastpath for divisor < 0:
6475 // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6476 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
6477 movq(rdx, rax);
6478 subq(rdx, divisor);
6479 if (VM_Version::supports_bmi1()) {
6480 andnq(rax, rdx, rax);
6481 } else {
6482 notq(rdx);
6483 andq(rax, rdx);
6484 }
6485 shrq(rax, 63);
6486 bind(done);
6487 }
6488
6489 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
6490 Label done;
6491 Label neg_divisor_fastpath;
6492 cmpq(divisor, 0);
6493 jccb(Assembler::less, neg_divisor_fastpath);
6494 xorq(rdx, rdx);
6495 divq(divisor);
6496 jmp(done);
6497 bind(neg_divisor_fastpath);
6498 // Fastpath when divisor < 0:
6499 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6500 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6501 movq(rdx, rax);
6502 subq(rax, divisor);
6503 if (VM_Version::supports_bmi1()) {
6504 andnq(rax, rax, rdx);
6505 } else {
6506 notq(rax);
6507 andq(rax, rdx);
6508 }
6509 sarq(rax, 63);
6510 andq(rax, divisor);
6511 subq(rdx, rax);
6512 bind(done);
6513 }
6514
6515 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
6516 Label done;
6517 Label neg_divisor_fastpath;
6518 cmpq(divisor, 0);
6519 jccb(Assembler::less, neg_divisor_fastpath);
6520 xorq(rdx, rdx);
6521 divq(divisor);
6522 jmp(done);
6523 bind(neg_divisor_fastpath);
6524 // Fastpath for divisor < 0:
6525 // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6526 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6527 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6528 // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6529 movq(rdx, rax);
6530 subq(rax, divisor);
6531 if (VM_Version::supports_bmi1()) {
6532 andnq(rax, rax, rdx);
6533 } else {
6534 notq(rax);
6535 andq(rax, rdx);
6536 }
6537 movq(tmp, rax);
6538 shrq(rax, 63); // quotient
6539 sarq(tmp, 63);
6540 andq(tmp, divisor);
6541 subq(rdx, tmp); // remainder
6542 bind(done);
6543 }
6544
6545 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
6546 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
6547 int vlen_enc) {
6548 assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are in-lane operations and the in-lane index is determined by the
  // lower 4 bits of each shuffle lane, thus all shuffle indices are effectively
  // normalized to the index range 0-15. This means that shuffle indices which are
  // congruent modulo 16 select the same relative position within a 128-bit lane,
  // i.e. elements corresponding to shuffle indices 0, 16, 32 and 48 all select the
  // first element of their respective 128-bit lanes.
6555 movl(rtmp, 16);
6556 evpbroadcastb(xtmp1, rtmp, vlen_enc);
6557
  // Compute a mask for the shuffle vector by comparing indices against the expression INDEX < 16.
  // Broadcast the first 128-bit lane across the entire vector, shuffle the vector lanes using
  // the original shuffle indices, and move the shuffled lanes corresponding to a true
  // mask to the destination vector.
6562 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
6563 evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
6564 evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
6565
  // Repeat the above steps with the lane comparison expression INDEX >= 16 && INDEX < 32,
  // broadcasting the second 128-bit lane.
6568 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
6569 vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
6570 evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
6571 evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
6572 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6573
  // Repeat the above steps with the lane comparison expression INDEX >= 32 && INDEX < 48,
  // broadcasting the third 128-bit lane.
6576 evpcmpb(ktmp, k0, shuffle, xtmp2, Assembler::nlt, true, vlen_enc);
6577 vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
6578 evpcmpb(ktmp, ktmp, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
6579 evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
6580 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6581
  // Repeat the above steps with the lane comparison expression INDEX >= 48 && INDEX < 64,
  // broadcasting the fourth 128-bit lane.
6584 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
6585 vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
6586 evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
6587 evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
6588 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6589 }
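
// For reference, a scalar sketch of the overall effect of the routine above (illustrative only,
// not emitted code; assumes 512-bit vectors modelled as 64-byte arrays and shuffle indices
// already wrapped into the range 0..63):
//
//   static void rearrange_bytes_ref(uint8_t dst[64], const uint8_t shuffle[64], const uint8_t src[64]) {
//     for (int i = 0; i < 64; i++) {
//       dst[i] = src[shuffle[i]];   // each index may select a byte from any of the four 128-bit lanes
//     }
//   }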
6590
6591 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
6592 XMMRegister shuffle, XMMRegister src, int vlen_enc) {
6593 if (vlen_enc == AVX_128bit) {
6594 vpermilps(dst, src, shuffle, vlen_enc);
6595 } else if (bt == T_INT) {
6596 vpermd(dst, shuffle, src, vlen_enc);
6597 } else {
6598 assert(bt == T_FLOAT, "");
6599 vpermps(dst, shuffle, src, vlen_enc);
6600 }
6601 }
6602
6603 void C2_MacroAssembler::efp16sh(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
6604 switch(opcode) {
6605 case Op_AddHF: vaddsh(dst, src1, src2); break;
6606 case Op_SubHF: vsubsh(dst, src1, src2); break;
6607 case Op_MulHF: vmulsh(dst, src1, src2); break;
6608 case Op_DivHF: vdivsh(dst, src1, src2); break;
6609 default: assert(false, "%s", NodeClassNames[opcode]); break;
6610 }
6611 }
6612
6613 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6614 switch(elem_bt) {
6615 case T_BYTE:
6616 if (ideal_opc == Op_SaturatingAddV) {
6617 vpaddsb(dst, src1, src2, vlen_enc);
6618 } else {
6619 assert(ideal_opc == Op_SaturatingSubV, "");
6620 vpsubsb(dst, src1, src2, vlen_enc);
6621 }
6622 break;
6623 case T_SHORT:
6624 if (ideal_opc == Op_SaturatingAddV) {
6625 vpaddsw(dst, src1, src2, vlen_enc);
6626 } else {
6627 assert(ideal_opc == Op_SaturatingSubV, "");
6628 vpsubsw(dst, src1, src2, vlen_enc);
6629 }
6630 break;
6631 default:
6632 fatal("Unsupported type %s", type2name(elem_bt));
6633 break;
6634 }
6635 }
6636
6637 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6638 switch(elem_bt) {
6639 case T_BYTE:
6640 if (ideal_opc == Op_SaturatingAddV) {
6641 vpaddusb(dst, src1, src2, vlen_enc);
6642 } else {
6643 assert(ideal_opc == Op_SaturatingSubV, "");
6644 vpsubusb(dst, src1, src2, vlen_enc);
6645 }
6646 break;
6647 case T_SHORT:
6648 if (ideal_opc == Op_SaturatingAddV) {
6649 vpaddusw(dst, src1, src2, vlen_enc);
6650 } else {
6651 assert(ideal_opc == Op_SaturatingSubV, "");
6652 vpsubusw(dst, src1, src2, vlen_enc);
6653 }
6654 break;
6655 default:
6656 fatal("Unsupported type %s", type2name(elem_bt));
6657 break;
6658 }
6659 }
6660
6661 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6662 XMMRegister src2, KRegister ktmp, int vlen_enc) {
  // For unsigned subtraction, overflow happens when the second input is greater than the first input.
6664 // overflow_mask = Inp1 <u Inp2
6665 evpcmpu(elem_bt, ktmp, src2, src1, Assembler::lt, vlen_enc);
6666 // Res = overflow_mask ? Zero : INP1 - INP2 (non-commutative and non-associative)
6667 evmasked_op(elem_bt == T_INT ? Op_SubVI : Op_SubVL, elem_bt, ktmp, dst, src1, src2, false, vlen_enc, false);
6668 }
6669
6670 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6671 XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
6672 // Emulate unsigned comparison using signed comparison
  // Mask = Inp1 <u Inp2  =>  Inp1 + MIN_VALUE <s Inp2 + MIN_VALUE
6674 vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc, true);
6675 vpadd(elem_bt, xtmp2, src1, xtmp1, vlen_enc);
6676 vpadd(elem_bt, xtmp1, src2, xtmp1, vlen_enc);
6677
6678 vpcmpgt(elem_bt, xtmp2, xtmp1, xtmp2, vlen_enc);
6679
6680 // Res = INP1 - INP2 (non-commutative and non-associative)
6681 vpsub(elem_bt, dst, src1, src2, vlen_enc);
6682 // Res = Mask ? Zero : Res
6683 vpxor(xtmp1, xtmp1, xtmp1, vlen_enc);
6684 vpblendvb(dst, dst, xtmp1, xtmp2, vlen_enc);
6685 }
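
// For reference, a scalar sketch of the AVX lowering above (illustrative only, not emitted code;
// shown for 32-bit lanes, where adding MIN_VALUE flips the sign bit so the unsigned compare can
// be emulated with a signed one):
//
//   static inline uint32_t sat_sub_unsigned32(uint32_t a, uint32_t b) {
//     bool underflow = (int32_t)(a + 0x80000000u) < (int32_t)(b + 0x80000000u);   // a <u b
//     return underflow ? 0u : a - b;                                              // saturate to zero
//   }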
6686
6687 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6688 XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp, int vlen_enc) {
  // The unsigned value range comprises only non-negative numbers, thus only upper-bound saturation exists.
6690 // overflow_mask = (SRC1 + SRC2) <u (SRC1 | SRC2)
6691 // Res = Signed Add INP1, INP2
6692 vpadd(elem_bt, dst, src1, src2, vlen_enc);
6693 // T1 = SRC1 | SRC2
6694 vpor(xtmp1, src1, src2, vlen_enc);
6695 // Max_Unsigned = -1
6696 vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6697 // Unsigned compare: Mask = Res <u T1
6698 evpcmpu(elem_bt, ktmp, dst, xtmp1, Assembler::lt, vlen_enc);
6699 // res = Mask ? Max_Unsigned : Res
6700 evpblend(elem_bt, dst, ktmp, dst, xtmp2, true, vlen_enc);
6701 }
6702
//
// Section 2-13 of Hacker's Delight lists the following overflow detection check for the
// saturating unsigned addition operation:
// overflow_mask = ((a & b) | ((a | b) & ~( a + b))) >>> 31 == 1
//
// We empirically determined its semantic equivalence to the following reduced expression
// overflow_mask = (a + b) <u (a | b)
//
// and also verified it through the Alive2 solver.
// (https://alive2.llvm.org/ce/z/XDQ7dY)
//
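
// For reference, a scalar sketch of the reduced check (illustrative only, not emitted code;
// shown for 32-bit lanes):
//
//   static inline uint32_t sat_add_unsigned32(uint32_t a, uint32_t b) {
//     uint32_t sum = a + b;                    // wrapping add
//     bool overflow = sum < (a | b);           // (a + b) <u (a | b) holds exactly when the add wrapped
//     return overflow ? 0xFFFFFFFFu : sum;     // saturate to the unsigned maximum on overflow
//   }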
6714
6715 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6716 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, int vlen_enc) {
6717 // Res = Signed Add INP1, INP2
6718 vpadd(elem_bt, dst, src1, src2, vlen_enc);
6719 // Compute T1 = INP1 | INP2
6720 vpor(xtmp3, src1, src2, vlen_enc);
  // xtmp2 = minimum signed value (xtmp1 is left holding all ones, i.e. the unsigned max).
6722 vpgenmin_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
6723 // Convert T1 to signed value, T1 = T1 + MIN_VALUE
6724 vpadd(elem_bt, xtmp3, xtmp3, xtmp2, vlen_enc);
6725 // Convert Res to signed value, Res<s> = Res + MIN_VALUE
6726 vpadd(elem_bt, xtmp2, xtmp2, dst, vlen_enc);
  // Compute overflow detection mask = Res<s> <s T1
6728 if (elem_bt == T_INT) {
6729 vpcmpgtd(xtmp3, xtmp3, xtmp2, vlen_enc);
6730 } else {
6731 assert(elem_bt == T_LONG, "");
6732 vpcmpgtq(xtmp3, xtmp3, xtmp2, vlen_enc);
6733 }
6734 vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
6735 }
6736
6737 void C2_MacroAssembler::evpmovq2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
6738 int vlen_enc, bool xtmp2_hold_M1) {
6739 if (VM_Version::supports_avx512dq()) {
6740 evpmovq2m(ktmp, src, vlen_enc);
6741 } else {
6742 assert(VM_Version::supports_evex(), "");
6743 if (!xtmp2_hold_M1) {
6744 vpternlogq(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6745 }
6746 evpsraq(xtmp1, src, 63, vlen_enc);
6747 evpcmpeqq(ktmp, k0, xtmp1, xtmp2, vlen_enc);
6748 }
6749 }
6750
6751 void C2_MacroAssembler::evpmovd2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
6752 int vlen_enc, bool xtmp2_hold_M1) {
6753 if (VM_Version::supports_avx512dq()) {
6754 evpmovd2m(ktmp, src, vlen_enc);
6755 } else {
6756 assert(VM_Version::supports_evex(), "");
6757 if (!xtmp2_hold_M1) {
6758 vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6759 }
6760 vpsrad(xtmp1, src, 31, vlen_enc);
6761 Assembler::evpcmpeqd(ktmp, k0, xtmp1, xtmp2, vlen_enc);
6762 }
6763 }
6764
6765
6766 void C2_MacroAssembler::vpsign_extend_dq(BasicType elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
6767 if (elem_bt == T_LONG) {
6768 if (VM_Version::supports_evex()) {
6769 evpsraq(dst, src, 63, vlen_enc);
6770 } else {
6771 vpsrad(dst, src, 31, vlen_enc);
6772 vpshufd(dst, dst, 0xF5, vlen_enc);
6773 }
6774 } else {
6775 assert(elem_bt == T_INT, "");
6776 vpsrad(dst, src, 31, vlen_enc);
6777 }
6778 }
6779
6780 void C2_MacroAssembler::vpgenmax_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
6781 if (compute_allones) {
6782 if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
6783 vpternlogd(allones, 0xff, allones, allones, vlen_enc);
6784 } else {
6785 vpcmpeqq(allones, allones, allones, vlen_enc);
6786 }
6787 }
6788 if (elem_bt == T_LONG) {
6789 vpsrlq(dst, allones, 1, vlen_enc);
6790 } else {
6791 assert(elem_bt == T_INT, "");
6792 vpsrld(dst, allones, 1, vlen_enc);
6793 }
6794 }
6795
6796 void C2_MacroAssembler::vpgenmin_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
6797 if (compute_allones) {
6798 if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
6799 vpternlogd(allones, 0xff, allones, allones, vlen_enc);
6800 } else {
6801 vpcmpeqq(allones, allones, allones, vlen_enc);
6802 }
6803 }
6804 if (elem_bt == T_LONG) {
6805 vpsllq(dst, allones, 63, vlen_enc);
6806 } else {
6807 assert(elem_bt == T_INT, "");
6808 vpslld(dst, allones, 31, vlen_enc);
6809 }
6810 }
6811
6812 void C2_MacroAssembler::evpcmpu(BasicType elem_bt, KRegister kmask, XMMRegister src1, XMMRegister src2,
6813 Assembler::ComparisonPredicate cond, int vlen_enc) {
6814 switch(elem_bt) {
6815 case T_LONG: evpcmpuq(kmask, src1, src2, cond, vlen_enc); break;
6816 case T_INT: evpcmpud(kmask, src1, src2, cond, vlen_enc); break;
6817 case T_SHORT: evpcmpuw(kmask, src1, src2, cond, vlen_enc); break;
6818 case T_BYTE: evpcmpub(kmask, src1, src2, cond, vlen_enc); break;
6819 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6820 }
6821 }
6822
6823 void C2_MacroAssembler::vpcmpgt(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6824 switch(elem_bt) {
6825 case T_LONG: vpcmpgtq(dst, src1, src2, vlen_enc); break;
6826 case T_INT: vpcmpgtd(dst, src1, src2, vlen_enc); break;
6827 case T_SHORT: vpcmpgtw(dst, src1, src2, vlen_enc); break;
6828 case T_BYTE: vpcmpgtb(dst, src1, src2, vlen_enc); break;
6829 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6830 }
6831 }
6832
6833 void C2_MacroAssembler::evpmov_vec_to_mask(BasicType elem_bt, KRegister ktmp, XMMRegister src, XMMRegister xtmp1,
6834 XMMRegister xtmp2, int vlen_enc, bool xtmp2_hold_M1) {
6835 if (elem_bt == T_LONG) {
6836 evpmovq2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6837 } else {
6838 assert(elem_bt == T_INT, "");
6839 evpmovd2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6840 }
6841 }
6842
6843 void C2_MacroAssembler::vector_addsub_dq_saturating_evex(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6844 XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
6845 KRegister ktmp1, KRegister ktmp2, int vlen_enc) {
6846 assert(elem_bt == T_INT || elem_bt == T_LONG, "");
  // Addition/subtraction happens over the two's complement representation of numbers and is agnostic to signedness.
  // Overflow detection is based on Hacker's Delight, section 2-13.
6849 if (ideal_opc == Op_SaturatingAddV) {
6850 // res = src1 + src2
6851 vpadd(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs if both inputs have the same sign and the result's sign differs from it.
    // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
6854 vpxor(xtmp1, dst, src1, vlen_enc);
6855 vpxor(xtmp2, dst, src2, vlen_enc);
6856 vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6857 } else {
6858 assert(ideal_opc == Op_SaturatingSubV, "");
6859 // res = src1 - src2
6860 vpsub(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs when the inputs have opposite signs and the
    // result's sign differs from the sign of the first input.
    // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
6864 vpxor(xtmp1, src1, src2, vlen_enc);
6865 vpxor(xtmp2, dst, src1, vlen_enc);
6866 vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6867 }
6868
6869 // Compute overflow detection mask.
6870 evpmov_vec_to_mask(elem_bt, ktmp1, xtmp2, xtmp2, xtmp1, vlen_enc);
  // Note: xtmp1 holds -1 in all its lanes after the above call.
6872
6873 // Compute mask based on first input polarity.
6874 evpmov_vec_to_mask(elem_bt, ktmp2, src1, xtmp2, xtmp1, vlen_enc, true);
6875
6876 vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
6877 vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);
6878
  // Compose a vector of saturating (MAX/MIN) values, where lanes corresponding to
  // set bits in the first-input polarity mask hold the MIN value.
6881 evpblend(elem_bt, xtmp2, ktmp2, xtmp2, xtmp1, true, vlen_enc);
6882 // Blend destination lanes with saturated values using overflow detection mask.
6883 evpblend(elem_bt, dst, ktmp1, dst, xtmp2, true, vlen_enc);
6884 }
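
// For reference, a scalar sketch of the signed saturating lowering above (illustrative only,
// not emitted code; the 32-bit add case is shown and <cstdint> is assumed, the subtract case
// differs only in the overflow predicate):
//
//   static inline int32_t sat_add_signed32(int32_t a, int32_t b) {
//     int32_t res = (int32_t)((uint32_t)a + (uint32_t)b);   // wrapping two's complement add
//     bool overflow = ((res ^ a) & (res ^ b)) < 0;          // same-sign inputs, result sign differs
//     // On overflow, saturate toward the sign of the first input: MIN_VALUE if a < 0, else MAX_VALUE.
//     int32_t sat = (a < 0) ? INT32_MIN : INT32_MAX;
//     return overflow ? sat : res;
//   }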
6885
6886
6887 void C2_MacroAssembler::vector_addsub_dq_saturating_avx(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6888 XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
6889 XMMRegister xtmp3, XMMRegister xtmp4, int vlen_enc) {
6890 assert(elem_bt == T_INT || elem_bt == T_LONG, "");
  // Addition/subtraction happens over the two's complement representation of numbers and is agnostic to signedness.
  // Overflow detection is based on Hacker's Delight, section 2-13.
6893 if (ideal_opc == Op_SaturatingAddV) {
6894 // res = src1 + src2
6895 vpadd(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs if both inputs have the same sign and the result's sign differs from it.
    // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
6898 vpxor(xtmp1, dst, src1, vlen_enc);
6899 vpxor(xtmp2, dst, src2, vlen_enc);
6900 vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6901 } else {
6902 assert(ideal_opc == Op_SaturatingSubV, "");
6903 // res = src1 - src2
6904 vpsub(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs when the inputs have opposite signs and the
    // result's sign differs from the sign of the first input.
    // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
6908 vpxor(xtmp1, src1, src2, vlen_enc);
6909 vpxor(xtmp2, dst, src1, vlen_enc);
6910 vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6911 }
6912
6913 // Sign-extend to compute overflow detection mask.
6914 vpsign_extend_dq(elem_bt, xtmp3, xtmp2, vlen_enc);
6915
6916 vpcmpeqd(xtmp1, xtmp1, xtmp1, vlen_enc);
6917 vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc);
6918 vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);
6919
6920 // Compose saturating min/max vector using first input polarity mask.
6921 vpsign_extend_dq(elem_bt, xtmp4, src1, vlen_enc);
6922 vpblendvb(xtmp1, xtmp2, xtmp1, xtmp4, vlen_enc);
6923
6924 // Blend result with saturating vector using overflow detection mask.
6925 vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
6926 }
6927
6928 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
6929 switch(elem_bt) {
6930 case T_BYTE:
6931 if (ideal_opc == Op_SaturatingAddV) {
6932 vpaddsb(dst, src1, src2, vlen_enc);
6933 } else {
6934 assert(ideal_opc == Op_SaturatingSubV, "");
6935 vpsubsb(dst, src1, src2, vlen_enc);
6936 }
6937 break;
6938 case T_SHORT:
6939 if (ideal_opc == Op_SaturatingAddV) {
6940 vpaddsw(dst, src1, src2, vlen_enc);
6941 } else {
6942 assert(ideal_opc == Op_SaturatingSubV, "");
6943 vpsubsw(dst, src1, src2, vlen_enc);
6944 }
6945 break;
6946 default:
6947 fatal("Unsupported type %s", type2name(elem_bt));
6948 break;
6949 }
6950 }
6951
6952 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
6953 switch(elem_bt) {
6954 case T_BYTE:
6955 if (ideal_opc == Op_SaturatingAddV) {
6956 vpaddusb(dst, src1, src2, vlen_enc);
6957 } else {
6958 assert(ideal_opc == Op_SaturatingSubV, "");
6959 vpsubusb(dst, src1, src2, vlen_enc);
6960 }
6961 break;
6962 case T_SHORT:
6963 if (ideal_opc == Op_SaturatingAddV) {
6964 vpaddusw(dst, src1, src2, vlen_enc);
6965 } else {
6966 assert(ideal_opc == Op_SaturatingSubV, "");
6967 vpsubusw(dst, src1, src2, vlen_enc);
6968 }
6969 break;
6970 default:
6971 fatal("Unsupported type %s", type2name(elem_bt));
6972 break;
6973 }
6974 }
6975
6976 void C2_MacroAssembler::select_from_two_vectors_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6977 XMMRegister src2, int vlen_enc) {
6978 switch(elem_bt) {
6979 case T_BYTE:
6980 evpermi2b(dst, src1, src2, vlen_enc);
6981 break;
6982 case T_SHORT:
6983 evpermi2w(dst, src1, src2, vlen_enc);
6984 break;
6985 case T_INT:
6986 evpermi2d(dst, src1, src2, vlen_enc);
6987 break;
6988 case T_LONG:
6989 evpermi2q(dst, src1, src2, vlen_enc);
6990 break;
6991 case T_FLOAT:
6992 evpermi2ps(dst, src1, src2, vlen_enc);
6993 break;
6994 case T_DOUBLE:
6995 evpermi2pd(dst, src1, src2, vlen_enc);
6996 break;
6997 default:
6998 fatal("Unsupported type %s", type2name(elem_bt));
6999 break;
7000 }
7001 }
7002
7003 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, bool is_unsigned, int vlen_enc) {
7004 if (is_unsigned) {
7005 vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7006 } else {
7007 vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7008 }
7009 }
7010
7011 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, bool is_unsigned, int vlen_enc) {
7012 if (is_unsigned) {
7013 vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7014 } else {
7015 vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7016 }
7017 }
7018
7019 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
7020 switch(opcode) {
7021 case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
7022 case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
7023 case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
7024 case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
7025 default: assert(false, "%s", NodeClassNames[opcode]); break;
7026 }
7027 }
7028
7029 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
7030 switch(opcode) {
7031 case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
7032 case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
7033 case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
7034 case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
7035 default: assert(false, "%s", NodeClassNames[opcode]); break;
7036 }
7037 }
7038
7039 void C2_MacroAssembler::scalar_max_min_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
7040 KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2) {
7041 vector_max_min_fp16(opcode, dst, src1, src2, ktmp, xtmp1, xtmp2, Assembler::AVX_128bit);
7042 }
7043
7044 void C2_MacroAssembler::vector_max_min_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
7045 KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
7046 if (opcode == Op_MaxVHF || opcode == Op_MaxHF) {
7047 // Move sign bits of src2 to mask register.
7048 evpmovw2m(ktmp, src2, vlen_enc);
7049 // xtmp1 = src2 < 0 ? src2 : src1
7050 evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
    // xtmp2 = src2 < 0 ? src1 : src2
7052 evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
    // The idea behind the above swapping is to make the second source operand a +ve value.
    // As per the instruction semantics, if the values being compared are both 0.0s (of either sign), the value in
    // the second source operand is returned. If only one value is a NaN (SNaN or QNaN) for this instruction,
    // the second source operand, either a NaN or a valid floating-point value, is returned.
7057 // dst = max(xtmp1, xtmp2)
7058 evmaxph(dst, xtmp1, xtmp2, vlen_enc);
7059 // isNaN = is_unordered_quiet(xtmp1)
7060 evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
    // The final result is the same as the first source if it is a NaN value;
    // in case the second operand holds a NaN value, then as per the above semantics
    // the result is the same as the second operand.
7064 Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
7065 } else {
7066 assert(opcode == Op_MinVHF || opcode == Op_MinHF, "");
7067 // Move sign bits of src1 to mask register.
7068 evpmovw2m(ktmp, src1, vlen_enc);
7069 // xtmp1 = src1 < 0 ? src2 : src1
7070 evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
7071 // xtmp2 = src1 < 0 ? src1 : src2
7072 evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
    // The idea behind the above swapping is to make the second source operand a -ve value.
7074 // As per instruction semantics, if the values being compared are both 0.0s (of either sign), the value in
7075 // the second source operand is returned.
7076 // If only one value is a NaN (SNaN or QNaN) for this instruction, the second source operand, either a NaN
7077 // or a valid floating-point value, is written to the result.
7078 // dst = min(xtmp1, xtmp2)
7079 evminph(dst, xtmp1, xtmp2, vlen_enc);
7080 // isNaN = is_unordered_quiet(xtmp1)
7081 evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
    // The final result is the same as the first source if it is a NaN value;
    // in case the second operand holds a NaN value, then as per the above semantics
    // the result is the same as the second operand.
7085 Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
7086 }
7087 }
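
// For reference, a scalar sketch of the Max case above (illustrative only, not emitted code;
// float stands in for the half-precision lanes, <cmath> is assumed, and hw_max models the
// vmaxph semantics quoted above: when both inputs are zeros of either sign, or when a NaN is
// involved, the second operand is returned). The operand swap puts the preferred value second
// and the final unordered check restores Java semantics (NaN if either input is NaN, -0.0 < +0.0):
//
//   static inline float java_max_ref(float a, float b, float (*hw_max)(float, float)) {
//     bool b_sign = std::signbit(b);
//     float x = b_sign ? b : a;      // xtmp1 = src2 < 0 ? src2 : src1
//     float y = b_sign ? a : b;      // xtmp2 = src2 < 0 ? src1 : src2
//     float r = hw_max(x, y);        // dst = max(xtmp1, xtmp2)
//     return (x != x) ? x : r;       // if xtmp1 is NaN, the result is NaN (final masked move)
//   }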