/*
 * Copyright (c) 2020, 2026, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "oops/methodData.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/output.hpp"
#include "opto/opcodes.hpp"
#include "opto/subnode.hpp"
#include "runtime/globals.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/objectMonitorTable.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/synchronizer.hpp"
#include "utilities/checkedCast.hpp"
#include "utilities/globalDefinitions.hpp"
#include "utilities/powerOfTwo.hpp"
#include "utilities/sizes.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

// C2 compiled method's prolog code.
// Beware! This sp_inc is NOT the same as the one mentioned in MacroAssembler::remove_frame, but only the size
// of the extension space + the additional copy of the return address. That means it doesn't contain the
// frame size (where the locals and spills are) nor the saved RBP.
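//
// A sketch of the frame this prolog builds on the common path below where rbp
// is pushed first (slot placement is an illustration; the authoritative layout
// is whatever the code below emits):
//
//    [return address]     <- rsp on entry
//    [saved rbp]          <- push(rbp)
//    [spills/locals ...]  <- rsp -= framesize (already reduced by the
//                            return-address and rbp words)
//    [stack-repair word]  =  sp_inc + framesize, if needs_stack_repair()
//    [0xbadb100d cookie]  <- if VerifyStackAtCalls
//                         <- rsp after the prolog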
void C2_MacroAssembler::verified_entry(Compile* C, int sp_inc) {
  if (C->clinit_barrier_on_entry()) {
    assert(VM_Version::supports_fast_class_init_checks(), "sanity");
    assert(!C->method()->holder()->is_not_initialized(), "initialization should have been started");

    Label L_skip_barrier;
    Register klass = rscratch1;

    mov_metadata(klass, C->method()->holder()->constant_encoding());
    clinit_barrier(klass, &L_skip_barrier /*L_fast_path*/);

    jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path

    bind(L_skip_barrier);
  }

  int framesize = C->output()->frame_size_in_bytes();
  int bangsize = C->output()->bang_size_in_bytes();
  bool fp_mode_24b = false;
  int stack_bang_size = C->output()->need_stack_bang(bangsize) ? bangsize : 0;

  assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");

  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  // Remove word for return addr
  framesize -= wordSize;
  stack_bang_size -= wordSize;

  // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers must bang for them. But be careful, because
  // some VM calls (such as call site linkage) can use several kilobytes of
  // stack. The stack safety zone should account for that.
  // See bugs 4446381, 4468289, 4497237.
  if (stack_bang_size > 0) {
    generate_stack_overflow_check(stack_bang_size);

    // We always push rbp so that on return to the interpreter rbp will be
    // restored correctly, allowing us to correct the stack.
    push(rbp);
#ifdef ASSERT
    if (sp_inc > 0) {
      movl(Address(rsp, 0), badRegWordVal);
      movl(Address(rsp, VMRegImpl::stack_slot_size), badRegWordVal);
    }
#endif
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      mov(rbp, rsp);
    }
    // Remove word for ebp
    framesize -= wordSize;

    // Create frame
    if (framesize) {
      subptr(rsp, framesize);
    }
  } else {
    subptr(rsp, framesize);

    // Save RBP register now.
    framesize -= wordSize;
    movptr(Address(rsp, framesize), rbp);
#ifdef ASSERT
    if (sp_inc > 0) {
      movl(Address(rsp, framesize), badRegWordVal);
      movl(Address(rsp, framesize + VMRegImpl::stack_slot_size), badRegWordVal);
    }
#endif
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      movptr(rbp, rsp);
      if (framesize > 0) {
        addptr(rbp, framesize);
      }
    }
  }

  if (C->needs_stack_repair()) {
    // Save stack increment just below the saved rbp (also account for fixed framesize and rbp)
    assert((sp_inc & (StackAlignmentInBytes-1)) == 0, "stack increment not aligned");
    movptr(Address(rsp, framesize - wordSize), sp_inc + framesize);
  }

  if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
    framesize -= wordSize;
    movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
  }

#ifdef ASSERT
  if (VerifyStackAtCalls) {
    Label L;
    push(rax);
    mov(rax, rsp);
    andptr(rax, StackAlignmentInBytes-1);
    cmpptr(rax, StackAlignmentInBytes-wordSize);
    pop(rax);
    jcc(Assembler::equal, L);
    STOP("Stack is not properly aligned!");
    bind(L);
  }
#endif
}

void C2_MacroAssembler::entry_barrier() {
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
  Label dummy_slow_path;
  Label dummy_continuation;
  Label* slow_path = &dummy_slow_path;
  Label* continuation = &dummy_continuation;
  if (!Compile::current()->output()->in_scratch_emit_size()) {
    // Use real labels from actual stub when not emitting code for the purpose of measuring its size
    C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
    Compile::current()->output()->add_stub(stub);
    slow_path = &stub->entry();
    continuation = &stub->continuation();
  }
  bs->nmethod_entry_barrier(this, slow_path, continuation);
}

inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
  switch (vlen_in_bytes) {
    case  4: // fall-through
    case  8: // fall-through
    case 16: return Assembler::AVX_128bit;
    case 32: return Assembler::AVX_256bit;
    case 64: return Assembler::AVX_512bit;

    default: {
      ShouldNotReachHere();
      return Assembler::AVX_NoVec;
    }
  }
}

// fast_lock and fast_unlock used by C2

// Because the transitions from emitted code to the runtime
// monitorenter/exit helper stubs are so slow it's critical that
// we inline both the lock-stack fast path and the inflated fast path.
//
// See also: cmpFastLock and cmpFastUnlock.
//
// What follows is a specialized inline transliteration of the code
// in enter() and exit(). If we're concerned about I$ bloat another
// option would be to emit TrySlowEnter and TrySlowExit methods
// at startup-time. These methods would accept arguments as
// (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
// indications in the icc.ZFlag. fast_lock and fast_unlock would simply
// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
// In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
//
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods. That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads. We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) explicit barriers or fence operations.
//
// TODO:
//
// * Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
//   This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
//   Given TLAB allocation, Self is usually manifested in a register, so passing it into
//   the lock operators would typically be faster than reifying Self.
//
// * Ideally I'd define the primitives as:
//   fast_lock (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
//   fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED.
//   Unfortunately ADLC bugs prevent us from expressing the ideal form.
//   Instead, we're stuck with the rather awkward and brittle register assignments below.
//   Furthermore the register assignments are overconstrained, possibly resulting in
//   sub-optimal code near the synchronization site.
//
// * Eliminate the sp-proximity tests and just use "== Self" tests instead.
//   Alternately, use a better sp-proximity test.
//
// * Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
//   Either one is sufficient to uniquely identify a thread.
//   TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
//
// * Intrinsify notify() and notifyAll() for the common cases where the
//   object is locked by the calling thread but the waitlist is empty.
//   Avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
//
// * Use jccb and jmpb instead of jcc and jmp to improve code density.
//   But beware of excessive branch density on AMD Opterons.
//
// * Both fast_lock and fast_unlock set the ICC.ZF to indicate success
//   or failure of the fast path. If the fast path fails then we pass
//   control to the slow path, typically in C. In fast_lock and
//   fast_unlock we often branch to DONE_LABEL, just to find that C2
//   will emit a conditional branch immediately after the node.
//   So we have branches to branches and lots of ICC.ZF games.
//   Instead, it might be better to have C2 pass a "FailureLabel"
//   into fast_lock and fast_unlock. In the case of success, control
//   will drop through the node. ICC.ZF is undefined at exit.
//   In the case of failure, the node will branch directly to the
//   FailureLabel.

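// A rough sketch (an illustration of the contract, not the actual ADL) of how
// the cmpFastLock node consumes the ZF protocol described above:
//
//   fast_lock(obj, box, rax, t, thread);  // sets ZF
//   // ZF == 1: lock acquired, fall through
//   // ZF == 0: branch to the slow path, which calls the runtime monitorenter
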
// obj: object to lock
// box: on-stack box address -- KILLED
// rax: tmp -- KILLED
// t  : tmp -- KILLED
void C2_MacroAssembler::fast_lock(Register obj, Register box, Register rax_reg,
                                  Register t, Register thread) {
  assert(rax_reg == rax, "Used for CAS");
  assert_different_registers(obj, box, rax_reg, t, thread);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. ZF value is irrelevant.
  Label locked;
  // Finish fast lock unsuccessfully. MUST jump with ZF == 0
  Label slow_path;

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds or we need to take the slow-path.
    movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0);
  }

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(rax_reg, obj, t);
    testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
    jcc(Assembler::notZero, slow_path);
  }

  const Register mark = t;

  { // Fast Lock

    Label push;

    const Register top = UseObjectMonitorTable ? rax_reg : box;

    // Load the mark.
    movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Prefetch top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    // Check for monitor (0b10).
    testptr(mark, markWord::monitor_value);
    jcc(Assembler::notZero, inflated);

    // Check if lock-stack is full.
    cmpl(top, LockStack::end_offset() - 1);
    jcc(Assembler::greater, slow_path);

    // Check if recursive.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    jccb(Assembler::equal, push);

    // Try to lock. Transition lock bits 0b01 => 0b00
    movptr(rax_reg, mark);
    orptr(rax_reg, markWord::unlocked_value);
    andptr(mark, ~(int32_t)markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    jcc(Assembler::notEqual, slow_path);

    if (UseObjectMonitorTable) {
      // Need to reload top, clobbered by CAS.
      movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
    }
    bind(push);
    // After successful lock, push object on lock-stack.
    movptr(Address(thread, top), obj);
    addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
    jmp(locked);
  }
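
  // Worked sketch of the CAS above, using the markWord low lock bits
  // (0b01 = unlocked, 0b00 = fast-locked, 0b10 = monitor):
  //   expected (rax)  = header | 0b01
  //   desired  (mark) = header | 0b00
  // The cmpxchg only succeeds while the object is still unlocked; any other
  // observed state sends us to slow_path with ZF == 0.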

  { // Handle inflated monitor.
    bind(inflated);

    const Register monitor = t;

    if (!UseObjectMonitorTable) {
      assert(mark == monitor, "should be the same here");
    } else {
      const Register hash = t;
      Label monitor_found;

      // Look for the monitor in the om_cache.

      ByteSize cache_offset = JavaThread::om_cache_oops_offset();
      ByteSize monitor_offset = OMCache::oop_to_monitor_difference();
      const int num_unrolled = OMCache::CAPACITY;
      for (int i = 0; i < num_unrolled; i++) {
        movptr(monitor, Address(thread, cache_offset + monitor_offset));
        cmpptr(obj, Address(thread, cache_offset));
        jccb(Assembler::equal, monitor_found);
        cache_offset = cache_offset + OMCache::oop_to_oop_difference();
      }
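
      // In spirit, the unrolled loop above is this C-like sketch (illustrative
      // only; the cache holds OMCache::CAPACITY (oop, monitor) pairs):
      //
      //   for (int i = 0; i < CAPACITY; i++) {
      //     if (om_cache[i].oop == obj) goto monitor_found; // monitor preloaded
      //   }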

      // Look for the monitor in the table.

      // Get the hash code.
      movptr(hash, Address(obj, oopDesc::mark_offset_in_bytes()));
      shrq(hash, markWord::hash_shift);
      andq(hash, markWord::hash_mask);

      // Get the table and calculate the bucket's address.
      lea(rax_reg, ExternalAddress(ObjectMonitorTable::current_table_address()));
      movptr(rax_reg, Address(rax_reg));
      andq(hash, Address(rax_reg, ObjectMonitorTable::table_capacity_mask_offset()));
      movptr(rax_reg, Address(rax_reg, ObjectMonitorTable::table_buckets_offset()));
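
      // Indexing sketch: for a power-of-two capacity C the stored mask is
      // C - 1, so with e.g. C = 1024 and hash = 0x2a77 (illustrative values)
      // the bucket index is 0x2a77 & 0x3ff = 0x277, and the load below reads
      // buckets[0x277], scaled by pointer size.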

      // Read the monitor from the bucket.
      movptr(monitor, Address(rax_reg, hash, Address::times_ptr));

      // Check if the monitor in the bucket is special (empty, tombstone or removed)
      cmpptr(monitor, ObjectMonitorTable::SpecialPointerValues::below_is_special);
      jcc(Assembler::below, slow_path);

      // Check if object matches.
      movptr(rax_reg, Address(monitor, ObjectMonitor::object_offset()));
      BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
      bs_asm->try_resolve_weak_handle_in_c2(this, rax_reg, slow_path);
      cmpptr(rax_reg, obj);
      jcc(Assembler::notEqual, slow_path);

      bind(monitor_found);
    }
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag);
    const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag);

    Label monitor_locked;
    // Lock the monitor.

    if (UseObjectMonitorTable) {
      // Cache the monitor for unlock before trashing box. On failure to acquire
      // the lock, the slow path will reset the entry accordingly (see CacheSetter).
      movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor);
    }

    // Try to CAS owner (no owner => current thread's _monitor_owner_id).
    xorptr(rax_reg, rax_reg);
    movptr(box, Address(thread, JavaThread::monitor_owner_id_offset()));
    lock(); cmpxchgptr(box, owner_address);
    jccb(Assembler::equal, monitor_locked);

    // Check if recursive.
    cmpptr(box, rax_reg);
    jccb(Assembler::notEqual, slow_path);

    // Recursive.
    increment(recursions_address);

    bind(monitor_locked);
  }

  bind(locked);
  // Set ZF = 1
  xorl(rax_reg, rax_reg);

#ifdef ASSERT
  // Check that locked label is reached with ZF set.
  Label zf_correct;
  Label zf_bad_zero;
  jcc(Assembler::zero, zf_correct);
  jmp(zf_bad_zero);
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with ZF not set.
  jcc(Assembler::notZero, zf_correct);
  stop("Fast Lock ZF != 0");
  bind(zf_bad_zero);
  stop("Fast Lock ZF != 1");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}

// obj: object to unlock
// rax: tmp -- KILLED
// t  : tmp - cannot be obj nor rax -- KILLED
//
// Some commentary on balanced locking:
//
// fast_lock and fast_unlock are emitted only for provably balanced lock sites.
// Methods that don't have provably balanced locking are forced to run in the
// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
// The interpreter provides two properties:
// I1: At return-time the interpreter automatically and quietly unlocks any
//     objects acquired in the current activation (frame). Recall that the
//     interpreter maintains an on-stack list of locks currently held by
//     a frame.
// I2: If a method attempts to unlock an object that is not held by the
//     frame the interpreter throws IMSX.
//
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
// B() doesn't have provably balanced locking so it runs in the interpreter.
// Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
// is still locked by A().
//
// The only other source of unbalanced locking would be JNI. The "Java Native Interface
// Specification" states that an object locked by JNI's MonitorEnter should not be
// unlocked by "normal" java-level locking and vice-versa. The specification doesn't
// say what will occur if a program engages in such mixed-mode locking, however.
// Arguably, given that the spec legislates the JNI case as undefined, our implementation
// could reasonably *avoid* checking the owner in fast_unlock().
// In the interest of performance we elide the m->Owner == Self check in unlock.
// A perfectly viable alternative is to elide the owner check except when
// Xcheck:jni is enabled.

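// Java-like pseudo-code of the balanced shape C2 relies on (a hypothetical
// illustration, not code from this repository):
//
//   void a() {
//     synchronized (o) {   // compiled lock site: acquire O
//       b();               // b() may run interpreted; I1/I2 keep it balanced
//     }                    // provably matching unlock of O in the same frame
//   }
//
// Sites like the one in a() are the only ones that get fast_lock/fast_unlock.
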
void C2_MacroAssembler::fast_unlock(Register obj, Register reg_rax, Register t, Register thread) {
  assert(reg_rax == rax, "Used for CAS");
  assert_different_registers(obj, reg_rax, t);

  // Handle inflated monitor.
  Label inflated, inflated_check_lock_stack;
  // Finish fast unlock successfully. MUST jump with ZF == 1
  Label unlocked, slow_path;

  const Register mark = t;
  const Register monitor = t;
  const Register top = UseObjectMonitorTable ? t : reg_rax;
  const Register box = reg_rax;

  Label dummy;
  C2FastUnlockStub* stub = nullptr;

  if (!Compile::current()->output()->in_scratch_emit_size()) {
    stub = new (Compile::current()->comp_arena()) C2FastUnlockStub(obj, mark, reg_rax, thread);
    Compile::current()->output()->add_stub(stub);
  }

  Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();

  { // Fast Unlock

    // Load top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    if (!UseObjectMonitorTable) {
      // Prefetch mark.
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }

    // Check if obj is top of lock-stack.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    // Top of lock stack was not obj. Must be monitor.
    jcc(Assembler::notEqual, inflated_check_lock_stack);

    // Pop lock-stack.
    DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
    subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);

    // Check if recursive.
    cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
    jcc(Assembler::equal, unlocked);

    // We elide the monitor check, let the CAS fail instead.

    if (UseObjectMonitorTable) {
      // Load mark.
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }

    // Try to unlock. Transition lock bits 0b00 => 0b01
    movptr(reg_rax, mark);
    andptr(reg_rax, ~(int32_t)markWord::lock_mask_in_place);
    orptr(mark, markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    jcc(Assembler::notEqual, push_and_slow_path);
    jmp(unlocked);
  }
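
  // Sketch of the unlock CAS above, mirroring the lock case:
  //   expected (rax)  = header | 0b00   (fast-locked)
  //   desired  (mark) = header | 0b01   (unlocked)
  // If the mark changed underneath us (e.g. the lock was inflated to 0b10
  // concurrently), the CAS fails and we re-push the oop and go slow.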

  { // Handle inflated monitor.
    bind(inflated_check_lock_stack);
#ifdef ASSERT
    Label check_done;
    subl(top, oopSize);
    cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
    jcc(Assembler::below, check_done);
    cmpptr(obj, Address(thread, top));
    jcc(Assembler::notEqual, inflated_check_lock_stack);
    stop("Fast Unlock lock on stack");
    bind(check_done);
    if (UseObjectMonitorTable) {
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }
    testptr(mark, markWord::monitor_value);
    jcc(Assembler::notZero, inflated);
    stop("Fast Unlock not monitor");
#endif

    bind(inflated);

    if (!UseObjectMonitorTable) {
      assert(mark == monitor, "should be the same here");
    } else {
      // Uses ObjectMonitorTable. Look for the monitor in our BasicLock on the stack.
      movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
      // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*)
      cmpptr(monitor, alignof(ObjectMonitor*));
      jcc(Assembler::below, slow_path);
    }
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag};
    const Address succ_address{monitor, ObjectMonitor::succ_offset() - monitor_tag};
    const Address entry_list_address{monitor, ObjectMonitor::entry_list_offset() - monitor_tag};
    const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag};

    Label recursive;

    // Check if recursive.
    cmpptr(recursions_address, 0);
    jcc(Assembler::notZero, recursive);

    // Set owner to null.
    // Release to satisfy the JMM
    movptr(owner_address, NULL_WORD);
    // We need a full fence after clearing owner to avoid stranding.
    // StoreLoad achieves this.
    membar(StoreLoad);
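
    // Why the fence matters; a sketch of the stranding race it prevents
    // (an illustrative interleaving, not code from this file):
    //   unlocker: store owner = null ... load entry_list == empty -> return
    //   locker:   append self to entry_list; load owner != null   -> park
    // Without StoreLoad the unlocker's entry_list load may be ordered before
    // its owner store, so both threads pass their checks and the locker
    // parks with nobody left to wake it.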

    // Check if the entry_list is empty.
    cmpptr(entry_list_address, NULL_WORD);
    jcc(Assembler::zero, unlocked); // If so we are done.

    // Check if there is a successor.
    cmpptr(succ_address, NULL_WORD);
    jcc(Assembler::notZero, unlocked); // If so we are done.

    // Save the monitor pointer in the current thread, so we can try to
    // reacquire the lock in SharedRuntime::monitor_exit_helper().
    if (!UseObjectMonitorTable) {
      andptr(monitor, ~(int32_t)markWord::monitor_value);
    }
    movptr(Address(thread, JavaThread::unlocked_inflated_monitor_offset()), monitor);

    orl(t, 1); // Fast Unlock ZF = 0
    jmpb(slow_path);

    // Recursive unlock.
    bind(recursive);
    decrement(recursions_address);
  }

  bind(unlocked);
  xorl(t, t); // Fast Unlock ZF = 1

#ifdef ASSERT
  // Check that unlocked label is reached with ZF set.
  Label zf_correct;
  Label zf_bad_zero;
  jcc(Assembler::zero, zf_correct);
  jmp(zf_bad_zero);
#endif

  bind(slow_path);
  if (stub != nullptr) {
    bind(stub->slow_path_continuation());
  }
#ifdef ASSERT
  // Check that stub->continuation() label is reached with ZF not set.
  jcc(Assembler::notZero, zf_correct);
  stop("Fast Unlock ZF != 0");
  bind(zf_bad_zero);
  stop("Fast Unlock ZF != 1");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}

static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) {
  fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi);
}

static void reconstruct_frame_pointer_helper(MacroAssembler* masm, Register dst) {
  const int framesize = Compile::current()->output()->frame_size_in_bytes();
  masm->movptr(dst, rsp);
  if (framesize > 2 * wordSize) {
    masm->addptr(dst, framesize - 2 * wordSize);
  }
}
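
// Arithmetic sketch for the helper above: frame_size_in_bytes includes the
// return-address word and the saved rbp, so the reconstructed frame pointer is
//   dst = rsp + framesize - 2 * wordSize
// e.g. (illustrative values) framesize = 0x40, wordSize = 8 gives dst = rsp + 0x30.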

void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) {
  if (PreserveFramePointer) {
    // frame pointer is valid
#ifdef ASSERT
    // Verify frame pointer value in rbp.
    reconstruct_frame_pointer_helper(this, rtmp);
    Label L_success;
    cmpq(rbp, rtmp);
    jccb(Assembler::equal, L_success);
    STOP("frame pointer mismatch");
    bind(L_success);
#endif // ASSERT
  } else {
    reconstruct_frame_pointer_helper(this, rbp);
  }
}

void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register val) {
  jint lo = t->_lo;
  jint hi = t->_hi;
  assert(lo < hi, "type should not be empty or constant, idx: %u, lo: %d, hi: %d", idx, lo, hi);
  if (t == TypeInt::INT) {
    return;
  }

  BLOCK_COMMENT("CastII {");
  Label fail;
  Label succeed;

  if (lo != min_jint) {
    cmpl(val, lo);
    jccb(Assembler::less, fail);
  }
  if (hi != max_jint) {
    cmpl(val, hi);
    jccb(Assembler::greater, fail);
  }
  jmpb(succeed);

  bind(fail);
  movl(c_rarg0, idx);
  movl(c_rarg1, val);
  movl(c_rarg2, lo);
  movl(c_rarg3, hi);
  reconstruct_frame_pointer(rscratch1);
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_int_in_range)));
  hlt();
  bind(succeed);
  BLOCK_COMMENT("} // CastII");
}
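
// Sketch of the sequence emitted above for an illustrative range [0, 99]
// (each bound compare is skipped when the bound is min_jint/max_jint):
//   cmpl val, 0    ; jl  fail
//   cmpl val, 99   ; jg  fail
//   jmp  succeed
//   fail: marshal (idx, val, lo, hi) into c_rarg0..c_rarg3 and call
//         abort_verify_int_in_range, then hlt()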

static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) {
  fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi);
}

void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register val, Register tmp) {
  jlong lo = t->_lo;
  jlong hi = t->_hi;
  assert(lo < hi, "type should not be empty or constant, idx: %u, lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, lo, hi);
  if (t == TypeLong::LONG) {
    return;
  }

  BLOCK_COMMENT("CastLL {");
  Label fail;
  Label succeed;

  auto cmp_val = [&](jlong bound) {
    if (is_simm32(bound)) {
      cmpq(val, checked_cast<int>(bound));
    } else {
      mov64(tmp, bound);
      cmpq(val, tmp);
    }
  };

  if (lo != min_jlong) {
    cmp_val(lo);
    jccb(Assembler::less, fail);
  }
  if (hi != max_jlong) {
    cmp_val(hi);
    jccb(Assembler::greater, fail);
  }
  jmpb(succeed);

  bind(fail);
  movl(c_rarg0, idx);
  movq(c_rarg1, val);
  mov64(c_rarg2, lo);
  mov64(c_rarg3, hi);
  reconstruct_frame_pointer(rscratch1);
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_long_in_range)));
  hlt();
  bind(succeed);
  BLOCK_COMMENT("} // CastLL");
}

//-------------------------------------------------------------------------------------------
// Generic instruction support for C2 code generation from .ad files

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVD) {
    andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
  } else {
    assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
  }
}

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
  if (opcode == Op_AbsVD) {
    vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
  } else {
    assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
    vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVF) {
    andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
  } else {
    assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
  if (opcode == Op_AbsVF) {
    vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
  } else {
    assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
    vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
  }
}

void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
  assert(tmp == xnoreg || elem_bt == T_LONG, "unused");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      pminsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pminsw(dst, src);
    } else if (elem_bt == T_INT) {
      pminsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, dst);
      pcmpgtq(xmm0, src);
      blendvpd(dst, src); // xmm0 as mask
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      pmaxsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pmaxsw(dst, src);
    } else if (elem_bt == T_INT) {
      pmaxsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, src);
      pcmpgtq(xmm0, dst);
      blendvpd(dst, src); // xmm0 as mask
    }
  }
}

void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
                                  XMMRegister src1, Address src2, int vlen_enc) {
  assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
  if (opcode == Op_UMinV) {
    switch(elem_bt) {
      case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
      case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
      case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
      case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
      default: fatal("Unsupported type %s", type2name(elem_bt)); break;
    }
  } else {
    assert(opcode == Op_UMaxV, "required");
    switch(elem_bt) {
      case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
      case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
      case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
      case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
      default: fatal("Unsupported type %s", type2name(elem_bt)); break;
    }
  }
}

void C2_MacroAssembler::vpuminmaxq(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  // For optimality, leverage a full vector width of 512 bits
  // for operations over smaller vector sizes on AVX512 targets.
  if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
    if (opcode == Op_UMaxV) {
      evpmaxuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
    } else {
      assert(opcode == Op_UMinV, "required");
      evpminuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
    }
  } else {
    // T1 = -1
    vpcmpeqq(xtmp1, xtmp1, xtmp1, vlen_enc);
    // T1 = -1 << 63
    vpsllq(xtmp1, xtmp1, 63, vlen_enc);
    // Convert SRC2 to signed value i.e. T2 = T1 + SRC2
    vpaddq(xtmp2, xtmp1, src2, vlen_enc);
    // Convert SRC1 to signed value i.e. T1 = T1 + SRC1
    vpaddq(xtmp1, xtmp1, src1, vlen_enc);
    // Mask = T2 > T1
    vpcmpgtq(xtmp1, xtmp2, xtmp1, vlen_enc);
    if (opcode == Op_UMaxV) {
      // Res = Mask ? Src2 : Src1
      vpblendvb(dst, src1, src2, xtmp1, vlen_enc);
    } else {
      // Res = Mask ? Src1 : Src2
      vpblendvb(dst, src2, src1, xtmp1, vlen_enc);
    }
  }
}
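
// Worked example of the sign-bias trick above (illustrative values): with a
// plain signed compare, x = 0xFFFFFFFFFFFFFFFF reads as -1, so x < y for
// y = 1 even though x is the larger unsigned value. Adding the bias 1 << 63
// to both (the vpaddq steps) gives x' = 0x7FFFFFFFFFFFFFFF and
// y' = 0x8000000000000001, and the signed vpcmpgtq now orders them correctly.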

void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
                                  XMMRegister src1, XMMRegister src2, int vlen_enc) {
  assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
  if (opcode == Op_UMinV) {
    switch(elem_bt) {
      case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
      case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
      case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
      case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
      default: fatal("Unsupported type %s", type2name(elem_bt)); break;
    }
  } else {
    assert(opcode == Op_UMaxV, "required");
    switch(elem_bt) {
      case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
      case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
      case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
      case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
      default: fatal("Unsupported type %s", type2name(elem_bt)); break;
    }
  }
}

void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
                                 XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                 int vlen_enc) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      vpminsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpminsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpminsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpminsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src1, src2, dst, vlen_enc);
      }
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      vpmaxsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpmaxsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpmaxsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpmaxsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src2, src1, dst, vlen_enc);
      }
    }
  }
}

// Float/Double min max

void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
                                   XMMRegister dst, XMMRegister a, XMMRegister b,
                                   XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                   int vlen_enc) {
  assert(UseAVX > 0, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(a, tmp, atmp, btmp);
  assert_different_registers(b, tmp, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);

  /* Note on 'non-obvious' assembly sequence:
   *
   * While there are vminps/vmaxps instructions, there are two important differences between hardware
   * and Java on how they handle floats:
   *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
   *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
   *
   * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
   *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
   *     (only useful when signs differ, noop otherwise)
   *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
   *
   * The following pseudo code describes the algorithm for max[FD] (the min algorithm is on similar lines):
   *   btmp = (b < +0.0) ? a : b
   *   atmp = (b < +0.0) ? b : a
   *   Tmp  = Max_Float(atmp, btmp)
   *   Res  = (atmp == NaN) ? atmp : Tmp
   */
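
  /* Worked example for max[F] (a sketch with illustrative values):
   *  - a = +0.0, b = -0.0: the blends steer the non-negative zero into btmp,
   *    and vmaxps returns its second operand on equal inputs, so the result
   *    is +0.0 as Java requires.
   *  - a = NaN, b = 1.0: atmp picks up the NaN; vmaxps alone would have
   *    returned 1.0, but the vcmp UNORD_Q self-compare flags atmp's NaN and
   *    the final blend selects atmp, so the NaN propagates.
   */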

  void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
  void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
  void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
  XMMRegister mask;

  if (!is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vminps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (!is_double_word && !is_min) {
    mask = b;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vmaxps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vminpd;
    vcmp = &MacroAssembler::vcmppd;
  } else {
    assert(is_double_word && !is_min, "sanity");
    mask = b;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vmaxpd;
    vcmp = &MacroAssembler::vcmppd;
  }

  // Make sure EnableX86ECoreOpts isn't disabled on register overlaps
  XMMRegister maxmin, scratch;
  if (dst == btmp) {
    maxmin = btmp;
    scratch = tmp;
  } else {
    maxmin = tmp;
    scratch = btmp;
  }

  bool precompute_mask = EnableX86ECoreOpts && UseAVX > 1;
  if (precompute_mask && !is_double_word) {
    vpsrad(tmp, mask, 32, vlen_enc);
    mask = tmp;
  } else if (precompute_mask && is_double_word) {
    vpxor(tmp, tmp, tmp, vlen_enc);
    vpcmpgtq(tmp, tmp, mask, vlen_enc);
    mask = tmp;
  }

  (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
  (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
  (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
  (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
  (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
}

void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
                                    XMMRegister dst, XMMRegister a, XMMRegister b,
                                    KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
                                    int vlen_enc) {
  assert(UseAVX > 2, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(dst, a, atmp, btmp);
  assert_different_registers(dst, b, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);
  bool merge = true;

  if (!is_double_word && is_min) {
    evpmovd2m(ktmp, a, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vminps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (!is_double_word && !is_min) {
    evpmovd2m(ktmp, b, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (is_double_word && is_min) {
    evpmovq2m(ktmp, a, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vminpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    evpmovq2m(ktmp, b, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  }
}

void C2_MacroAssembler::vminmax_fp_avx10_2(int opc, BasicType elem_bt, XMMRegister dst, KRegister mask,
                                           XMMRegister src1, XMMRegister src2, int vlen_enc) {
  assert(opc == Op_MinV || opc == Op_MinReductionV ||
         opc == Op_MaxV || opc == Op_MaxReductionV, "sanity");

  int imm8 = (opc == Op_MinV || opc == Op_MinReductionV) ? AVX10_2_MINMAX_MIN_COMPARE_SIGN
                                                         : AVX10_2_MINMAX_MAX_COMPARE_SIGN;
  if (elem_bt == T_FLOAT) {
    evminmaxps(dst, mask, src1, src2, true, imm8, vlen_enc);
  } else {
    assert(elem_bt == T_DOUBLE, "");
    evminmaxpd(dst, mask, src1, src2, true, imm8, vlen_enc);
  }
}

void C2_MacroAssembler::sminmax_fp_avx10_2(int opc, BasicType elem_bt, XMMRegister dst, KRegister mask,
                                           XMMRegister src1, XMMRegister src2) {
  assert(opc == Op_MinF || opc == Op_MaxF ||
         opc == Op_MinD || opc == Op_MaxD, "sanity");

  int imm8 = (opc == Op_MinF || opc == Op_MinD) ? AVX10_2_MINMAX_MIN_COMPARE_SIGN
                                                : AVX10_2_MINMAX_MAX_COMPARE_SIGN;
  if (elem_bt == T_FLOAT) {
    evminmaxss(dst, mask, src1, src2, true, imm8);
  } else {
    assert(elem_bt == T_DOUBLE, "");
    evminmaxsd(dst, mask, src1, src2, true, imm8);
  }
}

// Float/Double signum
void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
  assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");

  Label DONE_LABEL;

  // Handle the special cases +0.0/-0.0 and NaN: if the argument is +0.0/-0.0 or NaN, return the argument.
  // If the AVX10.2 (or newer) floating point comparison instructions are used, SF=1 for the equal and unordered cases;
  // with the other floating point comparison instructions, ZF=1 for the equal and unordered cases.
  if (opcode == Op_SignumF) {
    if (VM_Version::supports_avx10_2()) {
      evucomxss(dst, zero);
      jcc(Assembler::negative, DONE_LABEL);
    } else {
      ucomiss(dst, zero);
      jcc(Assembler::equal, DONE_LABEL);
    }
    movflt(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
  } else if (opcode == Op_SignumD) {
    if (VM_Version::supports_avx10_2()) {
      evucomxsd(dst, zero);
      jcc(Assembler::negative, DONE_LABEL);
    } else {
      ucomisd(dst, zero);
      jcc(Assembler::equal, DONE_LABEL);
    }
    movdbl(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
  }

  bind(DONE_LABEL);
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
  if (sign) {
    pmovsxbw(dst, src);
  } else {
    pmovzxbw(dst, src);
  }
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbw(dst, src, vector_len);
  } else {
    vpmovzxbw(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbd(dst, src, vector_len);
  } else {
    vpmovzxbd(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxwd(dst, src, vector_len);
  } else {
    vpmovzxwd(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
                                     int shift, int vector_len) {
  if (opcode == Op_RotateLeftV) {
    if (etype == T_INT) {
      evprold(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprolq(dst, src, shift, vector_len);
    }
  } else {
    assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
    if (etype == T_INT) {
      evprord(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprorq(dst, src, shift, vector_len);
    }
  }
}

void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
                                     XMMRegister shift, int vector_len) {
  if (opcode == Op_RotateLeftV) {
    if (etype == T_INT) {
      evprolvd(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprolvq(dst, src, shift, vector_len);
    }
  } else {
    assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
    if (etype == T_INT) {
      evprorvd(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprorvq(dst, src, shift, vector_len);
    }
  }
}

void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
  if (opcode == Op_RShiftVI) {
    psrad(dst, shift);
  } else if (opcode == Op_LShiftVI) {
    pslld(dst, shift);
  } else {
    assert((opcode == Op_URShiftVI), "opcode should be Op_URShiftVI");
    psrld(dst, shift);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVI:  psrad(dst, shift); break;
    case Op_LShiftVI:  pslld(dst, shift); break;
    case Op_URShiftVI: psrld(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
  if (opcode == Op_RShiftVI) {
    vpsrad(dst, nds, shift, vector_len);
  } else if (opcode == Op_LShiftVI) {
    vpslld(dst, nds, shift, vector_len);
  } else {
    assert((opcode == Op_URShiftVI), "opcode should be Op_URShiftVI");
    vpsrld(dst, nds, shift, vector_len);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
    case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
    case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  psraw(dst, shift); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  psllw(dst, shift); break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: psrlw(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
    case Op_LShiftVL:  psllq(dst, shift); break;
    case Op_URShiftVL: psrlq(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
  if (opcode == Op_RShiftVL) {
    psrlq(dst, shift); // using srl to implement sra on pre-avx512 systems
  } else if (opcode == Op_LShiftVL) {
    psllq(dst, shift);
  } else {
    assert((opcode == Op_URShiftVL), "opcode should be Op_URShiftVL");
    psrlq(dst, shift);
  }
}

void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVL:  evpsraq(dst, src, shift, vlen_enc); break;
    case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
    case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
  if (opcode == Op_RShiftVL) {
    evpsraq(dst, nds, shift, vector_len);
  } else if (opcode == Op_LShiftVL) {
    vpsllq(dst, nds, shift, vector_len);
  } else {
    assert((opcode == Op_URShiftVL), "opcode should be Op_URShiftVL");
    vpsrlq(dst, nds, shift, vector_len);
  }
}

void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  // fall-through
    case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  // fall-through
    case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;

    case Op_URShiftVB: // fall-through
    case Op_URShiftVS: // fall-through
    case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;

    case Op_URShiftVB: // fall-through
    case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
  assert(UseAVX >= 2, "required");
  switch (opcode) {
    case Op_RShiftVL: {
      if (UseAVX > 2) {
        assert(tmp == xnoreg, "not used");
        if (!VM_Version::supports_avx512vl()) {
          vlen_enc = Assembler::AVX_512bit;
        }
        evpsravq(dst, src, shift, vlen_enc);
      } else {
        vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
        vpsrlvq(dst, src, shift, vlen_enc);
        vpsrlvq(tmp, tmp, shift, vlen_enc);
        vpxor(dst, dst, tmp, vlen_enc);
        vpsubq(dst, dst, tmp, vlen_enc);
      }
      break;
    }
    case Op_LShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsllvq(dst, src, shift, vlen_enc);
      break;
    }
    case Op_URShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsrlvq(dst, src, shift, vlen_enc);
      break;
    }
    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

// Variable shift src by shift using vtmp as a TEMP, giving a word result in dst
void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
  assert(opcode == Op_LShiftVB ||
         opcode == Op_RShiftVB ||
         opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
  bool sign = (opcode != Op_URShiftVB);
  assert(vector_len == 0, "required");
  vextendbd(sign, dst, src, 1);
  vpmovzxbd(vtmp, shift, 1);
  varshiftd(opcode, dst, dst, vtmp, 1);
  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
  vextracti128_high(vtmp, dst);
  vpackusdw(dst, dst, vtmp, 0);
}

// Variable shift src by shift using vtmp as a TEMP, giving a byte result in dst
void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
  assert(opcode == Op_LShiftVB ||
         opcode == Op_RShiftVB ||
         opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
  bool sign = (opcode != Op_URShiftVB);
  int ext_vector_len = vector_len + 1;
  vextendbw(sign, dst, src, ext_vector_len);
  vpmovzxbw(vtmp, shift, ext_vector_len);
  varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
  if (vector_len == 0) {
    vextracti128_high(vtmp, dst);
    vpackuswb(dst, dst, vtmp, vector_len);
  } else {
    vextracti64x4_high(vtmp, dst);
    vpackuswb(dst, dst, vtmp, vector_len);
    vpermq(dst, dst, 0xD8, vector_len);
  }
}

void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
  switch(typ) {
    case T_BYTE:
      pinsrb(dst, val, idx);
      break;
    case T_SHORT:
      pinsrw(dst, val, idx);
      break;
    case T_INT:
      pinsrd(dst, val, idx);
      break;
    case T_LONG:
      pinsrq(dst, val, idx);
      break;
    default:
      assert(false, "Should not reach here.");
      break;
  }
}

void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
  switch(typ) {
    case T_BYTE:
      vpinsrb(dst, src, val, idx);
      break;
    case T_SHORT:
      vpinsrw(dst, src, val, idx);
      break;
    case T_INT:
      vpinsrd(dst, src, val, idx);
      break;
    case T_LONG:
      vpinsrq(dst, src, val, idx);
      break;
    default:
      assert(false, "Should not reach here.");
      break;
  }
}

void C2_MacroAssembler::vgather8b_masked(BasicType elem_bt, XMMRegister dst,
                                         Register base, Register idx_base,
                                         Register mask, Register mask_idx,
                                         Register rtmp, int vlen_enc) {
  vpxor(dst, dst, dst, vlen_enc);
  if (elem_bt == T_SHORT) {
    for (int i = 0; i < 4; i++) {
      // dst[i] = mask[i] ? src[idx_base[i]] : 0
      Label skip_load;
      btq(mask, mask_idx);
      jccb(Assembler::carryClear, skip_load);
      movl(rtmp, Address(idx_base, i * 4));
      pinsrw(dst, Address(base, rtmp, Address::times_2), i);
      bind(skip_load);
      incq(mask_idx);
    }
  } else {
    assert(elem_bt == T_BYTE, "");
    for (int i = 0; i < 8; i++) {
      // dst[i] = mask[i] ? src[idx_base[i]] : 0
      Label skip_load;
      btq(mask, mask_idx);
      jccb(Assembler::carryClear, skip_load);
      movl(rtmp, Address(idx_base, i * 4));
      pinsrb(dst, Address(base, rtmp), i);
      bind(skip_load);
      incq(mask_idx);
    }
  }
}

void C2_MacroAssembler::vgather8b(BasicType elem_bt, XMMRegister dst,
                                  Register base, Register idx_base,
                                  Register rtmp, int vlen_enc) {
  vpxor(dst, dst, dst, vlen_enc);
  if (elem_bt == T_SHORT) {
    for (int i = 0; i < 4; i++) {
      // dst[i] = src[idx_base[i]]
      movl(rtmp, Address(idx_base, i * 4));
      pinsrw(dst, Address(base, rtmp, Address::times_2), i);
    }
  } else {
    assert(elem_bt == T_BYTE, "");
    for (int i = 0; i < 8; i++) {
      // dst[i] = src[idx_base[i]]
      movl(rtmp, Address(idx_base, i * 4));
      pinsrb(dst, Address(base, rtmp), i);
    }
  }
}
/*
 * Gather using a hybrid algorithm: first, a partially unrolled scalar loop
 * accumulates values from the gather indices into a quad-word (64-bit)
 * slice; a slice may hold 8 byte values or 4 short values. This is followed
 * by a vector permutation that places the slice into the appropriate vector
 * lane locations in the destination vector. The following pseudo code
 * describes the algorithm in detail:
 *
 * DST_VEC = ZERO_VEC
 * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..}
 * TWO_VEC    = {2, 2, 2, 2, 2, 2, 2, 2, 2..}
 * FOREACH_ITER:
 *   TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES
 *   TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX
 *   DST_VEC = DST_VEC OR TEMP_PERM_VEC
 *   PERM_INDEX = PERM_INDEX - TWO_VEC
 *
 * With each iteration, the doubleword permute indices (0, 1) corresponding
 * to the gathered quadword are shifted right by two lane positions.
 */
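// Illustrative trace (T_BYTE, 256-bit destination): each gathered 8-byte
// slice occupies doublewords 0 and 1 of TMP_VEC_64 and the remaining
// doublewords are zero. With PERM_INDEX = {0, 1, 2, ...} the slice lands in
// lanes 0-1; after PERM_INDEX -= 2 the indices become {-2, -1, 0, 1, ...},
// which VPERMD interprets modulo the lane count, so indices 0 and 1 now
// route the next slice into lanes 2-3, and so on across the vector.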
1547 void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst,
1548 Register base, Register idx_base,
1549 Register mask, XMMRegister xtmp1,
1550 XMMRegister xtmp2, XMMRegister temp_dst,
1551 Register rtmp, Register mask_idx,
1552 Register length, int vector_len, int vlen_enc) {
1553 Label GATHER8_LOOP;
1554 assert(is_subword_type(elem_ty), "");
1555 movl(length, vector_len);
1556 vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...}
1557 vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...}
1558 vallones(xtmp2, vlen_enc);
1559 vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc);
1560 vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...}
1561 load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...}
1562
1563 bind(GATHER8_LOOP);
1564 // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES
1565 if (mask == noreg) {
1566 vgather8b(elem_ty, temp_dst, base, idx_base, rtmp, vlen_enc);
1567 } else {
1568 vgather8b_masked(elem_ty, temp_dst, base, idx_base, mask, mask_idx, rtmp, vlen_enc);
1569 }
1570 // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1)
1571 vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? vlen_enc : Assembler::AVX_256bit);
1572 // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2)
1573 vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc);
1574 // DST_VEC = DST_VEC OR TEMP_PERM_VEC
1575 vpor(dst, dst, temp_dst, vlen_enc);
1576 addptr(idx_base, 32 >> (type2aelembytes(elem_ty) - 1));
1577 subl(length, 8 >> (type2aelembytes(elem_ty) - 1));
1578 jcc(Assembler::notEqual, GATHER8_LOOP);
1579 }
1580
1581 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1582 switch(typ) {
1583 case T_INT:
1584 vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1585 break;
1586 case T_FLOAT:
1587 vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1588 break;
1589 case T_LONG:
1590 vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1591 break;
1592 case T_DOUBLE:
1593 vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1594 break;
1595 default:
1596 assert(false,"Should not reach here.");
1597 break;
1598 }
1599 }
1600
1601 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1602 switch(typ) {
1603 case T_INT:
1604 evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1605 break;
1606 case T_FLOAT:
1607 evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1608 break;
1609 case T_LONG:
1610 evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1611 break;
1612 case T_DOUBLE:
1613 evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1614 break;
1615 default:
1616 assert(false,"Should not reach here.");
1617 break;
1618 }
1619 }
1620
1621 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1622 switch(typ) {
1623 case T_INT:
1624 evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1625 break;
1626 case T_FLOAT:
1627 evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1628 break;
1629 case T_LONG:
1630 evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1631 break;
1632 case T_DOUBLE:
1633 evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1634 break;
1635 default:
1636 assert(false,"Should not reach here.");
1637 break;
1638 }
1639 }
1640
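// Converts a vector of booleans (0 or 1 per byte) into a vector mask
// (all-zeroes or all-ones per element): computing 0 - x maps 1 to 0xFF and
// leaves 0 unchanged, and the result is then sign-extended to the requested
// element width.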
1641 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
1642 if (vlen_in_bytes <= 16) {
1643 pxor (dst, dst);
1644 psubb(dst, src);
1645 switch (elem_bt) {
1646 case T_BYTE: /* nothing to do */ break;
1647 case T_SHORT: pmovsxbw(dst, dst); break;
1648 case T_INT: pmovsxbd(dst, dst); break;
1649 case T_FLOAT: pmovsxbd(dst, dst); break;
1650 case T_LONG: pmovsxbq(dst, dst); break;
1651 case T_DOUBLE: pmovsxbq(dst, dst); break;
1652
1653 default: assert(false, "%s", type2name(elem_bt));
1654 }
1655 } else {
1656 assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
1657 int vlen_enc = vector_length_encoding(vlen_in_bytes);
1658
1659 vpxor (dst, dst, dst, vlen_enc);
1660 vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
1661
1662 switch (elem_bt) {
1663 case T_BYTE: /* nothing to do */ break;
1664 case T_SHORT: vpmovsxbw(dst, dst, vlen_enc); break;
1665 case T_INT: vpmovsxbd(dst, dst, vlen_enc); break;
1666 case T_FLOAT: vpmovsxbd(dst, dst, vlen_enc); break;
1667 case T_LONG: vpmovsxbq(dst, dst, vlen_enc); break;
1668 case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1669
1670 default: assert(false, "%s", type2name(elem_bt));
1671 }
1672 }
1673 }
1674
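// Same boolean-to-mask conversion, but producing an opmask register. When
// AVX512VL/BW forms are unavailable (novlbwdq), the booleans are widened to
// dwords and compared for equality against a stub-provided constant instead
// of negating and using VPMOVB2M.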
1675 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
1676 if (novlbwdq) {
1677 vpmovsxbd(xtmp, src, vlen_enc);
1678 evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
1679 Assembler::eq, true, vlen_enc, noreg);
1680 } else {
1681 vpxor(xtmp, xtmp, xtmp, vlen_enc);
1682 vpsubb(xtmp, xtmp, src, vlen_enc);
1683 evpmovb2m(dst, xtmp, vlen_enc);
1684 }
1685 }
1686
1687 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, Address src, int vlen_in_bytes) {
1688 if (is_integral_type(bt)) {
1689 switch (vlen_in_bytes) {
1690 case 4: movdl(dst, src); break;
1691 case 8: movq(dst, src); break;
1692 case 16: movdqu(dst, src); break;
1693 case 32: vmovdqu(dst, src); break;
1694 case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
1695 default: ShouldNotReachHere();
1696 }
1697 } else {
1698 switch (vlen_in_bytes) {
1699 case 4: movflt(dst, src); break;
1700 case 8: movdbl(dst, src); break;
1701 case 16: movups(dst, src); break;
1702 case 32: vmovups(dst, src, Assembler::AVX_256bit); break;
1703 case 64: vmovups(dst, src, Assembler::AVX_512bit); break;
1704 default: ShouldNotReachHere();
1705 }
1706 }
1707 }
1708
1709 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
1710 assert(rscratch != noreg || always_reachable(src), "missing");
1711
1712 if (reachable(src)) {
1713 load_vector(bt, dst, as_Address(src), vlen_in_bytes);
1714 } else {
1715 lea(rscratch, src);
1716 load_vector(bt, dst, Address(rscratch, 0), vlen_in_bytes);
1717 }
1718 }
1719
1720 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
1721 int vlen_enc = vector_length_encoding(vlen);
1722 if (VM_Version::supports_avx()) {
1723 if (bt == T_LONG) {
1724 if (VM_Version::supports_avx2()) {
1725 vpbroadcastq(dst, src, vlen_enc);
1726 } else {
1727 vmovddup(dst, src, vlen_enc);
1728 }
1729 } else if (bt == T_DOUBLE) {
1730 if (vlen_enc != Assembler::AVX_128bit) {
1731 vbroadcastsd(dst, src, vlen_enc, noreg);
1732 } else {
1733 vmovddup(dst, src, vlen_enc);
1734 }
1735 } else {
1736 if (VM_Version::supports_avx2() && is_integral_type(bt)) {
1737 vpbroadcastd(dst, src, vlen_enc);
1738 } else {
1739 vbroadcastss(dst, src, vlen_enc);
1740 }
1741 }
1742 } else if (VM_Version::supports_sse3()) {
1743 movddup(dst, src);
1744 } else {
1745 load_vector(bt, dst, src, vlen);
1746 }
1747 }
1748
1749 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
1750 int entry_idx = vector_iota_entry_index(bt);
1751 ExternalAddress addr(StubRoutines::x86::vector_iota_indices(entry_idx));
1752 load_vector(T_BYTE, dst, addr, vlen_in_bytes);
1753 }
1754
1755 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
1756
1757 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1758 int vector_len = Assembler::AVX_128bit;
1759
1760 switch (opcode) {
1761 case Op_AndReductionV: pand(dst, src); break;
1762 case Op_OrReductionV: por (dst, src); break;
1763 case Op_XorReductionV: pxor(dst, src); break;
1764 case Op_MinReductionV:
1765 switch (typ) {
1766 case T_BYTE: pminsb(dst, src); break;
1767 case T_SHORT: pminsw(dst, src); break;
1768 case T_INT: pminsd(dst, src); break;
1769 case T_LONG: assert(UseAVX > 2, "required");
1770 vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1771 default: assert(false, "wrong type");
1772 }
1773 break;
1774 case Op_MaxReductionV:
1775 switch (typ) {
1776 case T_BYTE: pmaxsb(dst, src); break;
1777 case T_SHORT: pmaxsw(dst, src); break;
1778 case T_INT: pmaxsd(dst, src); break;
1779 case T_LONG: assert(UseAVX > 2, "required");
1780 vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
1781 default: assert(false, "wrong type");
1782 }
1783 break;
1784 case Op_UMinReductionV:
1785 switch (typ) {
1786 case T_BYTE: vpminub(dst, dst, src, Assembler::AVX_128bit); break;
1787 case T_SHORT: vpminuw(dst, dst, src, Assembler::AVX_128bit); break;
1788 case T_INT: vpminud(dst, dst, src, Assembler::AVX_128bit); break;
1789 case T_LONG: evpminuq(dst, k0, dst, src, true, Assembler::AVX_128bit); break;
1790 default: assert(false, "wrong type");
1791 }
1792 break;
1793 case Op_UMaxReductionV:
1794 switch (typ) {
1795 case T_BYTE: vpmaxub(dst, dst, src, Assembler::AVX_128bit); break;
1796 case T_SHORT: vpmaxuw(dst, dst, src, Assembler::AVX_128bit); break;
1797 case T_INT: vpmaxud(dst, dst, src, Assembler::AVX_128bit); break;
1798 case T_LONG: evpmaxuq(dst, k0, dst, src, true, Assembler::AVX_128bit); break;
1799 default: assert(false, "wrong type");
1800 }
1801 break;
1802 case Op_AddReductionVF: addss(dst, src); break;
1803 case Op_AddReductionVD: addsd(dst, src); break;
1804 case Op_AddReductionVI:
1805 switch (typ) {
1806 case T_BYTE: paddb(dst, src); break;
1807 case T_SHORT: paddw(dst, src); break;
1808 case T_INT: paddd(dst, src); break;
1809 default: assert(false, "wrong type");
1810 }
1811 break;
1812 case Op_AddReductionVL: paddq(dst, src); break;
1813 case Op_MulReductionVF: mulss(dst, src); break;
1814 case Op_MulReductionVD: mulsd(dst, src); break;
1815 case Op_MulReductionVI:
1816 switch (typ) {
1817 case T_SHORT: pmullw(dst, src); break;
1818 case T_INT: pmulld(dst, src); break;
1819 default: assert(false, "wrong type");
1820 }
1821 break;
1822 case Op_MulReductionVL: assert(UseAVX > 2, "required");
1823 evpmullq(dst, dst, src, vector_len); break;
1824 default: assert(false, "wrong opcode");
1825 }
1826 }
1827
1828 void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1829 switch (opcode) {
1830 case Op_AddReductionVF: addps(dst, src); break;
1831 case Op_AddReductionVD: addpd(dst, src); break;
1832 case Op_MulReductionVF: mulps(dst, src); break;
1833 case Op_MulReductionVD: mulpd(dst, src); break;
1834 default: assert(false, "%s", NodeClassNames[opcode]);
1835 }
1836 }
1837
1838 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
1839 int vector_len = Assembler::AVX_256bit;
1840
1841 switch (opcode) {
1842 case Op_AndReductionV: vpand(dst, src1, src2, vector_len); break;
1843 case Op_OrReductionV: vpor (dst, src1, src2, vector_len); break;
1844 case Op_XorReductionV: vpxor(dst, src1, src2, vector_len); break;
1845 case Op_MinReductionV:
1846 switch (typ) {
1847 case T_BYTE: vpminsb(dst, src1, src2, vector_len); break;
1848 case T_SHORT: vpminsw(dst, src1, src2, vector_len); break;
1849 case T_INT: vpminsd(dst, src1, src2, vector_len); break;
1850 case T_LONG: assert(UseAVX > 2, "required");
1851 vpminsq(dst, src1, src2, vector_len); break;
1852 default: assert(false, "wrong type");
1853 }
1854 break;
1855 case Op_MaxReductionV:
1856 switch (typ) {
1857 case T_BYTE: vpmaxsb(dst, src1, src2, vector_len); break;
1858 case T_SHORT: vpmaxsw(dst, src1, src2, vector_len); break;
1859 case T_INT: vpmaxsd(dst, src1, src2, vector_len); break;
1860 case T_LONG: assert(UseAVX > 2, "required");
1861 vpmaxsq(dst, src1, src2, vector_len); break;
1862 default: assert(false, "wrong type");
1863 }
1864 break;
1865 case Op_UMinReductionV:
1866 switch (typ) {
1867 case T_BYTE: vpminub(dst, src1, src2, vector_len); break;
1868 case T_SHORT: vpminuw(dst, src1, src2, vector_len); break;
1869 case T_INT: vpminud(dst, src1, src2, vector_len); break;
1870 case T_LONG: evpminuq(dst, k0, src1, src2, true, vector_len); break;
1871 default: assert(false, "wrong type");
1872 }
1873 break;
1874 case Op_UMaxReductionV:
1875 switch (typ) {
1876 case T_BYTE: vpmaxub(dst, src1, src2, vector_len); break;
1877 case T_SHORT: vpmaxuw(dst, src1, src2, vector_len); break;
1878 case T_INT: vpmaxud(dst, src1, src2, vector_len); break;
1879 case T_LONG: evpmaxuq(dst, k0, src1, src2, true, vector_len); break;
1880 default: assert(false, "wrong type");
1881 }
1882 break;
1883 case Op_AddReductionVI:
1884 switch (typ) {
1885 case T_BYTE: vpaddb(dst, src1, src2, vector_len); break;
1886 case T_SHORT: vpaddw(dst, src1, src2, vector_len); break;
1887 case T_INT: vpaddd(dst, src1, src2, vector_len); break;
1888 default: assert(false, "wrong type");
1889 }
1890 break;
1891 case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
1892 case Op_MulReductionVI:
1893 switch (typ) {
1894 case T_SHORT: vpmullw(dst, src1, src2, vector_len); break;
1895 case T_INT: vpmulld(dst, src1, src2, vector_len); break;
1896 default: assert(false, "wrong type");
1897 }
1898 break;
1899 case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break;
1900 default: assert(false, "wrong opcode");
1901 }
1902 }
1903
1904 void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
1905 int vector_len = Assembler::AVX_256bit;
1906
1907 switch (opcode) {
1908 case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break;
1909 case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break;
1910 case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break;
1911 case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break;
1912 default: assert(false, "%s", NodeClassNames[opcode]);
1913 }
1914 }
1915
1916 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1917 XMMRegister dst, XMMRegister src,
1918 XMMRegister vtmp1, XMMRegister vtmp2) {
1919 switch (opcode) {
1920 case Op_AddReductionVF:
1921 case Op_MulReductionVF:
1922 reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1923 break;
1924
1925 case Op_AddReductionVD:
1926 case Op_MulReductionVD:
1927 reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1928 break;
1929
1930 default: assert(false, "wrong opcode");
1931 }
1932 }
1933
1934 void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen,
1935 XMMRegister dst, XMMRegister src,
1936 XMMRegister vtmp1, XMMRegister vtmp2) {
1937 switch (opcode) {
1938 case Op_AddReductionVF:
1939 case Op_MulReductionVF:
1940 unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1941 break;
1942
1943 case Op_AddReductionVD:
1944 case Op_MulReductionVD:
1945 unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1946 break;
1947
1948 default: assert(false, "%s", NodeClassNames[opcode]);
1949 }
1950 }
1951
1952 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1953 Register dst, Register src1, XMMRegister src2,
1954 XMMRegister vtmp1, XMMRegister vtmp2) {
1955 switch (vlen) {
1956 case 8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1957 case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1958 case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1959 case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1960
1961 default: assert(false, "wrong vector length");
1962 }
1963 }
1964
1965 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
1966 Register dst, Register src1, XMMRegister src2,
1967 XMMRegister vtmp1, XMMRegister vtmp2) {
1968 switch (vlen) {
1969 case 8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1970 case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1971 case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1972 case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1973
1974 default: assert(false, "wrong vector length");
1975 }
1976 }
1977
1978 void C2_MacroAssembler::reduceS(int opcode, int vlen,
1979 Register dst, Register src1, XMMRegister src2,
1980 XMMRegister vtmp1, XMMRegister vtmp2) {
1981 switch (vlen) {
1982 case 4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1983 case 8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1984 case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1985 case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1986
1987 default: assert(false, "wrong vector length");
1988 }
1989 }
1990
1991 void C2_MacroAssembler::reduceI(int opcode, int vlen,
1992 Register dst, Register src1, XMMRegister src2,
1993 XMMRegister vtmp1, XMMRegister vtmp2) {
1994 switch (vlen) {
1995 case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1996 case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1997 case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1998 case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1999
2000 default: assert(false, "wrong vector length");
2001 }
2002 }
2003
2004 void C2_MacroAssembler::reduceL(int opcode, int vlen,
2005 Register dst, Register src1, XMMRegister src2,
2006 XMMRegister vtmp1, XMMRegister vtmp2) {
2007 switch (vlen) {
2008 case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2009 case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2010 case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2011
2012 default: assert(false, "wrong vector length");
2013 }
2014 }
2015
2016 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2017 switch (vlen) {
2018 case 2:
2019 assert(vtmp2 == xnoreg, "");
2020 reduce2F(opcode, dst, src, vtmp1);
2021 break;
2022 case 4:
2023 assert(vtmp2 == xnoreg, "");
2024 reduce4F(opcode, dst, src, vtmp1);
2025 break;
2026 case 8:
2027 reduce8F(opcode, dst, src, vtmp1, vtmp2);
2028 break;
2029 case 16:
2030 reduce16F(opcode, dst, src, vtmp1, vtmp2);
2031 break;
2032 default: assert(false, "wrong vector length");
2033 }
2034 }
2035
2036 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2037 switch (vlen) {
2038 case 2:
2039 assert(vtmp2 == xnoreg, "");
2040 reduce2D(opcode, dst, src, vtmp1);
2041 break;
2042 case 4:
2043 reduce4D(opcode, dst, src, vtmp1, vtmp2);
2044 break;
2045 case 8:
2046 reduce8D(opcode, dst, src, vtmp1, vtmp2);
2047 break;
2048 default: assert(false, "wrong vector length");
2049 }
2050 }
2051
2052 void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2053 switch (vlen) {
2054 case 2:
2055 assert(vtmp1 == xnoreg, "");
2056 assert(vtmp2 == xnoreg, "");
2057 unorderedReduce2F(opcode, dst, src);
2058 break;
2059 case 4:
2060 assert(vtmp2 == xnoreg, "");
2061 unorderedReduce4F(opcode, dst, src, vtmp1);
2062 break;
2063 case 8:
2064 unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2);
2065 break;
2066 case 16:
2067 unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2);
2068 break;
2069 default: assert(false, "wrong vector length");
2070 }
2071 }
2072
2073 void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2074 switch (vlen) {
2075 case 2:
2076 assert(vtmp1 == xnoreg, "");
2077 assert(vtmp2 == xnoreg, "");
2078 unorderedReduce2D(opcode, dst, src);
2079 break;
2080 case 4:
2081 assert(vtmp2 == xnoreg, "");
2082 unorderedReduce4D(opcode, dst, src, vtmp1);
2083 break;
2084 case 8:
2085 unorderedReduce8D(opcode, dst, src, vtmp1, vtmp2);
2086 break;
2087 default: assert(false, "wrong vector length");
2088 }
2089 }
2090
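// The reduce* helpers below fold a vector in halves: the upper half (or
// upper 128-bit lane) is combined element-wise with the lower half until two
// elements remain, and the running value is finally combined with the scalar
// input src1. For Op_AddReductionVI, PHADDD/PHADDW sum adjacent pairs
// instead of shuffling.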
2091 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2092 if (opcode == Op_AddReductionVI) {
2093 if (vtmp1 != src2) {
2094 movdqu(vtmp1, src2);
2095 }
2096 phaddd(vtmp1, vtmp1);
2097 } else {
2098 pshufd(vtmp1, src2, 0x1);
2099 reduce_operation_128(T_INT, opcode, vtmp1, src2);
2100 }
2101 movdl(vtmp2, src1);
2102 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2103 movdl(dst, vtmp1);
2104 }
2105
2106 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2107 if (opcode == Op_AddReductionVI) {
2108 if (vtmp1 != src2) {
2109 movdqu(vtmp1, src2);
2110 }
2111 phaddd(vtmp1, src2);
2112 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2113 } else {
2114 pshufd(vtmp2, src2, 0xE);
2115 reduce_operation_128(T_INT, opcode, vtmp2, src2);
2116 reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2117 }
2118 }
2119
2120 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2121 if (opcode == Op_AddReductionVI) {
2122 vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
2123 vextracti128_high(vtmp2, vtmp1);
2124 vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
2125 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2126 } else {
2127 vextracti128_high(vtmp1, src2);
2128 reduce_operation_128(T_INT, opcode, vtmp1, src2);
2129 reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2130 }
2131 }
2132
2133 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2134 vextracti64x4_high(vtmp2, src2);
2135 reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
2136 reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2137 }
2138
2139 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2140 pshufd(vtmp2, src2, 0x1);
2141 reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2142 movdqu(vtmp1, vtmp2);
2143 psrldq(vtmp1, 2);
2144 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2145 movdqu(vtmp2, vtmp1);
2146 psrldq(vtmp2, 1);
2147 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2148 movdl(vtmp2, src1);
2149 if (opcode == Op_UMinReductionV || opcode == Op_UMaxReductionV) {
2150 pmovzxbd(vtmp1, vtmp1);
2151 } else {
2152 pmovsxbd(vtmp1, vtmp1);
2153 }
2154 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2155 pextrb(dst, vtmp1, 0x0);
2156 movsbl(dst, dst);
2157 }
2158
2159 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2160 pshufd(vtmp1, src2, 0xE);
2161 reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
2162 reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2163 }
2164
2165 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2166 vextracti128_high(vtmp2, src2);
2167 reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2168 reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2169 }
2170
2171 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2172 vextracti64x4_high(vtmp1, src2);
2173 reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
2174 reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2175 }
2176
2177 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2178 pmovsxbw(vtmp2, src2);
2179 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2180 }
2181
2182 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2183 if (UseAVX > 1) {
2184 int vector_len = Assembler::AVX_256bit;
2185 vpmovsxbw(vtmp1, src2, vector_len);
2186 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2187 } else {
2188 pmovsxbw(vtmp2, src2);
2189 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2190 pshufd(vtmp2, src2, 0xe);
2191 pmovsxbw(vtmp2, vtmp2);
2192 reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2193 }
2194 }
2195
2196 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2197 if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
2198 int vector_len = Assembler::AVX_512bit;
2199 vpmovsxbw(vtmp1, src2, vector_len);
2200 reduce32S(opcode, dst, src1, vtmp1, vtmp2, vtmp1);
2201 } else {
    assert(UseAVX >= 2, "required");
2203 mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
2204 vextracti128_high(vtmp2, src2);
2205 mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2206 }
2207 }
2208
2209 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2210 mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
2211 vextracti64x4_high(vtmp2, src2);
2212 mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2213 }
2214
2215 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2216 if (opcode == Op_AddReductionVI) {
2217 if (vtmp1 != src2) {
2218 movdqu(vtmp1, src2);
2219 }
2220 phaddw(vtmp1, vtmp1);
2221 phaddw(vtmp1, vtmp1);
2222 } else {
2223 pshufd(vtmp2, src2, 0x1);
2224 reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2225 movdqu(vtmp1, vtmp2);
2226 psrldq(vtmp1, 2);
2227 reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
2228 }
2229 movdl(vtmp2, src1);
2230 if (opcode == Op_UMinReductionV || opcode == Op_UMaxReductionV) {
2231 pmovzxwd(vtmp1, vtmp1);
2232 } else {
2233 pmovsxwd(vtmp1, vtmp1);
2234 }
2235 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2236 pextrw(dst, vtmp1, 0x0);
2237 movswl(dst, dst);
2238 }
2239
2240 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2241 if (opcode == Op_AddReductionVI) {
2242 if (vtmp1 != src2) {
2243 movdqu(vtmp1, src2);
2244 }
2245 phaddw(vtmp1, src2);
2246 } else {
2247 assert_different_registers(src2, vtmp1);
2248 pshufd(vtmp1, src2, 0xE);
2249 reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
2250 }
2251 reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2252 }
2253
2254 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2255 if (opcode == Op_AddReductionVI) {
2256 int vector_len = Assembler::AVX_256bit;
2257 vphaddw(vtmp2, src2, src2, vector_len);
2258 vpermq(vtmp2, vtmp2, 0xD8, vector_len);
2259 } else {
2260 assert_different_registers(src2, vtmp2);
2261 vextracti128_high(vtmp2, src2);
2262 reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2263 }
2264 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2265 }
2266
2267 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2268 assert_different_registers(src2, vtmp1);
2269 int vector_len = Assembler::AVX_256bit;
2270 vextracti64x4_high(vtmp1, src2);
2271 reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2272 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2273 }
2274
2275 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2276 pshufd(vtmp2, src2, 0xE);
2277 reduce_operation_128(T_LONG, opcode, vtmp2, src2);
2278 movdq(vtmp1, src1);
2279 reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
2280 movdq(dst, vtmp1);
2281 }
2282
2283 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2284 vextracti128_high(vtmp1, src2);
2285 reduce_operation_128(T_LONG, opcode, vtmp1, src2);
2286 reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2287 }
2288
2289 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2290 vextracti64x4_high(vtmp2, src2);
2291 reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
2292 reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2293 }
2294
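// Builds an opmask with the low 'len' bits set: BZHI zeroes all bits of -1
// from position 'len' upward (e.g. len = 5 yields 0b11111).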
2295 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
2296 mov64(temp, -1L);
2297 bzhiq(temp, temp, len);
2298 kmovql(dst, temp);
2299 }
2300
2301 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2302 reduce_operation_128(T_FLOAT, opcode, dst, src);
2303 pshufd(vtmp, src, 0x1);
2304 reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2305 }
2306
2307 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2308 reduce2F(opcode, dst, src, vtmp);
2309 pshufd(vtmp, src, 0x2);
2310 reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2311 pshufd(vtmp, src, 0x3);
2312 reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2313 }
2314
2315 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2316 reduce4F(opcode, dst, src, vtmp2);
2317 vextractf128_high(vtmp2, src);
2318 reduce4F(opcode, dst, vtmp2, vtmp1);
2319 }
2320
2321 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2322 reduce8F(opcode, dst, src, vtmp1, vtmp2);
2323 vextracti64x4_high(vtmp1, src);
2324 reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
2325 }
2326
2327 void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) {
2328 pshufd(dst, src, 0x1);
2329 reduce_operation_128(T_FLOAT, opcode, dst, src);
2330 }
2331
2332 void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2333 pshufd(vtmp, src, 0xE);
2334 unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src);
2335 unorderedReduce2F(opcode, dst, vtmp);
2336 }
2337
2338 void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2339 vextractf128_high(vtmp1, src);
2340 unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src);
2341 unorderedReduce4F(opcode, dst, vtmp1, vtmp2);
2342 }
2343
2344 void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2345 vextractf64x4_high(vtmp2, src);
2346 unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src);
2347 unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2);
2348 }
2349
2350 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2351 reduce_operation_128(T_DOUBLE, opcode, dst, src);
2352 pshufd(vtmp, src, 0xE);
2353 reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
2354 }
2355
2356 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2357 reduce2D(opcode, dst, src, vtmp2);
2358 vextractf128_high(vtmp2, src);
2359 reduce2D(opcode, dst, vtmp2, vtmp1);
2360 }
2361
2362 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2363 reduce4D(opcode, dst, src, vtmp1, vtmp2);
2364 vextracti64x4_high(vtmp1, src);
2365 reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
2366 }
2367
2368 void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) {
2369 pshufd(dst, src, 0xE);
2370 reduce_operation_128(T_DOUBLE, opcode, dst, src);
2371 }
2372
2373 void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2374 vextractf128_high(vtmp, src);
2375 unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src);
2376 unorderedReduce2D(opcode, dst, vtmp);
2377 }
2378
2379 void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2380 vextractf64x4_high(vtmp2, src);
2381 unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src);
2382 unorderedReduce4D(opcode, dst, vtmp2, vtmp1);
2383 }
2384
2385 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
2386 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2387 }
2388
2389 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
2390 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2391 }
2392
2393 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len) {
2394 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2395 }
2396
2397 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
2398 int vec_enc) {
2399 switch(elem_bt) {
2400 case T_INT:
2401 case T_FLOAT:
2402 vmaskmovps(dst, src, mask, vec_enc);
2403 break;
2404 case T_LONG:
2405 case T_DOUBLE:
2406 vmaskmovpd(dst, src, mask, vec_enc);
2407 break;
2408 default:
2409 fatal("Unsupported type %s", type2name(elem_bt));
2410 break;
2411 }
2412 }
2413
2414 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
2415 int vec_enc) {
2416 switch(elem_bt) {
2417 case T_INT:
2418 case T_FLOAT:
2419 vmaskmovps(dst, src, mask, vec_enc);
2420 break;
2421 case T_LONG:
2422 case T_DOUBLE:
2423 vmaskmovpd(dst, src, mask, vec_enc);
2424 break;
2425 default:
2426 fatal("Unsupported type %s", type2name(elem_bt));
2427 break;
2428 }
2429 }
2430
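// Strictly ordered float min/max reduction: each step folds the upper half
// of the working vector onto the lower half (VEXTRACT for the 512/256-bit
// steps, VPERMILPS within a 128-bit lane) and combines the halves with a
// min/max that honors Java's NaN and -0.0 semantics. is_dst_valid requests a
// final combine with the value already in dst.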
2431 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
2432 XMMRegister dst, XMMRegister src,
2433 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2434 XMMRegister xmm_0, XMMRegister xmm_1) {
2435 const int permconst[] = {1, 14};
2436 XMMRegister wsrc = src;
2437 XMMRegister wdst = xmm_0;
  XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0 : xmm_1;
2439
2440 int vlen_enc = Assembler::AVX_128bit;
2441 if (vlen == 16) {
2442 vlen_enc = Assembler::AVX_256bit;
2443 }
2444
  for (int i = log2(vlen) - 1; i >= 0; i--) {
2446 if (i == 0 && !is_dst_valid) {
2447 wdst = dst;
2448 }
2449 if (i == 3) {
2450 vextracti64x4_high(wtmp, wsrc);
2451 } else if (i == 2) {
2452 vextracti128_high(wtmp, wsrc);
2453 } else { // i = [0,1]
2454 vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
2455 }
2456
2457 if (VM_Version::supports_avx10_2()) {
2458 vminmax_fp_avx10_2(opcode, T_FLOAT, wdst, k0, wtmp, wsrc, vlen_enc);
2459 } else {
2460 vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2461 }
2462 wsrc = wdst;
2463 vlen_enc = Assembler::AVX_128bit;
2464 }
2465 if (is_dst_valid) {
2466 if (VM_Version::supports_avx10_2()) {
2467 vminmax_fp_avx10_2(opcode, T_FLOAT, dst, k0, wdst, dst, Assembler::AVX_128bit);
2468 } else {
2469 vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2470 }
2471 }
2472 }
2473
2474 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
2475 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2476 XMMRegister xmm_0, XMMRegister xmm_1) {
2477 XMMRegister wsrc = src;
2478 XMMRegister wdst = xmm_0;
  XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0 : xmm_1;
2480 int vlen_enc = Assembler::AVX_128bit;
2481 if (vlen == 8) {
2482 vlen_enc = Assembler::AVX_256bit;
2483 }
  for (int i = log2(vlen) - 1; i >= 0; i--) {
2485 if (i == 0 && !is_dst_valid) {
2486 wdst = dst;
2487 }
2488 if (i == 1) {
2489 vextracti128_high(wtmp, wsrc);
2490 } else if (i == 2) {
2491 vextracti64x4_high(wtmp, wsrc);
2492 } else {
2493 assert(i == 0, "%d", i);
2494 vpermilpd(wtmp, wsrc, 1, vlen_enc);
2495 }
2496
2497 if (VM_Version::supports_avx10_2()) {
2498 vminmax_fp_avx10_2(opcode, T_DOUBLE, wdst, k0, wtmp, wsrc, vlen_enc);
2499 } else {
2500 vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2501 }
2502
2503 wsrc = wdst;
2504 vlen_enc = Assembler::AVX_128bit;
2505 }
2506
2507 if (is_dst_valid) {
2508 if (VM_Version::supports_avx10_2()) {
2509 vminmax_fp_avx10_2(opcode, T_DOUBLE, dst, k0, wdst, dst, Assembler::AVX_128bit);
2510 } else {
2511 vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2512 }
2513 }
2514 }
2515
2516 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2517 switch (bt) {
2518 case T_BYTE: pextrb(dst, src, idx); break;
2519 case T_SHORT: pextrw(dst, src, idx); break;
2520 case T_INT: pextrd(dst, src, idx); break;
2521 case T_LONG: pextrq(dst, src, idx); break;
2522
2523 default:
2524 assert(false,"Should not reach here.");
2525 break;
2526 }
2527 }
2528
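// Returns the register holding the 128-bit lane that contains 'elemindex':
// src itself for lane 0, otherwise the lane is extracted into dst. The
// caller then selects the element within that lane (see get_elem below).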
2529 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2530 int esize = type2aelembytes(typ);
2531 int elem_per_lane = 16/esize;
2532 int lane = elemindex / elem_per_lane;
2533 int eindex = elemindex % elem_per_lane;
2534
2535 if (lane >= 2) {
2536 assert(UseAVX > 2, "required");
2537 vextractf32x4(dst, src, lane & 3);
2538 return dst;
2539 } else if (lane > 0) {
2540 assert(UseAVX > 0, "required");
2541 vextractf128(dst, src, lane);
2542 return dst;
2543 } else {
2544 return src;
2545 }
2546 }
2547
2548 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) {
2549 if (typ == T_BYTE) {
2550 movsbl(dst, dst);
2551 } else if (typ == T_SHORT) {
2552 movswl(dst, dst);
2553 }
2554 }
2555
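// Extracts an integral element into a GPR. Only the element's position
// within its 128-bit lane matters here; callers are expected to have routed
// the correct lane into src (see get_lane above). Subword values are
// sign-extended to 32 bits.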
2556 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2557 int esize = type2aelembytes(typ);
2558 int elem_per_lane = 16/esize;
2559 int eindex = elemindex % elem_per_lane;
2560 assert(is_integral_type(typ),"required");
2561
2562 if (eindex == 0) {
2563 if (typ == T_LONG) {
2564 movq(dst, src);
2565 } else {
2566 movdl(dst, src);
2567 movsxl(typ, dst);
2568 }
2569 } else {
2570 extract(typ, dst, src, eindex);
2571 movsxl(typ, dst);
2572 }
2573 }
2574
2575 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
2576 int esize = type2aelembytes(typ);
2577 int elem_per_lane = 16/esize;
2578 int eindex = elemindex % elem_per_lane;
2579 assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
2580
2581 if (eindex == 0) {
2582 movq(dst, src);
2583 } else {
2584 if (typ == T_FLOAT) {
2585 if (UseAVX == 0) {
2586 movdqu(dst, src);
2587 shufps(dst, dst, eindex);
2588 } else {
2589 vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2590 }
2591 } else {
2592 if (UseAVX == 0) {
2593 movdqu(dst, src);
2594 psrldq(dst, eindex*esize);
2595 } else {
2596 vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2597 }
2598 movq(dst, dst);
2599 }
2600 }
2601 // Zero upper bits
2602 if (typ == T_FLOAT) {
2603 if (UseAVX == 0) {
2604 assert(vtmp != xnoreg, "required.");
2605 movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
2606 pand(dst, vtmp);
2607 } else {
2608 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
2609 }
2610 }
2611 }
2612
2613 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2614 switch(typ) {
2615 case T_BYTE:
2616 case T_BOOLEAN:
2617 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2618 break;
2619 case T_SHORT:
2620 case T_CHAR:
2621 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2622 break;
2623 case T_INT:
2624 case T_FLOAT:
2625 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2626 break;
2627 case T_LONG:
2628 case T_DOUBLE:
2629 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2630 break;
2631 default:
2632 assert(false,"Should not reach here.");
2633 break;
2634 }
2635 }
2636
2637 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
2638 assert(rscratch != noreg || always_reachable(src2), "missing");
2639
2640 switch(typ) {
2641 case T_BOOLEAN:
2642 case T_BYTE:
2643 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2644 break;
2645 case T_CHAR:
2646 case T_SHORT:
2647 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2648 break;
2649 case T_INT:
2650 case T_FLOAT:
2651 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2652 break;
2653 case T_LONG:
2654 case T_DOUBLE:
2655 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2656 break;
2657 default:
2658 assert(false,"Should not reach here.");
2659 break;
2660 }
2661 }
2662
2663 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2664 switch(typ) {
2665 case T_BYTE:
2666 evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2667 break;
2668 case T_SHORT:
2669 evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2670 break;
2671 case T_INT:
2672 case T_FLOAT:
2673 evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2674 break;
2675 case T_LONG:
2676 case T_DOUBLE:
2677 evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2678 break;
2679 default:
2680 assert(false,"Should not reach here.");
2681 break;
2682 }
2683 }
2684
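// Sets condition flags from an element-wise test of src1 against src2 using
// VTESTPS/(V)PTEST. Vectors shorter than 16 bytes are handled by duplicating
// the lower part of src1 so that the untested upper bytes cannot affect the
// result.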
2685 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) {
2686 assert(vlen_in_bytes <= 32, "");
2687 int esize = type2aelembytes(bt);
2688 if (vlen_in_bytes == 32) {
2689 assert(vtmp == xnoreg, "required.");
2690 if (esize >= 4) {
2691 vtestps(src1, src2, AVX_256bit);
2692 } else {
2693 vptest(src1, src2, AVX_256bit);
2694 }
2695 return;
2696 }
2697 if (vlen_in_bytes < 16) {
    // Duplicate the lower part to fill the whole register;
    // there is no need to do so for src2.
2700 assert(vtmp != xnoreg, "required");
2701 int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04;
2702 pshufd(vtmp, src1, shuffle_imm);
2703 } else {
2704 assert(vtmp == xnoreg, "required");
2705 vtmp = src1;
2706 }
2707 if (esize >= 4 && VM_Version::supports_avx()) {
2708 vtestps(vtmp, src2, AVX_128bit);
2709 } else {
2710 ptest(vtmp, src2);
2711 }
2712 }
2713
2714 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
2715 #ifdef ASSERT
2716 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2717 bool is_bw_supported = VM_Version::supports_avx512bw();
2718 if (is_bw && !is_bw_supported) {
2719 assert(vlen_enc != Assembler::AVX_512bit, "required");
2720 assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
2721 "XMM register should be 0-15");
2722 }
2723 #endif // ASSERT
2724 switch (elem_bt) {
2725 case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
2726 case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
2727 case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
2728 case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
2729 case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
2730 case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
2731 default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2732 }
2733 }
2734
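// Broadcasts a GPR value to every vector lane. The EVEX forms broadcast
// directly from the GPR; otherwise the value is first moved into the low
// lanes of dst (MOVD/MOVQ) and then broadcast with the AVX2/AVX forms, which
// restricts dst to XMM0-XMM15.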
2735 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
2736 assert(UseAVX >= 2, "required");
2737 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2738 bool is_vl = vlen_enc != Assembler::AVX_512bit;
2739 if ((UseAVX > 2) &&
2740 (!is_bw || VM_Version::supports_avx512bw()) &&
2741 (!is_vl || VM_Version::supports_avx512vl())) {
2742 switch (elem_bt) {
2743 case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
2744 case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
2745 case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
2746 case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
2747 default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2748 }
2749 } else {
2750 assert(vlen_enc != Assembler::AVX_512bit, "required");
2751 assert((dst->encoding() < 16),"XMM register should be 0-15");
2752 switch (elem_bt) {
2753 case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
2754 case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
2755 case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
2756 case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
2757 case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
2758 case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
2759 default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2760 }
2761 }
2762 }
2763
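// Sign-extends vector byte elements to the requested element type. For
// T_DOUBLE the bytes are first widened to dwords at half the target vector
// width, since VCVTDQ2PD doubles the element size.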
2764 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
2765 switch (to_elem_bt) {
2766 case T_SHORT:
2767 vpmovsxbw(dst, src, vlen_enc);
2768 break;
2769 case T_INT:
2770 vpmovsxbd(dst, src, vlen_enc);
2771 break;
2772 case T_FLOAT:
2773 vpmovsxbd(dst, src, vlen_enc);
2774 vcvtdq2ps(dst, dst, vlen_enc);
2775 break;
2776 case T_LONG:
2777 vpmovsxbq(dst, src, vlen_enc);
2778 break;
2779 case T_DOUBLE: {
2780 int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
2781 vpmovsxbd(dst, src, mid_vlen_enc);
2782 vcvtdq2pd(dst, dst, vlen_enc);
2783 break;
2784 }
2785 default:
2786 fatal("Unsupported type %s", type2name(to_elem_bt));
2787 break;
2788 }
2789 }
2790
2791 //-------------------------------------------------------------------------------------------
2792
// IndexOf for constant substrings with size >= 8 chars
// which don't need to be loaded through the stack.
2795 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2796 Register cnt1, Register cnt2,
2797 int int_cnt2, Register result,
2798 XMMRegister vec, Register tmp,
2799 int ae) {
2800 ShortBranchVerifier sbv(this);
2801 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2802 assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2803
2804 // This method uses the pcmpestri instruction with bound registers
2805 // inputs:
2806 // xmm - substring
2807 // rax - substring length (elements count)
2808 // mem - scanned string
2809 // rdx - string length (elements count)
2810 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2811 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2812 // outputs:
2813 // rcx - matched index in string
2814 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2815 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
  int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; // UU, UL -> 8
2817 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2818 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2819
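  // Flag usage below: PCMPESTRI sets CF when any match is found (the index
  // in rcx is valid) and sets OF when the match starts at element 0, i.e.
  // the whole loaded vector prefix matched.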
2820 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2821 RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2822 MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2823
2824 // Note, inline_string_indexOf() generates checks:
2825 // if (substr.count > string.count) return -1;
2826 // if (substr.count == 0) return 0;
2827 assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
2828
2829 // Load substring.
2830 if (ae == StrIntrinsicNode::UL) {
2831 pmovzxbw(vec, Address(str2, 0));
2832 } else {
2833 movdqu(vec, Address(str2, 0));
2834 }
2835 movl(cnt2, int_cnt2);
2836 movptr(result, str1); // string addr
2837
2838 if (int_cnt2 > stride) {
2839 jmpb(SCAN_TO_SUBSTR);
2840
    // Reload substr for rescan; this code
    // is executed only for large substrings (> 8 chars).
2843 bind(RELOAD_SUBSTR);
2844 if (ae == StrIntrinsicNode::UL) {
2845 pmovzxbw(vec, Address(str2, 0));
2846 } else {
2847 movdqu(vec, Address(str2, 0));
2848 }
2849 negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2850
2851 bind(RELOAD_STR);
    // We came here after the beginning of the substring was
    // matched but the rest of it was not, so we need to search
    // again. Start from the next element after the previous match.

    // cnt2 is the number of remaining substring elements and
    // cnt1 is the number of remaining string elements when the cmp failed.
2858 // Restored cnt1 = cnt1 - cnt2 + int_cnt2
2859 subl(cnt1, cnt2);
2860 addl(cnt1, int_cnt2);
2861 movl(cnt2, int_cnt2); // Now restore cnt2
2862
2863 decrementl(cnt1); // Shift to next element
2864 cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND);  // Fewer elements left than the substring
2866
2867 addptr(result, (1<<scale1));
2868
2869 } // (int_cnt2 > 8)
2870
2871 // Scan string for start of substr in 16-byte vectors
2872 bind(SCAN_TO_SUBSTR);
2873 pcmpestri(vec, Address(result, 0), mode);
2874 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1
2875 subl(cnt1, stride);
2876 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2877 cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Fewer elements left than the substring
2879 addptr(result, 16);
2880 jmpb(SCAN_TO_SUBSTR);
2881
2882 // Found a potential substr
2883 bind(FOUND_CANDIDATE);
2884 // Matched whole vector if first element matched (tmp(rcx) == 0).
2885 if (int_cnt2 == stride) {
2886 jccb(Assembler::overflow, RET_FOUND); // OF == 1
2887 } else { // int_cnt2 > 8
2888 jccb(Assembler::overflow, FOUND_SUBSTR);
2889 }
2890 // After pcmpestri tmp(rcx) contains matched element index
2891 // Compute start addr of substr
2892 lea(result, Address(result, tmp, scale1));
2893
2894 // Make sure string is still long enough
2895 subl(cnt1, tmp);
2896 cmpl(cnt1, cnt2);
2897 if (int_cnt2 == stride) {
2898 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2899 } else { // int_cnt2 > 8
2900 jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
2901 }
  // Fewer elements left than the substring.
2903
2904 bind(RET_NOT_FOUND);
2905 movl(result, -1);
2906 jmp(EXIT);
2907
2908 if (int_cnt2 > stride) {
    // This code is optimized for the case when the whole substring
    // matches if its head matches.
2911 bind(MATCH_SUBSTR_HEAD);
2912 pcmpestri(vec, Address(result, 0), mode);
    // Reload only the string if it does not match
2914 jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
2915
2916 Label CONT_SCAN_SUBSTR;
    // Compare the rest of the substring (> 8 chars).
2918 bind(FOUND_SUBSTR);
2919 // First 8 chars are already matched.
2920 negptr(cnt2);
2921 addptr(cnt2, stride);
2922
2923 bind(SCAN_SUBSTR);
2924 subl(cnt1, stride);
2925 cmpl(cnt2, -stride); // Do not read beyond substring
2926 jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
2927 // Back-up strings to avoid reading beyond substring:
2928 // cnt1 = cnt1 - cnt2 + 8
2929 addl(cnt1, cnt2); // cnt2 is negative
2930 addl(cnt1, stride);
2931 movl(cnt2, stride); negptr(cnt2);
2932 bind(CONT_SCAN_SUBSTR);
2933 if (int_cnt2 < (int)G) {
2934 int tail_off1 = int_cnt2<<scale1;
2935 int tail_off2 = int_cnt2<<scale2;
2936 if (ae == StrIntrinsicNode::UL) {
2937 pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
2938 } else {
2939 movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
2940 }
2941 pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
2942 } else {
2943 // calculate index in register to avoid integer overflow (int_cnt2*2)
2944 movl(tmp, int_cnt2);
2945 addptr(tmp, cnt2);
2946 if (ae == StrIntrinsicNode::UL) {
2947 pmovzxbw(vec, Address(str2, tmp, scale2, 0));
2948 } else {
2949 movdqu(vec, Address(str2, tmp, scale2, 0));
2950 }
2951 pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
2952 }
    // Need to reload the string pointers if the whole vector did not match
2954 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2955 addptr(cnt2, stride);
2956 jcc(Assembler::negative, SCAN_SUBSTR);
2957 // Fall through if found full substring
2958
2959 } // (int_cnt2 > 8)
2960
2961 bind(RET_FOUND);
2962 // Found result if we matched full small substring.
2963 // Compute substr offset
2964 subptr(result, str1);
2965 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2966 shrl(result, 1); // index
2967 }
2968 bind(EXIT);
2969
2970 } // string_indexofC8
2971
// Small strings are loaded through the stack if they cross a page boundary.
2973 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
2974 Register cnt1, Register cnt2,
2975 int int_cnt2, Register result,
2976 XMMRegister vec, Register tmp,
2977 int ae) {
2978 ShortBranchVerifier sbv(this);
2979 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2980 assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2981
2982 //
  // int_cnt2 is the length of a small (< 8 chars) constant substring
  // or (-1) for a non-constant substring, in which case its length
  // is in the cnt2 register.
2986 //
2987 // Note, inline_string_indexOf() generates checks:
2988 // if (substr.count > string.count) return -1;
2989 // if (substr.count == 0) return 0;
2990 //
  int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; // UU, UL -> 8
2992 assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
2993 // This method uses the pcmpestri instruction with bound registers
2994 // inputs:
2995 // xmm - substring
2996 // rax - substring length (elements count)
2997 // mem - scanned string
2998 // rdx - string length (elements count)
2999 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
3000 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
3001 // outputs:
3002 // rcx - matched index in string
3003 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3004 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
3005 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
3006 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
3007
3008 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
3009 RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
3010 FOUND_CANDIDATE;
3011
3012 { //========================================================
    // We don't know where these strings are located
    // and we can't read beyond them. Load them through the stack.
3015 Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
3016
3017 movptr(tmp, rsp); // save old SP
3018
3019 if (int_cnt2 > 0) { // small (< 8 chars) constant substring
3020 if (int_cnt2 == (1>>scale2)) { // One byte
3021 assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
3022 load_unsigned_byte(result, Address(str2, 0));
3023 movdl(vec, result); // move 32 bits
3024 } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes
3025 // Not enough header space in 32-bit VM: 12+3 = 15.
3026 movl(result, Address(str2, -1));
3027 shrl(result, 8);
3028 movdl(vec, result); // move 32 bits
3029 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char
3030 load_unsigned_short(result, Address(str2, 0));
3031 movdl(vec, result); // move 32 bits
3032 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
3033 movdl(vec, Address(str2, 0)); // move 32 bits
3034 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
3035 movq(vec, Address(str2, 0)); // move 64 bits
      } else { // cnt2 = {3, 5, 6, 7} || (ae == StrIntrinsicNode::UL && cnt2 = {2, ..., 7})
3037 // Array header size is 12 bytes in 32-bit VM
3038 // + 6 bytes for 3 chars == 18 bytes,
3039 // enough space to load vec and shift.
        assert(HeapWordSize * TypeArrayKlass::header_size() >= 12, "sanity");
3041 if (ae == StrIntrinsicNode::UL) {
3042 int tail_off = int_cnt2-8;
3043 pmovzxbw(vec, Address(str2, tail_off));
3044 psrldq(vec, -2*tail_off);
3045 }
3046 else {
3047 int tail_off = int_cnt2*(1<<scale2);
3048 movdqu(vec, Address(str2, tail_off-16));
3049 psrldq(vec, 16-tail_off);
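        // Illustrative case (not extra generated code): for LL with
        // int_cnt2 == 5, tail_off == 5, so the 16-byte load ends at str2+5
        // and psrldq by 11 leaves the 5 substring bytes at the bottom of vec.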
3050 }
3051 }
3052 } else { // not constant substring
3053 cmpl(cnt2, stride);
3054 jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
3055
      // We can read beyond the string if str+16 does not cross a page boundary,
      // since heaps are aligned and mapped by pages.
3058 assert(os::vm_page_size() < (int)G, "default page should be small");
3059 movl(result, str2); // We need only low 32 bits
3060 andl(result, ((int)os::vm_page_size()-1));
3061 cmpl(result, ((int)os::vm_page_size()-16));
3062 jccb(Assembler::belowEqual, CHECK_STR);
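      // For example, with 4K pages: when (str2 & 0xfff) <= 0xff0 the 16-byte
      // load cannot cross into the next page, so reading past the end of the
      // string is safe and the stack copy below can be skipped.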
3063
      // Move small strings to the stack to allow loading 16 bytes into vec.
3065 subptr(rsp, 16);
3066 int stk_offset = wordSize-(1<<scale2);
3067 push(cnt2);
3068
3069 bind(COPY_SUBSTR);
3070 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
3071 load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
3072 movb(Address(rsp, cnt2, scale2, stk_offset), result);
3073 } else if (ae == StrIntrinsicNode::UU) {
3074 load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
3075 movw(Address(rsp, cnt2, scale2, stk_offset), result);
3076 }
3077 decrement(cnt2);
3078 jccb(Assembler::notZero, COPY_SUBSTR);
3079
3080 pop(cnt2);
3081 movptr(str2, rsp); // New substring address
3082 } // non constant
3083
3084 bind(CHECK_STR);
3085 cmpl(cnt1, stride);
3086 jccb(Assembler::aboveEqual, BIG_STRINGS);
3087
3088 // Check cross page boundary.
3089 movl(result, str1); // We need only low 32 bits
3090 andl(result, ((int)os::vm_page_size()-1));
3091 cmpl(result, ((int)os::vm_page_size()-16));
3092 jccb(Assembler::belowEqual, BIG_STRINGS);
3093
3094 subptr(rsp, 16);
3095 int stk_offset = -(1<<scale1);
3096 if (int_cnt2 < 0) { // not constant
3097 push(cnt2);
3098 stk_offset += wordSize;
3099 }
3100 movl(cnt2, cnt1);
3101
3102 bind(COPY_STR);
3103 if (ae == StrIntrinsicNode::LL) {
3104 load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
3105 movb(Address(rsp, cnt2, scale1, stk_offset), result);
3106 } else {
3107 load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
3108 movw(Address(rsp, cnt2, scale1, stk_offset), result);
3109 }
3110 decrement(cnt2);
3111 jccb(Assembler::notZero, COPY_STR);
3112
3113 if (int_cnt2 < 0) { // not constant
3114 pop(cnt2);
3115 }
3116 movptr(str1, rsp); // New string address
3117
3118 bind(BIG_STRINGS);
3119 // Load substring.
3120 if (int_cnt2 < 0) { // -1
3121 if (ae == StrIntrinsicNode::UL) {
3122 pmovzxbw(vec, Address(str2, 0));
3123 } else {
3124 movdqu(vec, Address(str2, 0));
3125 }
3126 push(cnt2); // substr count
3127 push(str2); // substr addr
3128 push(str1); // string addr
3129 } else {
3130 // Small (< 8 chars) constant substrings are loaded already.
3131 movl(cnt2, int_cnt2);
3132 }
3133 push(tmp); // original SP
3134
3135 } // Finished loading
3136
3137 //========================================================
3138 // Start search
3139 //
3140
3141 movptr(result, str1); // string addr
3142
3143 if (int_cnt2 < 0) { // Only for non constant substring
3144 jmpb(SCAN_TO_SUBSTR);
3145
3146 // SP saved at sp+0
3147 // String saved at sp+1*wordSize
3148 // Substr saved at sp+2*wordSize
3149 // Substr count saved at sp+3*wordSize
3150
    // Reload substr for rescan; this code
    // is executed only for large substrings (> 8 chars)
3153 bind(RELOAD_SUBSTR);
3154 movptr(str2, Address(rsp, 2*wordSize));
3155 movl(cnt2, Address(rsp, 3*wordSize));
3156 if (ae == StrIntrinsicNode::UL) {
3157 pmovzxbw(vec, Address(str2, 0));
3158 } else {
3159 movdqu(vec, Address(str2, 0));
3160 }
3161 // We came here after the beginning of the substring was
3162 // matched but the rest of it was not so we need to search
3163 // again. Start from the next element after the previous match.
3164 subptr(str1, result); // Restore counter
3165 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3166 shrl(str1, 1);
3167 }
3168 addl(cnt1, str1);
3169 decrementl(cnt1); // Shift to next element
3170 cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
3172
3173 addptr(result, (1<<scale1));
3174 } // non constant
3175
3176 // Scan string for start of substr in 16-byte vectors
3177 bind(SCAN_TO_SUBSTR);
3178 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3179 pcmpestri(vec, Address(result, 0), mode);
3180 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1
3181 subl(cnt1, stride);
3182 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
3183 cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
3185 addptr(result, 16);
3186
3187 bind(ADJUST_STR);
3188 cmpl(cnt1, stride); // Do not read beyond string
3189 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
3190 // Back-up string to avoid reading beyond string.
3191 lea(result, Address(result, cnt1, scale1, -16));
3192 movl(cnt1, stride);
3193 jmpb(SCAN_TO_SUBSTR);
3194
3195 // Found a potential substr
3196 bind(FOUND_CANDIDATE);
3197 // After pcmpestri tmp(rcx) contains matched element index
3198
3199 // Make sure string is still long enough
3200 subl(cnt1, tmp);
3201 cmpl(cnt1, cnt2);
3202 jccb(Assembler::greaterEqual, FOUND_SUBSTR);
  // Left less than substring.
3204
3205 bind(RET_NOT_FOUND);
3206 movl(result, -1);
3207 jmp(CLEANUP);
3208
3209 bind(FOUND_SUBSTR);
3210 // Compute start addr of substr
3211 lea(result, Address(result, tmp, scale1));
3212 if (int_cnt2 > 0) { // Constant substring
3213 // Repeat search for small substring (< 8 chars)
3214 // from new point without reloading substring.
3215 // Have to check that we don't read beyond string.
3216 cmpl(tmp, stride-int_cnt2);
3217 jccb(Assembler::greater, ADJUST_STR);
3218 // Fall through if matched whole substring.
3219 } else { // non constant
3220 assert(int_cnt2 == -1, "should be != 0");
3221
3222 addl(tmp, cnt2);
3223 // Found result if we matched whole substring.
3224 cmpl(tmp, stride);
3225 jcc(Assembler::lessEqual, RET_FOUND);
3226
3227 // Repeat search for small substring (<= 8 chars)
3228 // from new point 'str1' without reloading substring.
3229 cmpl(cnt2, stride);
3230 // Have to check that we don't read beyond string.
3231 jccb(Assembler::lessEqual, ADJUST_STR);
3232
3233 Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
3234 // Compare the rest of substring (> 8 chars).
3235 movptr(str1, result);
3236
3237 cmpl(tmp, cnt2);
3238 // First 8 chars are already matched.
3239 jccb(Assembler::equal, CHECK_NEXT);
3240
3241 bind(SCAN_SUBSTR);
3242 pcmpestri(vec, Address(str1, 0), mode);
    // Need to reload string pointers if we did not match the whole vector
3244 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3245
3246 bind(CHECK_NEXT);
3247 subl(cnt2, stride);
3248 jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
3249 addptr(str1, 16);
3250 if (ae == StrIntrinsicNode::UL) {
3251 addptr(str2, 8);
3252 } else {
3253 addptr(str2, 16);
3254 }
3255 subl(cnt1, stride);
3256 cmpl(cnt2, stride); // Do not read beyond substring
3257 jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
3258 // Back-up strings to avoid reading beyond substring.
3259
3260 if (ae == StrIntrinsicNode::UL) {
3261 lea(str2, Address(str2, cnt2, scale2, -8));
3262 lea(str1, Address(str1, cnt2, scale1, -16));
3263 } else {
3264 lea(str2, Address(str2, cnt2, scale2, -16));
3265 lea(str1, Address(str1, cnt2, scale1, -16));
3266 }
3267 subl(cnt1, cnt2);
3268 movl(cnt2, stride);
3269 addl(cnt1, stride);
3270 bind(CONT_SCAN_SUBSTR);
3271 if (ae == StrIntrinsicNode::UL) {
3272 pmovzxbw(vec, Address(str2, 0));
3273 } else {
3274 movdqu(vec, Address(str2, 0));
3275 }
3276 jmp(SCAN_SUBSTR);
3277
3278 bind(RET_FOUND_LONG);
3279 movptr(str1, Address(rsp, wordSize));
3280 } // non constant
3281
3282 bind(RET_FOUND);
3283 // Compute substr offset
3284 subptr(result, str1);
3285 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3286 shrl(result, 1); // index
3287 }
3288 bind(CLEANUP);
3289 pop(rsp); // restore SP
3290
3291 } // string_indexof
3292
3293 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3294 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3295 ShortBranchVerifier sbv(this);
3296 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3297
3298 int stride = 8;
3299
3300 Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
3301 SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
3302 RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
3303 FOUND_SEQ_CHAR, DONE_LABEL;
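  // Sketch of the strategy (not the exact generated control flow):
  //   with AVX2, scan 16 chars per iteration (vpcmpeqw + vptest on 32 bytes),
  //   then 8 chars per iteration (pcmpeqw + ptest on 16 bytes),
  //   then a scalar loop over the remaining tail chars.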
3304
3305 movptr(result, str1);
3306 if (UseAVX >= 2) {
3307 cmpl(cnt1, stride);
3308 jcc(Assembler::less, SCAN_TO_CHAR);
3309 cmpl(cnt1, 2*stride);
3310 jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
3311 movdl(vec1, ch);
3312 vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
3313 vpxor(vec2, vec2);
3314 movl(tmp, cnt1);
3315 andl(tmp, 0xFFFFFFF0); //vector count (in chars)
3316 andl(cnt1,0x0000000F); //tail count (in chars)
3317
3318 bind(SCAN_TO_16_CHAR_LOOP);
3319 vmovdqu(vec3, Address(result, 0));
3320 vpcmpeqw(vec3, vec3, vec1, 1);
3321 vptest(vec2, vec3);
3322 jcc(Assembler::carryClear, FOUND_CHAR);
3323 addptr(result, 32);
3324 subl(tmp, 2*stride);
3325 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
3326 jmp(SCAN_TO_8_CHAR);
3327 bind(SCAN_TO_8_CHAR_INIT);
3328 movdl(vec1, ch);
3329 pshuflw(vec1, vec1, 0x00);
3330 pshufd(vec1, vec1, 0);
3331 pxor(vec2, vec2);
3332 }
3333 bind(SCAN_TO_8_CHAR);
3334 cmpl(cnt1, stride);
3335 jcc(Assembler::less, SCAN_TO_CHAR);
3336 if (UseAVX < 2) {
3337 movdl(vec1, ch);
3338 pshuflw(vec1, vec1, 0x00);
3339 pshufd(vec1, vec1, 0);
3340 pxor(vec2, vec2);
3341 }
3342 movl(tmp, cnt1);
3343 andl(tmp, 0xFFFFFFF8); //vector count (in chars)
3344 andl(cnt1,0x00000007); //tail count (in chars)
3345
3346 bind(SCAN_TO_8_CHAR_LOOP);
3347 movdqu(vec3, Address(result, 0));
3348 pcmpeqw(vec3, vec1);
3349 ptest(vec2, vec3);
3350 jcc(Assembler::carryClear, FOUND_CHAR);
3351 addptr(result, 16);
3352 subl(tmp, stride);
3353 jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
3354 bind(SCAN_TO_CHAR);
3355 testl(cnt1, cnt1);
3356 jcc(Assembler::zero, RET_NOT_FOUND);
3357 bind(SCAN_TO_CHAR_LOOP);
3358 load_unsigned_short(tmp, Address(result, 0));
3359 cmpl(ch, tmp);
3360 jccb(Assembler::equal, FOUND_SEQ_CHAR);
3361 addptr(result, 2);
3362 subl(cnt1, 1);
3363 jccb(Assembler::zero, RET_NOT_FOUND);
3364 jmp(SCAN_TO_CHAR_LOOP);
3365
3366 bind(RET_NOT_FOUND);
3367 movl(result, -1);
3368 jmpb(DONE_LABEL);
3369
3370 bind(FOUND_CHAR);
3371 if (UseAVX >= 2) {
3372 vpmovmskb(tmp, vec3);
3373 } else {
3374 pmovmskb(tmp, vec3);
3375 }
3376 bsfl(ch, tmp);
3377 addptr(result, ch);
3378
3379 bind(FOUND_SEQ_CHAR);
3380 subptr(result, str1);
3381 shrl(result, 1);
3382
3383 bind(DONE_LABEL);
3384 } // string_indexof_char
3385
3386 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3387 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3388 ShortBranchVerifier sbv(this);
3389 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3390
3391 int stride = 16;
3392
3393 Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
3394 SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
3395 RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
3396 FOUND_SEQ_CHAR, DONE_LABEL;
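  // Byte-array variant of string_indexof_char: same structure, but 32 bytes
  // per AVX2 step (vpcmpeqb), 16 bytes per SSE step (pcmpeqb), and a one-byte
  // scalar tail loop.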
3397
3398 movptr(result, str1);
3399 if (UseAVX >= 2) {
3400 cmpl(cnt1, stride);
3401 jcc(Assembler::less, SCAN_TO_CHAR_INIT);
3402 cmpl(cnt1, stride*2);
3403 jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
3404 movdl(vec1, ch);
3405 vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
3406 vpxor(vec2, vec2);
3407 movl(tmp, cnt1);
3408 andl(tmp, 0xFFFFFFE0); //vector count (in chars)
3409 andl(cnt1,0x0000001F); //tail count (in chars)
3410
3411 bind(SCAN_TO_32_CHAR_LOOP);
3412 vmovdqu(vec3, Address(result, 0));
3413 vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
3414 vptest(vec2, vec3);
3415 jcc(Assembler::carryClear, FOUND_CHAR);
3416 addptr(result, 32);
3417 subl(tmp, stride*2);
3418 jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
3419 jmp(SCAN_TO_16_CHAR);
3420
3421 bind(SCAN_TO_16_CHAR_INIT);
3422 movdl(vec1, ch);
3423 pxor(vec2, vec2);
3424 pshufb(vec1, vec2);
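    // pshufb with an all-zero shuffle mask (vec2) replicates byte 0 of vec1
    // into every lane, broadcasting the search byte.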
3425 }
3426
3427 bind(SCAN_TO_16_CHAR);
3428 cmpl(cnt1, stride);
  jcc(Assembler::less, SCAN_TO_CHAR_INIT); // less than 16 entries left
3430 if (UseAVX < 2) {
3431 movdl(vec1, ch);
3432 pxor(vec2, vec2);
3433 pshufb(vec1, vec2);
3434 }
3435 movl(tmp, cnt1);
3436 andl(tmp, 0xFFFFFFF0); //vector count (in bytes)
3437 andl(cnt1,0x0000000F); //tail count (in bytes)
3438
3439 bind(SCAN_TO_16_CHAR_LOOP);
3440 movdqu(vec3, Address(result, 0));
3441 pcmpeqb(vec3, vec1);
3442 ptest(vec2, vec3);
3443 jcc(Assembler::carryClear, FOUND_CHAR);
3444 addptr(result, 16);
3445 subl(tmp, stride);
  jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); // last 16 items...
3447
3448 bind(SCAN_TO_CHAR_INIT);
3449 testl(cnt1, cnt1);
3450 jcc(Assembler::zero, RET_NOT_FOUND);
3451 bind(SCAN_TO_CHAR_LOOP);
3452 load_unsigned_byte(tmp, Address(result, 0));
3453 cmpl(ch, tmp);
3454 jccb(Assembler::equal, FOUND_SEQ_CHAR);
3455 addptr(result, 1);
3456 subl(cnt1, 1);
3457 jccb(Assembler::zero, RET_NOT_FOUND);
3458 jmp(SCAN_TO_CHAR_LOOP);
3459
3460 bind(RET_NOT_FOUND);
3461 movl(result, -1);
3462 jmpb(DONE_LABEL);
3463
3464 bind(FOUND_CHAR);
3465 if (UseAVX >= 2) {
3466 vpmovmskb(tmp, vec3);
3467 } else {
3468 pmovmskb(tmp, vec3);
3469 }
3470 bsfl(ch, tmp);
3471 addptr(result, ch);
3472
3473 bind(FOUND_SEQ_CHAR);
3474 subptr(result, str1);
3475
3476 bind(DONE_LABEL);
3477 } // stringL_indexof_char
3478
3479 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
3480 switch (eltype) {
3481 case T_BOOLEAN: return sizeof(jboolean);
3482 case T_BYTE: return sizeof(jbyte);
3483 case T_SHORT: return sizeof(jshort);
3484 case T_CHAR: return sizeof(jchar);
3485 case T_INT: return sizeof(jint);
3486 default:
3487 ShouldNotReachHere();
3488 return -1;
3489 }
3490 }
3491
3492 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
3493 switch (eltype) {
3494 // T_BOOLEAN used as surrogate for unsigned byte
3495 case T_BOOLEAN: movzbl(dst, src); break;
3496 case T_BYTE: movsbl(dst, src); break;
3497 case T_SHORT: movswl(dst, src); break;
3498 case T_CHAR: movzwl(dst, src); break;
3499 case T_INT: movl(dst, src); break;
3500 default:
3501 ShouldNotReachHere();
3502 }
3503 }
3504
3505 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) {
3506 load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3507 }
3508
3509 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) {
3510 load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3511 }
3512
3513 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) {
3514 const int vlen = Assembler::AVX_256bit;
3515 switch (eltype) {
3516 case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT); break;
3517 case T_BYTE: vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT); break;
3518 case T_SHORT: vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3519 case T_CHAR: vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3520 case T_INT:
3521 // do nothing
3522 break;
3523 default:
3524 ShouldNotReachHere();
3525 }
3526 }
3527
3528 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result,
3529 Register index, Register tmp2, Register tmp3, XMMRegister vnext,
3530 XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
3531 XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
3532 XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
3533 BasicType eltype) {
3534 ShortBranchVerifier sbv(this);
3535 assert(UseAVX >= 2, "AVX2 intrinsics are required");
3536 assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3);
3537 assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3);
3538
3539 Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN,
3540 SHORT_UNROLLED_LOOP_EXIT,
3541 UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME,
3542 UNROLLED_VECTOR_LOOP_BEGIN,
3543 END;
3544 switch (eltype) {
3545 case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
3546 case T_CHAR: BLOCK_COMMENT("arrays_hashcode(char) {"); break;
3547 case T_BYTE: BLOCK_COMMENT("arrays_hashcode(byte) {"); break;
3548 case T_SHORT: BLOCK_COMMENT("arrays_hashcode(short) {"); break;
3549 case T_INT: BLOCK_COMMENT("arrays_hashcode(int) {"); break;
3550 default: BLOCK_COMMENT("arrays_hashcode {"); break;
3551 }
3552
3553 // For "renaming" for readibility of the code
3554 const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
3555 vresult[] = { vresult0, vresult1, vresult2, vresult3 },
3556 vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };
3557
3558 const int elsize = arrays_hashcode_elsize(eltype);
3559
3560 /*
3561 if (cnt1 >= 2) {
3562 if (cnt1 >= 32) {
3563 UNROLLED VECTOR LOOP
3564 }
3565 UNROLLED SCALAR LOOP
3566 }
3567 SINGLE SCALAR
3568 */
3569
3570 cmpl(cnt1, 32);
3571 jcc(Assembler::less, SHORT_UNROLLED_BEGIN);
3572
3573 // cnt1 >= 32 && generate_vectorized_loop
3574 xorl(index, index);
3575
3576 // vresult = IntVector.zero(I256);
3577 for (int idx = 0; idx < 4; idx++) {
3578 vpxor(vresult[idx], vresult[idx]);
3579 }
3580 // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
3581 Register bound = tmp2;
3582 Register next = tmp3;
3583 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
3584 movl(next, Address(tmp2, 0));
3585 movdl(vnext, next);
3586 vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);
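  // power_of_31_backwards is assumed to hold the powers of 31 in decreasing
  // order ([31^32, 31^31, ..., 31^0]), so vnext broadcasts 31^32 and one
  // 32-element iteration advances every lane's coefficient by 32 positions.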
3587
3588 // index = 0;
3589 // bound = cnt1 & ~(32 - 1);
3590 movl(bound, cnt1);
3591 andl(bound, ~(32 - 1));
3592 // for (; index < bound; index += 32) {
3593 bind(UNROLLED_VECTOR_LOOP_BEGIN);
3594 // result *= next;
3595 imull(result, next);
  // loop fission to front-load the cost of fetching from memory; OOO execution
  // can then hopefully do a better job of prefetching
3598 for (int idx = 0; idx < 4; idx++) {
3599 arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
3600 }
3601 // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
3602 for (int idx = 0; idx < 4; idx++) {
3603 vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
3604 arrays_hashcode_elvcast(vtmp[idx], eltype);
3605 vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
3606 }
3607 // index += 32;
3608 addl(index, 32);
3609 // index < bound;
3610 cmpl(index, bound);
3611 jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
3612 // }
3613
3614 lea(ary1, Address(ary1, bound, Address::times(elsize)));
3615 subl(cnt1, bound);
3616 // release bound
3617
3618 // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
3619 for (int idx = 0; idx < 4; idx++) {
3620 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint))));
3621 arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT);
3622 vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
3623 }
3624 // result += vresult.reduceLanes(ADD);
3625 for (int idx = 0; idx < 4; idx++) {
3626 reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
3627 }
3628
3629 // } else if (cnt1 < 32) {
3630
3631 bind(SHORT_UNROLLED_BEGIN);
3632 // int i = 1;
3633 movl(index, 1);
3634 cmpl(index, cnt1);
3635 jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);
3636
3637 // for (; i < cnt1 ; i += 2) {
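  // Each iteration folds two elements:
  //   result = result*31*31 + e[i-1]*31 + e[i]
  // where 961 == 31*31 and e*31 is computed as (e << 5) - e.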
3638 bind(SHORT_UNROLLED_LOOP_BEGIN);
3639 movl(tmp3, 961);
3640 imull(result, tmp3);
3641 arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3642 movl(tmp3, tmp2);
3643 shll(tmp3, 5);
3644 subl(tmp3, tmp2);
3645 addl(result, tmp3);
3646 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype);
3647 addl(result, tmp3);
3648 addl(index, 2);
3649 cmpl(index, cnt1);
3650 jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN);
3651
3652 // }
3653 // if (i >= cnt1) {
3654 bind(SHORT_UNROLLED_LOOP_EXIT);
3655 jccb(Assembler::greater, END);
3656 movl(tmp2, result);
3657 shll(result, 5);
3658 subl(result, tmp2);
3659 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3660 addl(result, tmp3);
3661 // }
3662 bind(END);
3663
3664 BLOCK_COMMENT("} // arrays_hashcode");
3665
3666 } // arrays_hashcode
3667
3668 // helper function for string_compare
3669 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3670 Address::ScaleFactor scale, Address::ScaleFactor scale1,
3671 Address::ScaleFactor scale2, Register index, int ae) {
3672 if (ae == StrIntrinsicNode::LL) {
3673 load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3674 load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3675 } else if (ae == StrIntrinsicNode::UU) {
3676 load_unsigned_short(elem1, Address(str1, index, scale, 0));
3677 load_unsigned_short(elem2, Address(str2, index, scale, 0));
3678 } else {
3679 load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3680 load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3681 }
3682 }
3683
3684 // Compare strings, used for char[] and byte[].
3685 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3686 Register cnt1, Register cnt2, Register result,
3687 XMMRegister vec1, int ae, KRegister mask) {
3688 ShortBranchVerifier sbv(this);
3689 Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
  Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only with AVX3
3691 int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3692 int stride2x2 = 0x40;
3693 Address::ScaleFactor scale = Address::no_scale;
3694 Address::ScaleFactor scale1 = Address::no_scale;
3695 Address::ScaleFactor scale2 = Address::no_scale;
3696
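  // The generated code implements, roughly, the following pseudocode:
  //   int lim = min(cnt1, cnt2);
  //   for (int i = 0; i < lim; i++) {
  //     if (str1[i] != str2[i]) return str1[i] - str2[i];
  //   }
  //   return cnt1 - cnt2;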
3697 if (ae != StrIntrinsicNode::LL) {
3698 stride2x2 = 0x20;
3699 }
3700
3701 if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3702 shrl(cnt2, 1);
3703 }
  // Compute the minimum of the string lengths and the
  // difference of the string lengths (pushed on the stack)
  // using conditional moves.
3707 movl(result, cnt1);
3708 subl(cnt1, cnt2);
3709 push(cnt1);
3710 cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2)
3711
3712 // Is the minimum length zero?
3713 testl(cnt2, cnt2);
3714 jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3715 if (ae == StrIntrinsicNode::LL) {
3716 // Load first bytes
3717 load_unsigned_byte(result, Address(str1, 0)); // result = str1[0]
3718 load_unsigned_byte(cnt1, Address(str2, 0)); // cnt1 = str2[0]
3719 } else if (ae == StrIntrinsicNode::UU) {
3720 // Load first characters
3721 load_unsigned_short(result, Address(str1, 0));
3722 load_unsigned_short(cnt1, Address(str2, 0));
3723 } else {
3724 load_unsigned_byte(result, Address(str1, 0));
3725 load_unsigned_short(cnt1, Address(str2, 0));
3726 }
3727 subl(result, cnt1);
3728 jcc(Assembler::notZero, POP_LABEL);
3729
3730 if (ae == StrIntrinsicNode::UU) {
3731 // Divide length by 2 to get number of chars
3732 shrl(cnt2, 1);
3733 }
3734 cmpl(cnt2, 1);
3735 jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3736
  // Check if the strings start at the same location and set up scale and stride
3738 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3739 cmpptr(str1, str2);
3740 jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3741 if (ae == StrIntrinsicNode::LL) {
3742 scale = Address::times_1;
3743 stride = 16;
3744 } else {
3745 scale = Address::times_2;
3746 stride = 8;
3747 }
3748 } else {
3749 scale1 = Address::times_1;
3750 scale2 = Address::times_2;
3751 // scale not used
3752 stride = 8;
3753 }
3754
3755 if (UseAVX >= 2 && UseSSE42Intrinsics) {
3756 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3757 Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3758 Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3759 Label COMPARE_TAIL_LONG;
    Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only with AVX3
3761
3762 int pcmpmask = 0x19;
3763 if (ae == StrIntrinsicNode::LL) {
3764 pcmpmask &= ~0x01;
3765 }
3766
    // Set up to compare 16-char (32-byte) vectors,
    // starting from the first character again because it has an aligned address.
3769 if (ae == StrIntrinsicNode::LL) {
3770 stride2 = 32;
3771 } else {
3772 stride2 = 16;
3773 }
3774 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3775 adr_stride = stride << scale;
3776 } else {
3777 adr_stride1 = 8; //stride << scale1;
3778 adr_stride2 = 16; //stride << scale2;
3779 }
3780
3781 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
    // rax and rdx are used by pcmpestri as element counters
3783 movl(result, cnt2);
3784 andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count
3785 jcc(Assembler::zero, COMPARE_TAIL_LONG);
3786
    // Fast path: compare the first two 8-char vectors.
3788 bind(COMPARE_16_CHARS);
3789 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3790 movdqu(vec1, Address(str1, 0));
3791 } else {
3792 pmovzxbw(vec1, Address(str1, 0));
3793 }
3794 pcmpestri(vec1, Address(str2, 0), pcmpmask);
3795 jccb(Assembler::below, COMPARE_INDEX_CHAR);
3796
3797 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3798 movdqu(vec1, Address(str1, adr_stride));
3799 pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3800 } else {
3801 pmovzxbw(vec1, Address(str1, adr_stride1));
3802 pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3803 }
3804 jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3805 addl(cnt1, stride);
3806
3807 // Compare the characters at index in cnt1
3808 bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3809 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3810 subl(result, cnt2);
3811 jmp(POP_LABEL);
3812
3813 // Setup the registers to start vector comparison loop
3814 bind(COMPARE_WIDE_VECTORS);
3815 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3816 lea(str1, Address(str1, result, scale));
3817 lea(str2, Address(str2, result, scale));
3818 } else {
3819 lea(str1, Address(str1, result, scale1));
3820 lea(str2, Address(str2, result, scale2));
3821 }
3822 subl(result, stride2);
3823 subl(cnt2, stride2);
3824 jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3825 negptr(result);
3826
    // In a loop, compare 16 chars (32 bytes) at once using (vpxor+vptest)
3828 bind(COMPARE_WIDE_VECTORS_LOOP);
3829
    if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // try the 64-byte fast loop
3831 cmpl(cnt2, stride2x2);
3832 jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3833 testl(cnt2, stride2x2-1); // cnt2 holds the vector count
3834 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // means we cannot subtract by 0x40
3835
3836 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3837 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3838 evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
        evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11 if operands are equal; otherwise k7 has some zero bits
3840 } else {
3841 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
        evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11 if operands are equal; otherwise k7 has some zero bits
3843 }
3844 kortestql(mask, mask);
3845 jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare
3846 addptr(result, stride2x2); // update since we already compared at this addr
3847 subl(cnt2, stride2x2); // and sub the size too
3848 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3849
3850 vpxor(vec1, vec1);
3851 jmpb(COMPARE_WIDE_TAIL);
3852 }//if (VM_Version::supports_avx512vlbw())
3853
3854 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3855 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3856 vmovdqu(vec1, Address(str1, result, scale));
3857 vpxor(vec1, Address(str2, result, scale));
3858 } else {
3859 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3860 vpxor(vec1, Address(str2, result, scale2));
3861 }
3862 vptest(vec1, vec1);
3863 jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3864 addptr(result, stride2);
3865 subl(cnt2, stride2);
3866 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3867 // clean upper bits of YMM registers
3868 vpxor(vec1, vec1);
3869
3870 // compare wide vectors tail
3871 bind(COMPARE_WIDE_TAIL);
3872 testptr(result, result);
3873 jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3874
3875 movl(result, stride2);
3876 movl(cnt2, result);
3877 negptr(result);
3878 jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3879
    // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
3881 bind(VECTOR_NOT_EQUAL);
3882 // clean upper bits of YMM registers
3883 vpxor(vec1, vec1);
3884 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3885 lea(str1, Address(str1, result, scale));
3886 lea(str2, Address(str2, result, scale));
3887 } else {
3888 lea(str1, Address(str1, result, scale1));
3889 lea(str2, Address(str2, result, scale2));
3890 }
3891 jmp(COMPARE_16_CHARS);
3892
    // Compare tail chars, length between 1 and 15 chars
3894 bind(COMPARE_TAIL_LONG);
3895 movl(cnt2, result);
3896 cmpl(cnt2, stride);
3897 jcc(Assembler::less, COMPARE_SMALL_STR);
3898
3899 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3900 movdqu(vec1, Address(str1, 0));
3901 } else {
3902 pmovzxbw(vec1, Address(str1, 0));
3903 }
3904 pcmpestri(vec1, Address(str2, 0), pcmpmask);
3905 jcc(Assembler::below, COMPARE_INDEX_CHAR);
3906 subptr(cnt2, stride);
3907 jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3908 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3909 lea(str1, Address(str1, result, scale));
3910 lea(str2, Address(str2, result, scale));
3911 } else {
3912 lea(str1, Address(str1, result, scale1));
3913 lea(str2, Address(str2, result, scale2));
3914 }
3915 negptr(cnt2);
3916 jmpb(WHILE_HEAD_LABEL);
3917
3918 bind(COMPARE_SMALL_STR);
3919 } else if (UseSSE42Intrinsics) {
3920 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3921 int pcmpmask = 0x19;
    // Set up to compare 8-char (16-byte) vectors,
    // starting from the first character again because it has an aligned address.
3924 movl(result, cnt2);
3925 andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count
3926 if (ae == StrIntrinsicNode::LL) {
3927 pcmpmask &= ~0x01;
3928 }
3929 jcc(Assembler::zero, COMPARE_TAIL);
3930 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3931 lea(str1, Address(str1, result, scale));
3932 lea(str2, Address(str2, result, scale));
3933 } else {
3934 lea(str1, Address(str1, result, scale1));
3935 lea(str2, Address(str2, result, scale2));
3936 }
3937 negptr(result);
3938
3939 // pcmpestri
3940 // inputs:
3941 // vec1- substring
3942 // rax - negative string length (elements count)
3943 // mem - scanned string
3944 // rdx - string length (elements count)
3945 // pcmpmask - cmp mode: 11000 (string compare with negated result)
3946 // + 00 (unsigned bytes) or + 01 (unsigned shorts)
3947 // outputs:
3948 // rcx - first mismatched element index
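    // For example (a sketch): comparing 8 UTF-16 chars with the negated
    // equal-each mode, rcx gets the index of the first differing element, or
    // the element count (8) with CF clear when all valid elements match.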
3949 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3950
3951 bind(COMPARE_WIDE_VECTORS);
3952 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3953 movdqu(vec1, Address(str1, result, scale));
3954 pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3955 } else {
3956 pmovzxbw(vec1, Address(str1, result, scale1));
3957 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3958 }
3959 // After pcmpestri cnt1(rcx) contains mismatched element index
3960
3961 jccb(Assembler::below, VECTOR_NOT_EQUAL); // CF==1
3962 addptr(result, stride);
3963 subptr(cnt2, stride);
3964 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3965
3966 // compare wide vectors tail
3967 testptr(result, result);
3968 jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3969
3970 movl(cnt2, stride);
3971 movl(result, stride);
3972 negptr(result);
3973 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3974 movdqu(vec1, Address(str1, result, scale));
3975 pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3976 } else {
3977 pmovzxbw(vec1, Address(str1, result, scale1));
3978 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3979 }
3980 jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
3981
3982 // Mismatched characters in the vectors
3983 bind(VECTOR_NOT_EQUAL);
3984 addptr(cnt1, result);
3985 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3986 subl(result, cnt2);
3987 jmpb(POP_LABEL);
3988
3989 bind(COMPARE_TAIL); // limit is zero
3990 movl(cnt2, result);
3991 // Fallthru to tail compare
3992 }
3993 // Shift str2 and str1 to the end of the arrays, negate min
3994 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3995 lea(str1, Address(str1, cnt2, scale));
3996 lea(str2, Address(str2, cnt2, scale));
3997 } else {
3998 lea(str1, Address(str1, cnt2, scale1));
3999 lea(str2, Address(str2, cnt2, scale2));
4000 }
4001 decrementl(cnt2); // first character was compared already
4002 negptr(cnt2);
4003
4004 // Compare the rest of the elements
4005 bind(WHILE_HEAD_LABEL);
4006 load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
4007 subl(result, cnt1);
4008 jccb(Assembler::notZero, POP_LABEL);
4009 increment(cnt2);
4010 jccb(Assembler::notZero, WHILE_HEAD_LABEL);
4011
4012 // Strings are equal up to min length. Return the length difference.
4013 bind(LENGTH_DIFF_LABEL);
4014 pop(result);
4015 if (ae == StrIntrinsicNode::UU) {
4016 // Divide diff by 2 to get number of chars
4017 sarl(result, 1);
4018 }
4019 jmpb(DONE_LABEL);
4020
4021 if (VM_Version::supports_avx512vlbw()) {
4022
4023 bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
4024
4025 kmovql(cnt1, mask);
4026 notq(cnt1);
4027 bsfq(cnt2, cnt1);
4028 if (ae != StrIntrinsicNode::LL) {
4029 // Divide diff by 2 to get number of chars
4030 sarl(cnt2, 1);
4031 }
4032 addq(result, cnt2);
4033 if (ae == StrIntrinsicNode::LL) {
4034 load_unsigned_byte(cnt1, Address(str2, result));
4035 load_unsigned_byte(result, Address(str1, result));
4036 } else if (ae == StrIntrinsicNode::UU) {
4037 load_unsigned_short(cnt1, Address(str2, result, scale));
4038 load_unsigned_short(result, Address(str1, result, scale));
4039 } else {
4040 load_unsigned_short(cnt1, Address(str2, result, scale2));
4041 load_unsigned_byte(result, Address(str1, result, scale1));
4042 }
4043 subl(result, cnt1);
4044 jmpb(POP_LABEL);
4045 }//if (VM_Version::supports_avx512vlbw())
4046
4047 // Discard the stored length difference
4048 bind(POP_LABEL);
4049 pop(cnt1);
4050
4051 // That's it
4052 bind(DONE_LABEL);
4053 if(ae == StrIntrinsicNode::UL) {
4054 negl(result);
4055 }
4056
4057 }
4058
// Search for a non-ASCII character (negative byte value) in a byte array;
4060 // return the index of the first such character, otherwise the length
4061 // of the array segment searched.
4062 // ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
4063 // @IntrinsicCandidate
4064 // public static int countPositives(byte[] ba, int off, int len) {
4065 // for (int i = off; i < off + len; i++) {
4066 // if (ba[i] < 0) {
4067 // return i - off;
4068 // }
4069 // }
4070 // return len;
4071 // }
4072 void C2_MacroAssembler::count_positives(Register ary1, Register len,
4073 Register result, Register tmp1,
4074 XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
4075 // rsi: byte array
4076 // rcx: len
4077 // rax: result
4078 ShortBranchVerifier sbv(this);
4079 assert_different_registers(ary1, len, result, tmp1);
4080 assert_different_registers(vec1, vec2);
4081 Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
4082
4083 movl(result, len); // copy
4084 // len == 0
4085 testl(len, len);
4086 jcc(Assembler::zero, DONE);
4087
4088 if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
4089 VM_Version::supports_avx512vlbw() &&
4090 VM_Version::supports_bmi2()) {
4091
4092 Label test_64_loop, test_tail, BREAK_LOOP;
4093 movl(tmp1, len);
4094 vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
4095
4096 andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F
4097 andl(len, 0xffffffc0); // vector count (in chars)
4098 jccb(Assembler::zero, test_tail);
4099
4100 lea(ary1, Address(ary1, len, Address::times_1));
4101 negptr(len);
4102
4103 bind(test_64_loop);
4104 // Check whether our 64 elements of size byte contain negatives
4105 evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
4106 kortestql(mask1, mask1);
4107 jcc(Assembler::notZero, BREAK_LOOP);
4108
4109 addptr(len, 64);
4110 jccb(Assembler::notZero, test_64_loop);
4111
4112 bind(test_tail);
4113 // bail out when there is nothing to be done
4114 testl(tmp1, -1);
4115 jcc(Assembler::zero, DONE);
4116
4117
    // Check the tail for absence of negatives:
    // ~(~0 << len) yields a mask with the low len bits set.
4120 {
4121 Register tmp3_aliased = len;
4122 mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
4123 shlxq(tmp3_aliased, tmp3_aliased, tmp1);
4124 notq(tmp3_aliased);
4125 kmovql(mask2, tmp3_aliased);
4126 }
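    // For example, tmp1 == 5: ~(~0 << 5) == 0b11111, so mask2 enables exactly
    // the five tail bytes for the masked compare below.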
4127
4128 evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
4129 ktestq(mask1, mask2);
4130 jcc(Assembler::zero, DONE);
4131
    // do a full check for negative bytes in the tail
4133 movl(len, tmp1); // tmp1 holds low 6-bit from original len;
4134 // ary1 already pointing to the right place
4135 jmpb(TAIL_START);
4136
4137 bind(BREAK_LOOP);
4138 // At least one byte in the last 64 byte block was negative.
4139 // Set up to look at the last 64 bytes as if they were a tail
4140 lea(ary1, Address(ary1, len, Address::times_1));
4141 addptr(result, len);
4142 // Ignore the very last byte: if all others are positive,
4143 // it must be negative, so we can skip right to the 2+1 byte
4144 // end comparison at this point
4145 orl(result, 63);
4146 movl(len, 63);
4147 // Fallthru to tail compare
4148 } else {
4149
4150 if (UseAVX >= 2) {
4151 // With AVX2, use 32-byte vector compare
4152 Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4153
4154 // Compare 32-byte vectors
4155 testl(len, 0xffffffe0); // vector count (in bytes)
4156 jccb(Assembler::zero, TAIL_START);
4157
4158 andl(len, 0xffffffe0);
4159 lea(ary1, Address(ary1, len, Address::times_1));
4160 negptr(len);
4161
      movl(tmp1, 0x80808080); // create mask to test for negative bytes in vector
4163 movdl(vec2, tmp1);
4164 vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
4165
4166 bind(COMPARE_WIDE_VECTORS);
4167 vmovdqu(vec1, Address(ary1, len, Address::times_1));
4168 vptest(vec1, vec2);
4169 jccb(Assembler::notZero, BREAK_LOOP);
4170 addptr(len, 32);
4171 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4172
4173 testl(result, 0x0000001f); // any bytes remaining?
4174 jcc(Assembler::zero, DONE);
4175
4176 // Quick test using the already prepared vector mask
4177 movl(len, result);
4178 andl(len, 0x0000001f);
4179 vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
4180 vptest(vec1, vec2);
4181 jcc(Assembler::zero, DONE);
4182 // There are zeros, jump to the tail to determine exactly where
4183 jmpb(TAIL_START);
4184
4185 bind(BREAK_LOOP);
4186 // At least one byte in the last 32-byte vector is negative.
4187 // Set up to look at the last 32 bytes as if they were a tail
4188 lea(ary1, Address(ary1, len, Address::times_1));
4189 addptr(result, len);
4190 // Ignore the very last byte: if all others are positive,
4191 // it must be negative, so we can skip right to the 2+1 byte
4192 // end comparison at this point
4193 orl(result, 31);
4194 movl(len, 31);
4195 // Fallthru to tail compare
4196 } else if (UseSSE42Intrinsics) {
4197 // With SSE4.2, use double quad vector compare
4198 Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4199
4200 // Compare 16-byte vectors
4201 testl(len, 0xfffffff0); // vector count (in bytes)
4202 jcc(Assembler::zero, TAIL_START);
4203
4204 andl(len, 0xfffffff0);
4205 lea(ary1, Address(ary1, len, Address::times_1));
4206 negptr(len);
4207
4208 movl(tmp1, 0x80808080);
4209 movdl(vec2, tmp1);
4210 pshufd(vec2, vec2, 0);
4211
4212 bind(COMPARE_WIDE_VECTORS);
4213 movdqu(vec1, Address(ary1, len, Address::times_1));
4214 ptest(vec1, vec2);
4215 jccb(Assembler::notZero, BREAK_LOOP);
4216 addptr(len, 16);
4217 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4218
4219 testl(result, 0x0000000f); // len is zero, any bytes remaining?
4220 jcc(Assembler::zero, DONE);
4221
4222 // Quick test using the already prepared vector mask
4223 movl(len, result);
4224 andl(len, 0x0000000f); // tail count (in bytes)
4225 movdqu(vec1, Address(ary1, len, Address::times_1, -16));
4226 ptest(vec1, vec2);
4227 jcc(Assembler::zero, DONE);
4228 jmpb(TAIL_START);
4229
4230 bind(BREAK_LOOP);
4231 // At least one byte in the last 16-byte vector is negative.
4232 // Set up and look at the last 16 bytes as if they were a tail
4233 lea(ary1, Address(ary1, len, Address::times_1));
4234 addptr(result, len);
4235 // Ignore the very last byte: if all others are positive,
4236 // it must be negative, so we can skip right to the 2+1 byte
4237 // end comparison at this point
4238 orl(result, 15);
4239 movl(len, 15);
4240 // Fallthru to tail compare
4241 }
4242 }
4243
4244 bind(TAIL_START);
4245 // Compare 4-byte vectors
4246 andl(len, 0xfffffffc); // vector count (in bytes)
4247 jccb(Assembler::zero, COMPARE_CHAR);
4248
4249 lea(ary1, Address(ary1, len, Address::times_1));
4250 negptr(len);
4251
4252 bind(COMPARE_VECTORS);
4253 movl(tmp1, Address(ary1, len, Address::times_1));
4254 andl(tmp1, 0x80808080);
4255 jccb(Assembler::notZero, TAIL_ADJUST);
4256 addptr(len, 4);
4257 jccb(Assembler::notZero, COMPARE_VECTORS);
4258
4259 // Compare trailing char (final 2-3 bytes), if any
4260 bind(COMPARE_CHAR);
4261
4262 testl(result, 0x2); // tail char
4263 jccb(Assembler::zero, COMPARE_BYTE);
4264 load_unsigned_short(tmp1, Address(ary1, 0));
4265 andl(tmp1, 0x00008080);
4266 jccb(Assembler::notZero, CHAR_ADJUST);
4267 lea(ary1, Address(ary1, 2));
4268
4269 bind(COMPARE_BYTE);
4270 testl(result, 0x1); // tail byte
4271 jccb(Assembler::zero, DONE);
4272 load_unsigned_byte(tmp1, Address(ary1, 0));
4273 testl(tmp1, 0x00000080);
4274 jccb(Assembler::zero, DONE);
4275 subptr(result, 1);
4276 jmpb(DONE);
4277
4278 bind(TAIL_ADJUST);
  // There are negative bytes in the last 4-byte block.
  // Adjust result and check the next three bytes
4281 addptr(result, len);
4282 orl(result, 3);
4283 lea(ary1, Address(ary1, len, Address::times_1));
4284 jmpb(COMPARE_CHAR);
4285
4286 bind(CHAR_ADJUST);
4287 // We are looking at a char + optional byte tail, and found that one
4288 // of the bytes in the char is negative. Adjust the result, check the
4289 // first byte and readjust if needed.
4290 andl(result, 0xfffffffc);
4291 testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
4292 jccb(Assembler::notZero, DONE);
4293 addptr(result, 1);
4294
4295 // That's it
4296 bind(DONE);
4297 if (UseAVX >= 2) {
4298 // clean upper bits of YMM registers
4299 vpxor(vec1, vec1);
4300 vpxor(vec2, vec2);
4301 }
4302 }
4303
// Compare char[] or byte[] arrays, or substrings, aligned to 4 bytes.
4305 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
4306 Register limit, Register result, Register chr,
4307 XMMRegister vec1, XMMRegister vec2, bool is_char,
4308 KRegister mask, bool expand_ary2) {
4309 // for expand_ary2, limit is the (smaller) size of the second array.
4310 ShortBranchVerifier sbv(this);
4311 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
4312
4313 assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)),
4314 "Expansion only implemented for AVX2");
4315
4316 int length_offset = arrayOopDesc::length_offset_in_bytes();
4317 int base_offset = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
4318
4319 Address::ScaleFactor scaleFactor = expand_ary2 ? Address::times_2 : Address::times_1;
4320 int scaleIncr = expand_ary2 ? 8 : 16;
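  // With expand_ary2, ary1 holds 2-byte elements and ary2 1-byte elements:
  // each ary2 byte is zero-extended (vpmovzxbw) before comparing, limit
  // counts ary2 bytes, and ary1 is addressed with a times_2 scale.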
4321
4322 if (is_array_equ) {
4323 // Check the input args
4324 cmpoop(ary1, ary2);
4325 jcc(Assembler::equal, TRUE_LABEL);
4326
4327 // Need additional checks for arrays_equals.
4328 testptr(ary1, ary1);
4329 jcc(Assembler::zero, FALSE_LABEL);
4330 testptr(ary2, ary2);
4331 jcc(Assembler::zero, FALSE_LABEL);
4332
4333 // Check the lengths
4334 movl(limit, Address(ary1, length_offset));
4335 cmpl(limit, Address(ary2, length_offset));
4336 jcc(Assembler::notEqual, FALSE_LABEL);
4337 }
4338
4339 // count == 0
4340 testl(limit, limit);
4341 jcc(Assembler::zero, TRUE_LABEL);
4342
4343 if (is_array_equ) {
4344 // Load array address
4345 lea(ary1, Address(ary1, base_offset));
4346 lea(ary2, Address(ary2, base_offset));
4347 }
4348
4349 if (is_array_equ && is_char) {
4350 // arrays_equals when used for char[].
4351 shll(limit, 1); // byte count != 0
4352 }
4353 movl(result, limit); // copy
4354
4355 if (UseAVX >= 2) {
4356 // With AVX2, use 32-byte vector compare
4357 Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16;
4358
4359 // Compare 32-byte vectors
4360 if (expand_ary2) {
4361 andl(result, 0x0000000f); // tail count (in bytes)
4362 andl(limit, 0xfffffff0); // vector count (in bytes)
4363 jcc(Assembler::zero, COMPARE_TAIL);
4364 } else {
4365 andl(result, 0x0000001f); // tail count (in bytes)
4366 andl(limit, 0xffffffe0); // vector count (in bytes)
4367 jcc(Assembler::zero, COMPARE_TAIL_16);
4368 }
4369
4370 lea(ary1, Address(ary1, limit, scaleFactor));
4371 lea(ary2, Address(ary2, limit, Address::times_1));
4372 negptr(limit);
4373
    if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // try the 64-byte fast loop
4375 Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
4376
4377 cmpl(limit, -64);
4378 jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
4379
4380 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
4381
4382 evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
4383 evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
4384 kortestql(mask, mask);
4385 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare
4386 addptr(limit, 64); // update since we already compared at this addr
4387 cmpl(limit, -64);
4388 jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
4389
4390 // At this point we may still need to compare -limit+result bytes.
      // We could execute the next two instructions and just continue via the non-wide path:
4392 // cmpl(limit, 0);
4393 // jcc(Assembler::equal, COMPARE_TAIL); // true
4394 // But since we stopped at the points ary{1,2}+limit which are
4395 // not farther than 64 bytes from the ends of arrays ary{1,2}+result
4396 // (|limit| <= 32 and result < 32),
4397 // we may just compare the last 64 bytes.
4398 //
      addptr(result, -64); // it is safe because we just came from this area
4400 evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
4401 evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
4402 kortestql(mask, mask);
4403 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare
4404
4405 jmp(TRUE_LABEL);
4406
4407 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
4408
4409 }//if (VM_Version::supports_avx512vlbw())
4410
4411 bind(COMPARE_WIDE_VECTORS);
4412 vmovdqu(vec1, Address(ary1, limit, scaleFactor));
4413 if (expand_ary2) {
4414 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit);
4415 } else {
4416 vmovdqu(vec2, Address(ary2, limit, Address::times_1));
4417 }
4418 vpxor(vec1, vec2);
4419
4420 vptest(vec1, vec1);
4421 jcc(Assembler::notZero, FALSE_LABEL);
4422 addptr(limit, scaleIncr * 2);
4423 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4424
4425 testl(result, result);
4426 jcc(Assembler::zero, TRUE_LABEL);
4427
4428 vmovdqu(vec1, Address(ary1, result, scaleFactor, -32));
4429 if (expand_ary2) {
4430 vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit);
4431 } else {
4432 vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
4433 }
4434 vpxor(vec1, vec2);
4435
4436 vptest(vec1, vec1);
4437 jcc(Assembler::notZero, FALSE_LABEL);
4438 jmp(TRUE_LABEL);
4439
4440 bind(COMPARE_TAIL_16); // limit is zero
4441 movl(limit, result);
4442
4443 // Compare 16-byte chunks
4444 andl(result, 0x0000000f); // tail count (in bytes)
4445 andl(limit, 0xfffffff0); // vector count (in bytes)
4446 jcc(Assembler::zero, COMPARE_TAIL);
4447
4448 lea(ary1, Address(ary1, limit, scaleFactor));
4449 lea(ary2, Address(ary2, limit, Address::times_1));
4450 negptr(limit);
4451
4452 bind(COMPARE_WIDE_VECTORS_16);
4453 movdqu(vec1, Address(ary1, limit, scaleFactor));
4454 if (expand_ary2) {
4455 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit);
4456 } else {
4457 movdqu(vec2, Address(ary2, limit, Address::times_1));
4458 }
4459 pxor(vec1, vec2);
4460
4461 ptest(vec1, vec1);
4462 jcc(Assembler::notZero, FALSE_LABEL);
4463 addptr(limit, scaleIncr);
4464 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16);
4465
4466 bind(COMPARE_TAIL); // limit is zero
4467 movl(limit, result);
4468 // Fallthru to tail compare
4469 } else if (UseSSE42Intrinsics) {
4470 // With SSE4.2, use double quad vector compare
4471 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4472
4473 // Compare 16-byte vectors
4474 andl(result, 0x0000000f); // tail count (in bytes)
4475 andl(limit, 0xfffffff0); // vector count (in bytes)
4476 jcc(Assembler::zero, COMPARE_TAIL);
4477
4478 lea(ary1, Address(ary1, limit, Address::times_1));
4479 lea(ary2, Address(ary2, limit, Address::times_1));
4480 negptr(limit);
4481
4482 bind(COMPARE_WIDE_VECTORS);
4483 movdqu(vec1, Address(ary1, limit, Address::times_1));
4484 movdqu(vec2, Address(ary2, limit, Address::times_1));
4485 pxor(vec1, vec2);
4486
4487 ptest(vec1, vec1);
4488 jcc(Assembler::notZero, FALSE_LABEL);
4489 addptr(limit, 16);
4490 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4491
4492 testl(result, result);
4493 jcc(Assembler::zero, TRUE_LABEL);
4494
4495 movdqu(vec1, Address(ary1, result, Address::times_1, -16));
4496 movdqu(vec2, Address(ary2, result, Address::times_1, -16));
4497 pxor(vec1, vec2);
4498
4499 ptest(vec1, vec1);
4500 jccb(Assembler::notZero, FALSE_LABEL);
4501 jmpb(TRUE_LABEL);
4502
4503 bind(COMPARE_TAIL); // limit is zero
4504 movl(limit, result);
4505 // Fallthru to tail compare
4506 }
4507
4508 // Compare 4-byte vectors
4509 if (expand_ary2) {
4510 testl(result, result);
4511 jccb(Assembler::zero, TRUE_LABEL);
4512 } else {
4513 andl(limit, 0xfffffffc); // vector count (in bytes)
4514 jccb(Assembler::zero, COMPARE_CHAR);
4515 }
4516
4517 lea(ary1, Address(ary1, limit, scaleFactor));
4518 lea(ary2, Address(ary2, limit, Address::times_1));
4519 negptr(limit);
4520
4521 bind(COMPARE_VECTORS);
4522 if (expand_ary2) {
4523 // There are no "vector" operations for bytes to shorts
4524 movzbl(chr, Address(ary2, limit, Address::times_1));
4525 cmpw(Address(ary1, limit, Address::times_2), chr);
4526 jccb(Assembler::notEqual, FALSE_LABEL);
4527 addptr(limit, 1);
4528 jcc(Assembler::notZero, COMPARE_VECTORS);
4529 jmp(TRUE_LABEL);
4530 } else {
4531 movl(chr, Address(ary1, limit, Address::times_1));
4532 cmpl(chr, Address(ary2, limit, Address::times_1));
4533 jccb(Assembler::notEqual, FALSE_LABEL);
4534 addptr(limit, 4);
4535 jcc(Assembler::notZero, COMPARE_VECTORS);
4536 }
4537
4538 // Compare trailing char (final 2 bytes), if any
4539 bind(COMPARE_CHAR);
4540 testl(result, 0x2); // tail char
4541 jccb(Assembler::zero, COMPARE_BYTE);
4542 load_unsigned_short(chr, Address(ary1, 0));
4543 load_unsigned_short(limit, Address(ary2, 0));
4544 cmpl(chr, limit);
4545 jccb(Assembler::notEqual, FALSE_LABEL);
4546
4547 if (is_array_equ && is_char) {
4548 bind(COMPARE_BYTE);
4549 } else {
4550 lea(ary1, Address(ary1, 2));
4551 lea(ary2, Address(ary2, 2));
4552
4553 bind(COMPARE_BYTE);
4554 testl(result, 0x1); // tail byte
4555 jccb(Assembler::zero, TRUE_LABEL);
4556 load_unsigned_byte(chr, Address(ary1, 0));
4557 load_unsigned_byte(limit, Address(ary2, 0));
4558 cmpl(chr, limit);
4559 jccb(Assembler::notEqual, FALSE_LABEL);
4560 }
4561 bind(TRUE_LABEL);
4562 movl(result, 1); // return true
4563 jmpb(DONE);
4564
4565 bind(FALSE_LABEL);
4566 xorl(result, result); // return false
4567
4568 // That's it
4569 bind(DONE);
4570 if (UseAVX >= 2) {
4571 // clean upper bits of YMM registers
4572 vpxor(vec1, vec1);
4573 vpxor(vec2, vec2);
4574 }
4575 }
4576
4577 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) {
4578 #define __ masm.
4579 Register dst = stub.data<0>();
4580 XMMRegister src = stub.data<1>();
4581 address target = stub.data<2>();
4582 __ bind(stub.entry());
4583 __ subptr(rsp, 8);
4584 __ movdbl(Address(rsp), src);
4585 __ call(RuntimeAddress(target));
4586 // APX REX2 encoding for pop(dst) increases the stub size by 1 byte.
4587 __ pop(dst);
4588 __ jmp(stub.continuation());
4589 #undef __
4590 }
4591
4592 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) {
4593 assert(dst_bt == T_INT || dst_bt == T_LONG, "");
4594 assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, "");
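  // cvttss2si/cvttsd2si produce the "integer indefinite" value (0x80000000,
  // or presumably 0x8000000000000000 via double_sign_flip for the long case)
  // on NaN and out-of-range inputs; comparing the result against that value
  // routes exactly those cases to the fixup stub.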
4595
4596 address slowpath_target;
4597 if (dst_bt == T_INT) {
4598 if (src_bt == T_FLOAT) {
4599 cvttss2sil(dst, src);
4600 cmpl(dst, 0x80000000);
4601 slowpath_target = StubRoutines::x86::f2i_fixup();
4602 } else {
4603 cvttsd2sil(dst, src);
4604 cmpl(dst, 0x80000000);
4605 slowpath_target = StubRoutines::x86::d2i_fixup();
4606 }
4607 } else {
4608 if (src_bt == T_FLOAT) {
4609 cvttss2siq(dst, src);
4610 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4611 slowpath_target = StubRoutines::x86::f2l_fixup();
4612 } else {
4613 cvttsd2siq(dst, src);
4614 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4615 slowpath_target = StubRoutines::x86::d2l_fixup();
4616 }
4617 }
4618
4619 // Using the APX extended general purpose registers increases the instruction encoding size by 1 byte.
4620 int max_size = 23 + (UseAPX ? 1 : 0);
4621 auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, max_size, convertF2I_slowpath);
4622 jcc(Assembler::equal, stub->entry());
4623 bind(stub->continuation());
4624 }
4625
4626 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4627 XMMRegister src1, int imm8, bool merge, int vlen_enc) {
4628 switch(ideal_opc) {
4629 case Op_LShiftVS:
4630 Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
4631 case Op_LShiftVI:
4632 Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
4633 case Op_LShiftVL:
4634 Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
4635 case Op_RShiftVS:
4636 Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
4637 case Op_RShiftVI:
4638 Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
4639 case Op_RShiftVL:
4640 Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
4641 case Op_URShiftVS:
4642 Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
4643 case Op_URShiftVI:
4644 Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
4645 case Op_URShiftVL:
4646 Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
4647 case Op_RotateRightV:
4648 evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4649 case Op_RotateLeftV:
4650 evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4651 default:
4652 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]);
4653 break;
4654 }
4655 }
4656
4657 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4658 XMMRegister src2, bool is_unsigned, bool merge, int vlen_enc) {
4659 if (is_unsigned) {
4660 evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4661 } else {
4662 evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4663 }
4664 }
4665
4666 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4667 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4668 switch (elem_bt) {
4669 case T_BYTE:
4670 if (ideal_opc == Op_SaturatingAddV) {
4671 evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4672 } else {
4673 assert(ideal_opc == Op_SaturatingSubV, "");
4674 evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4675 }
4676 break;
4677 case T_SHORT:
4678 if (ideal_opc == Op_SaturatingAddV) {
4679 evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4680 } else {
4681 assert(ideal_opc == Op_SaturatingSubV, "");
4682 evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4683 }
4684 break;
4685 default:
4686 fatal("Unsupported type %s", type2name(elem_bt));
4687 break;
4688 }
4689 }
4690
4691 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4692 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4693 switch (elem_bt) {
4694 case T_BYTE:
4695 if (ideal_opc == Op_SaturatingAddV) {
4696 evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4697 } else {
4698 assert(ideal_opc == Op_SaturatingSubV, "");
4699 evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4700 }
4701 break;
4702 case T_SHORT:
4703 if (ideal_opc == Op_SaturatingAddV) {
4704 evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4705 } else {
4706 assert(ideal_opc == Op_SaturatingSubV, "");
4707 evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4708 }
4709 break;
4710 default:
4711 fatal("Unsupported type %s", type2name(elem_bt));
4712 break;
4713 }
4714 }
4715
4716 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4717 Address src2, bool is_unsigned, bool merge, int vlen_enc) {
4718 if (is_unsigned) {
4719 evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4720 } else {
4721 evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4722 }
4723 }
4724
4725 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4726 XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4727 switch (elem_bt) {
4728 case T_BYTE:
4729 if (ideal_opc == Op_SaturatingAddV) {
4730 evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4731 } else {
4732 assert(ideal_opc == Op_SaturatingSubV, "");
4733 evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4734 }
4735 break;
4736 case T_SHORT:
4737 if (ideal_opc == Op_SaturatingAddV) {
4738 evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4739 } else {
4740 assert(ideal_opc == Op_SaturatingSubV, "");
4741 evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4742 }
4743 break;
4744 default:
4745 fatal("Unsupported type %s", type2name(elem_bt));
4746 break;
4747 }
4748 }
4749
4750 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4751 XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4752 switch (elem_bt) {
4753 case T_BYTE:
4754 if (ideal_opc == Op_SaturatingAddV) {
4755 evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4756 } else {
4757 assert(ideal_opc == Op_SaturatingSubV, "");
4758 evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4759 }
4760 break;
4761 case T_SHORT:
4762 if (ideal_opc == Op_SaturatingAddV) {
4763 evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4764 } else {
4765 assert(ideal_opc == Op_SaturatingSubV, "");
4766 evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4767 }
4768 break;
4769 default:
4770 fatal("Unsupported type %s", type2name(elem_bt));
4771 break;
4772 }
4773 }
4774
4775 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4776 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
4777 bool is_varshift) {
4778 switch (ideal_opc) {
4779 case Op_AddVB:
4780 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4781 case Op_AddVS:
4782 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4783 case Op_AddVI:
4784 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4785 case Op_AddVL:
4786 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4787 case Op_AddVF:
4788 evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4789 case Op_AddVD:
4790 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4791 case Op_SubVB:
4792 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4793 case Op_SubVS:
4794 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4795 case Op_SubVI:
4796 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4797 case Op_SubVL:
4798 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4799 case Op_SubVF:
4800 evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4801 case Op_SubVD:
4802 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4803 case Op_MulVS:
4804 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4805 case Op_MulVI:
4806 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4807 case Op_MulVL:
4808 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4809 case Op_MulVF:
4810 evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4811 case Op_MulVD:
4812 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4813 case Op_DivVF:
4814 evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4815 case Op_DivVD:
4816 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4817 case Op_SqrtVF:
4818 evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
4819 case Op_SqrtVD:
4820 evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
4821 case Op_AbsVB:
4822 evpabsb(dst, mask, src2, merge, vlen_enc); break;
4823 case Op_AbsVS:
4824 evpabsw(dst, mask, src2, merge, vlen_enc); break;
4825 case Op_AbsVI:
4826 evpabsd(dst, mask, src2, merge, vlen_enc); break;
4827 case Op_AbsVL:
4828 evpabsq(dst, mask, src2, merge, vlen_enc); break;
4829 case Op_FmaVF:
4830 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4831 case Op_FmaVD:
4832 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4833 case Op_VectorRearrange:
4834 evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
4835 case Op_LShiftVS:
4836 evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4837 case Op_LShiftVI:
4838 evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4839 case Op_LShiftVL:
4840 evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4841 case Op_RShiftVS:
4842 evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4843 case Op_RShiftVI:
4844 evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4845 case Op_RShiftVL:
4846 evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4847 case Op_URShiftVS:
4848 evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4849 case Op_URShiftVI:
4850 evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4851 case Op_URShiftVL:
4852 evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4853 case Op_RotateLeftV:
4854 evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4855 case Op_RotateRightV:
4856 evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4857 case Op_MaxV:
4858 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4859 case Op_MinV:
4860 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4861 case Op_UMinV:
4862 evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4863 case Op_UMaxV:
4864 evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4865 case Op_XorV:
4866 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4867 case Op_OrV:
4868 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4869 case Op_AndV:
4870 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4871 default:
4872 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]);
4873 break;
4874 }
4875 }
4876
4877 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4878 XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4879 switch (ideal_opc) {
4880 case Op_AddVB:
4881 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4882 case Op_AddVS:
4883 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4884 case Op_AddVI:
4885 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4886 case Op_AddVL:
4887 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4888 case Op_AddVF:
4889 evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4890 case Op_AddVD:
4891 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4892 case Op_SubVB:
4893 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4894 case Op_SubVS:
4895 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4896 case Op_SubVI:
4897 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4898 case Op_SubVL:
4899 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4900 case Op_SubVF:
4901 evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4902 case Op_SubVD:
4903 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4904 case Op_MulVS:
4905 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4906 case Op_MulVI:
4907 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4908 case Op_MulVL:
4909 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4910 case Op_MulVF:
4911 evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4912 case Op_MulVD:
4913 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4914 case Op_DivVF:
4915 evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4916 case Op_DivVD:
4917 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4918 case Op_FmaVF:
4919 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4920 case Op_FmaVD:
4921 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4922 case Op_MaxV:
4923 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4924 case Op_MinV:
4925 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4926 case Op_UMaxV:
4927 evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4928 case Op_UMinV:
4929 evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4930 case Op_XorV:
4931 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4932 case Op_OrV:
4933 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4934 case Op_AndV:
4935 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4936 default:
4937 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]);
4938 break;
4939 }
4940 }
4941
4942 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
4943 KRegister src1, KRegister src2) {
4944 BasicType etype = T_ILLEGAL;
4945 switch(mask_len) {
4946 case 2:
4947 case 4:
4948 case 8: etype = T_BYTE; break;
4949 case 16: etype = T_SHORT; break;
4950 case 32: etype = T_INT; break;
4951 case 64: etype = T_LONG; break;
4952 default: fatal("Unsupported type"); break;
4953 }
4954 assert(etype != T_ILLEGAL, "");
4955 switch(ideal_opc) {
4956 case Op_AndVMask:
4957 kand(etype, dst, src1, src2); break;
4958 case Op_OrVMask:
4959 kor(etype, dst, src1, src2); break;
4960 case Op_XorVMask:
4961 kxor(etype, dst, src1, src2); break;
4962 default:
4963 fatal("Unsupported masked operation"); break;
4964 }
4965 }
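
// Note: the etype computed above only selects the width of the k-register
// operation; kand/kor/kxor dispatch on it to their byte/word/dword/qword
// forms, so e.g. a mask of up to 8 lanes is combined with a byte wide k-op
// while a 64 lane mask needs the quadword form.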
4966
/*
 * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min)
 * for the cast operation.
 * If src is NaN, the result is 0.
 * If src is negative infinity or any value less than or equal to Integer.MIN_VALUE,
 * the result is Integer.MIN_VALUE.
 * If src is positive infinity or any value greater than or equal to Integer.MAX_VALUE,
 * the result is Integer.MAX_VALUE.
 */
4975 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4976 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
4977 Register rscratch, AddressLiteral float_sign_flip,
4978 int vec_enc) {
4979 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4980 Label done;
4981 vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
4982 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
4983 vptest(xtmp2, xtmp2, vec_enc);
4984 jccb(Assembler::equal, done);
4985
4986 vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
4987 vpxor(xtmp1, xtmp1, xtmp4, vec_enc);
4988
4989 vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
4990 vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
4991 vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);
4992
  // Recompute the mask for the remaining special values.
  vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
  // Extract SRC values corresponding to TRUE mask lanes.
  vpand(xtmp4, xtmp2, src, vec_enc);
  // Flip the mask bits so that the MSB of the mask lanes corresponding to
  // positive special values is set.
4999 vpxor(xtmp3, xtmp2, xtmp4, vec_enc);
5000
5001 vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
5002 bind(done);
5003 }
5004
5005 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5006 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5007 Register rscratch, AddressLiteral float_sign_flip,
5008 int vec_enc) {
5009 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5010 Label done;
5011 evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
5012 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
5013 kortestwl(ktmp1, ktmp1);
5014 jccb(Assembler::equal, done);
5015
5016 vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5017 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5018 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
5019
5020 kxorwl(ktmp1, ktmp1, ktmp2);
5021 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5022 vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5023 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
5024 bind(done);
5025 }
5026
5027 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5028 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5029 Register rscratch, AddressLiteral double_sign_flip,
5030 int vec_enc) {
5031 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
5032
5033 Label done;
5034 evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
5035 Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
5036 kortestwl(ktmp1, ktmp1);
5037 jccb(Assembler::equal, done);
5038
5039 vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5040 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5041 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
5042
5043 kxorwl(ktmp1, ktmp1, ktmp2);
5044 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5045 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5046 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
5047 bind(done);
5048 }
5049
5050 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5051 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5052 Register rscratch, AddressLiteral float_sign_flip,
5053 int vec_enc) {
5054 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5055 Label done;
5056 evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
5057 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
5058 kortestwl(ktmp1, ktmp1);
5059 jccb(Assembler::equal, done);
5060
5061 vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5062 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5063 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
5064
5065 kxorwl(ktmp1, ktmp1, ktmp2);
5066 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5067 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5068 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
5069 bind(done);
5070 }
5071
5072 /*
5073 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation.
5074 * If src is NaN, the result is 0.
5075 * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
5076 * the result is equal to the value of Long.MIN_VALUE.
5077 * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
5078 * the result is equal to the value of Long.MAX_VALUE.
5079 */
5080 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5081 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5082 Register rscratch, AddressLiteral double_sign_flip,
5083 int vec_enc) {
5084 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
5085
5086 Label done;
5087 evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
5088 evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
5089 kortestwl(ktmp1, ktmp1);
5090 jccb(Assembler::equal, done);
5091
5092 vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5093 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5094 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
5095
5096 kxorwl(ktmp1, ktmp1, ktmp2);
5097 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5098 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5099 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
5100 bind(done);
5101 }
5102
5103 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
5104 XMMRegister xtmp, int index, int vec_enc) {
5105 assert(vec_enc < Assembler::AVX_512bit, "");
5106 if (vec_enc == Assembler::AVX_256bit) {
5107 vextractf128_high(xtmp, src);
5108 vshufps(dst, src, xtmp, index, vec_enc);
5109 } else {
5110 vshufps(dst, src, zero, index, vec_enc);
5111 }
5112 }
5113
5114 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
5115 XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
5116 AddressLiteral float_sign_flip, int src_vec_enc) {
5117 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5118
5119 Label done;
5120 // Compare the destination lanes with float_sign_flip
5121 // value to get mask for all special values.
5122 movdqu(xtmp1, float_sign_flip, rscratch);
5123 vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
5124 ptest(xtmp2, xtmp2);
5125 jccb(Assembler::equal, done);
5126
5127 // Flip float_sign_flip to get max integer value.
5128 vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
5129 pxor(xtmp1, xtmp4);
5130
  // Set destination lanes corresponding to unordered source lanes to zero.
5132 vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
5133 vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);
5134
  // Shuffle the mask vector and pack the lower double word from each quadword lane.
5136 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5137 vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);
5138
  // Recompute the mask for the remaining special values.
5140 pxor(xtmp2, xtmp3);
5141 // Extract mask corresponding to non-negative source lanes.
5142 vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);
5143
  // Shuffle the mask vector and pack the lower double word from each quadword lane.
5145 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5146 pand(xtmp3, xtmp2);
5147
  // Replace destination lanes holding the special value (0x80000000) with max int
  // if the corresponding source lane holds a positive value.
5150 vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
5151 bind(done);
5152 }
5153
5155 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
5156 XMMRegister xtmp, Register rscratch, int vec_enc) {
5157 switch(to_elem_bt) {
5158 case T_SHORT:
5159 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
5160 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
5161 vpackusdw(dst, dst, zero, vec_enc);
5162 if (vec_enc == Assembler::AVX_256bit) {
5163 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5164 }
5165 break;
5166 case T_BYTE:
5167 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
5168 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
5169 vpackusdw(dst, dst, zero, vec_enc);
5170 if (vec_enc == Assembler::AVX_256bit) {
5171 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5172 }
5173 vpackuswb(dst, dst, zero, vec_enc);
5174 break;
5175 default: assert(false, "Unexpected basic type for target of vector cast int to subword: %s", type2name(to_elem_bt));
5176 }
5177 }
5178
/*
 * Algorithm for vector D2L and F2I conversions (when AVX10.2 is not supported):-
 * a) Perform the vector D2L/F2I cast.
 * b) Take the fast path if no result vector lane contains the 0x80000000 value;
 *    a lane holding 0x80000000 signifies that the source value could be one of
 *    the special floating point values (NaN, -Inf, Inf, Max, -Min).
 * c) Set the destination lane to zero if the source lane is NaN.
 * d) Replace 0x80000000 with MaxInt if the source lane contains a positive value.
 */
5188
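// For illustration, an F2I lane-wise walkthrough of the steps above
// (values chosen for exposition):
//   src              = { NaN,        1e20f,      -1e20f,     42.5f }
//   after vcvttps2dq = { 0x80000000, 0x80000000, 0x80000000, 42    }
//   special handling = { 0,          MAX_INT,    MIN_INT,    42    }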
5189 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5190 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
5191 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5192 int to_elem_sz = type2aelembytes(to_elem_bt);
5193 assert(to_elem_sz <= 4, "");
5194 vcvttps2dq(dst, src, vec_enc);
5195 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc);
5196 if (to_elem_sz < 4) {
5197 vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
5198 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc);
5199 }
5200 }
5201
5202 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5203 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
5204 Register rscratch, int vec_enc) {
5205 int to_elem_sz = type2aelembytes(to_elem_bt);
5206 assert(to_elem_sz <= 4, "");
5207 vcvttps2dq(dst, src, vec_enc);
5208 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc);
5209 switch(to_elem_bt) {
5210 case T_INT:
5211 break;
5212 case T_SHORT:
5213 evpmovdw(dst, dst, vec_enc);
5214 break;
5215 case T_BYTE:
5216 evpmovdb(dst, dst, vec_enc);
5217 break;
5218 default: assert(false, "Unexpected basic type for target of vector castF2X EVEX: %s", type2name(to_elem_bt));
5219 }
5220 }
5221
5222 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
5223 KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
5224 Register rscratch, int vec_enc) {
5225 evcvttps2qq(dst, src, vec_enc);
5226 vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc);
5227 }
5228
5229 // Handling for downcasting from double to integer or sub-word types on AVX2.
5230 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5231 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5,
5232 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5233 int to_elem_sz = type2aelembytes(to_elem_bt);
5234 assert(to_elem_sz < 8, "");
5235 vcvttpd2dq(dst, src, vec_enc);
5236 vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch,
5237 float_sign_flip, vec_enc);
5238 if (to_elem_sz < 4) {
5239 // xtmp4 holds all zero lanes.
5240 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit);
5241 }
5242 }
5243
5244 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src,
5245 XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1,
5246 KRegister ktmp2, AddressLiteral sign_flip,
5247 Register rscratch, int vec_enc) {
5248 if (VM_Version::supports_avx512dq()) {
5249 evcvttpd2qq(dst, src, vec_enc);
5250 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5251 switch(to_elem_bt) {
5252 case T_LONG:
5253 break;
5254 case T_INT:
5255 evpmovsqd(dst, dst, vec_enc);
5256 break;
5257 case T_SHORT:
5258 evpmovsqd(dst, dst, vec_enc);
5259 evpmovdw(dst, dst, vec_enc);
5260 break;
5261 case T_BYTE:
5262 evpmovsqd(dst, dst, vec_enc);
5263 evpmovdb(dst, dst, vec_enc);
5264 break;
5265 default: assert(false, "Unexpected basic type for target of vector castD2X AVX512DQ EVEX: %s", type2name(to_elem_bt));
5266 }
5267 } else {
5268 assert(type2aelembytes(to_elem_bt) <= 4, "");
5269 vcvttpd2dq(dst, src, vec_enc);
5270 vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5271 switch(to_elem_bt) {
5272 case T_INT:
5273 break;
5274 case T_SHORT:
5275 evpmovdw(dst, dst, vec_enc);
5276 break;
5277 case T_BYTE:
5278 evpmovdb(dst, dst, vec_enc);
5279 break;
5280 default: assert(false, "Unexpected basic type for target of vector castD2X EVEX: %s", type2name(to_elem_bt));
5281 }
5282 }
5283 }
5284
5285 void C2_MacroAssembler::vector_castF2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5286 switch(to_elem_bt) {
5287 case T_LONG:
5288 evcvttps2qqs(dst, src, vec_enc);
5289 break;
5290 case T_INT:
5291 evcvttps2dqs(dst, src, vec_enc);
5292 break;
5293 case T_SHORT:
5294 evcvttps2dqs(dst, src, vec_enc);
5295 evpmovdw(dst, dst, vec_enc);
5296 break;
5297 case T_BYTE:
5298 evcvttps2dqs(dst, src, vec_enc);
5299 evpmovdb(dst, dst, vec_enc);
5300 break;
5301 default: assert(false, "Unexpected basic type for target of vector castF2X AVX10 (reg src): %s", type2name(to_elem_bt));
5302 }
5303 }
5304
5305 void C2_MacroAssembler::vector_castF2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, Address src, int vec_enc) {
5306 switch(to_elem_bt) {
5307 case T_LONG:
5308 evcvttps2qqs(dst, src, vec_enc);
5309 break;
5310 case T_INT:
5311 evcvttps2dqs(dst, src, vec_enc);
5312 break;
5313 case T_SHORT:
5314 evcvttps2dqs(dst, src, vec_enc);
5315 evpmovdw(dst, dst, vec_enc);
5316 break;
5317 case T_BYTE:
5318 evcvttps2dqs(dst, src, vec_enc);
5319 evpmovdb(dst, dst, vec_enc);
5320 break;
5321 default: assert(false, "Unexpected basic type for target of vector castF2X AVX10 (mem src): %s", type2name(to_elem_bt));
5322 }
5323 }
5324
5325 void C2_MacroAssembler::vector_castD2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5326 switch(to_elem_bt) {
5327 case T_LONG:
5328 evcvttpd2qqs(dst, src, vec_enc);
5329 break;
5330 case T_INT:
5331 evcvttpd2dqs(dst, src, vec_enc);
5332 break;
5333 case T_SHORT:
5334 evcvttpd2dqs(dst, src, vec_enc);
5335 evpmovdw(dst, dst, vec_enc);
5336 break;
5337 case T_BYTE:
5338 evcvttpd2dqs(dst, src, vec_enc);
5339 evpmovdb(dst, dst, vec_enc);
5340 break;
5341 default: assert(false, "Unexpected basic type for target of vector castD2X AVX10 (reg src): %s", type2name(to_elem_bt));
5342 }
5343 }
5344
5345 void C2_MacroAssembler::vector_castD2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, Address src, int vec_enc) {
5346 switch(to_elem_bt) {
5347 case T_LONG:
5348 evcvttpd2qqs(dst, src, vec_enc);
5349 break;
5350 case T_INT:
5351 evcvttpd2dqs(dst, src, vec_enc);
5352 break;
5353 case T_SHORT:
5354 evcvttpd2dqs(dst, src, vec_enc);
5355 evpmovdw(dst, dst, vec_enc);
5356 break;
5357 case T_BYTE:
5358 evcvttpd2dqs(dst, src, vec_enc);
5359 evpmovdb(dst, dst, vec_enc);
5360 break;
5361 default: assert(false, "Unexpected basic type for target of vector castD2X AVX10 (mem src): %s", type2name(to_elem_bt));
5362 }
5363 }
5364
5365 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
5366 AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5367 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
  // Perform the floor(val+0.5) operation with MXCSR.RC mode set to round towards -inf,
  // and restore the original MXCSR.RC mode after that.
5370 ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5371
5372 mov64(tmp, julong_cast(0.5L));
5373 evpbroadcastq(xtmp1, tmp, vec_enc);
5374 vaddpd(xtmp1, src , xtmp1, vec_enc);
5375 evcvtpd2qq(dst, xtmp1, vec_enc);
5376 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
                                                double_sign_flip, vec_enc);
5378
5379 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5380 }
5381
5382 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
5383 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5384 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
  // Perform the floor(val+0.5) operation with MXCSR.RC mode set to round towards -inf,
  // and restore the original MXCSR.RC mode after that.
5387 ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5388
5389 movl(tmp, jint_cast(0.5));
5390 movq(xtmp1, tmp);
5391 vbroadcastss(xtmp1, xtmp1, vec_enc);
5392 vaddps(xtmp1, src , xtmp1, vec_enc);
5393 vcvtps2dq(dst, xtmp1, vec_enc);
5394 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5395 float_sign_flip, vec_enc);
5396
5397 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5398 }
5399
5400 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
5401 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5402 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
  // Perform the floor(val+0.5) operation with MXCSR.RC mode set to round towards -inf,
  // and restore the original MXCSR.RC mode after that.
5405 ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5406
5407 movl(tmp, jint_cast(0.5));
5408 movq(xtmp1, tmp);
5409 vbroadcastss(xtmp1, xtmp1, vec_enc);
5410 vaddps(xtmp1, src , xtmp1, vec_enc);
5411 vcvtps2dq(dst, xtmp1, vec_enc);
5412 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc);
5413
5414 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5415 }
5416
5417 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5418 BasicType from_elem_bt, BasicType to_elem_bt) {
5419 switch (from_elem_bt) {
5420 case T_BYTE:
5421 switch (to_elem_bt) {
5422 case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
5423 case T_INT: vpmovzxbd(dst, src, vlen_enc); break;
5424 case T_LONG: vpmovzxbq(dst, src, vlen_enc); break;
5425 default: ShouldNotReachHere();
5426 }
5427 break;
5428 case T_SHORT:
5429 switch (to_elem_bt) {
5430 case T_INT: vpmovzxwd(dst, src, vlen_enc); break;
5431 case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
5432 default: ShouldNotReachHere();
5433 }
5434 break;
5435 case T_INT:
5436 assert(to_elem_bt == T_LONG, "");
5437 vpmovzxdq(dst, src, vlen_enc);
5438 break;
5439 default:
5440 ShouldNotReachHere();
5441 }
5442 }
5443
5444 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5445 BasicType from_elem_bt, BasicType to_elem_bt) {
5446 switch (from_elem_bt) {
5447 case T_BYTE:
5448 switch (to_elem_bt) {
5449 case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break;
5450 case T_INT: vpmovsxbd(dst, src, vlen_enc); break;
5451 case T_LONG: vpmovsxbq(dst, src, vlen_enc); break;
5452 default: ShouldNotReachHere();
5453 }
5454 break;
5455 case T_SHORT:
5456 switch (to_elem_bt) {
5457 case T_INT: vpmovsxwd(dst, src, vlen_enc); break;
5458 case T_LONG: vpmovsxwq(dst, src, vlen_enc); break;
5459 default: ShouldNotReachHere();
5460 }
5461 break;
5462 case T_INT:
5463 assert(to_elem_bt == T_LONG, "");
5464 vpmovsxdq(dst, src, vlen_enc);
5465 break;
5466 default:
5467 ShouldNotReachHere();
5468 }
5469 }
5470
5471 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src,
5472 BasicType dst_bt, BasicType src_bt, int vlen) {
5473 int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen);
5474 assert(vlen_enc != AVX_512bit, "");
5475
5476 int dst_bt_size = type2aelembytes(dst_bt);
5477 int src_bt_size = type2aelembytes(src_bt);
5478 if (dst_bt_size > src_bt_size) {
5479 switch (dst_bt_size / src_bt_size) {
5480 case 2: vpmovsxbw(dst, src, vlen_enc); break;
5481 case 4: vpmovsxbd(dst, src, vlen_enc); break;
5482 case 8: vpmovsxbq(dst, src, vlen_enc); break;
5483 default: ShouldNotReachHere();
5484 }
5485 } else {
5486 assert(dst_bt_size < src_bt_size, "");
5487 switch (src_bt_size / dst_bt_size) {
5488 case 2: {
5489 if (vlen_enc == AVX_128bit) {
5490 vpacksswb(dst, src, src, vlen_enc);
5491 } else {
5492 vpacksswb(dst, src, src, vlen_enc);
5493 vpermq(dst, dst, 0x08, vlen_enc);
5494 }
5495 break;
5496 }
5497 case 4: {
5498 if (vlen_enc == AVX_128bit) {
5499 vpackssdw(dst, src, src, vlen_enc);
5500 vpacksswb(dst, dst, dst, vlen_enc);
5501 } else {
5502 vpackssdw(dst, src, src, vlen_enc);
5503 vpermq(dst, dst, 0x08, vlen_enc);
5504 vpacksswb(dst, dst, dst, AVX_128bit);
5505 }
5506 break;
5507 }
5508 case 8: {
5509 if (vlen_enc == AVX_128bit) {
5510 vpshufd(dst, src, 0x08, vlen_enc);
5511 vpackssdw(dst, dst, dst, vlen_enc);
5512 vpacksswb(dst, dst, dst, vlen_enc);
5513 } else {
5514 vpshufd(dst, src, 0x08, vlen_enc);
5515 vpermq(dst, dst, 0x08, vlen_enc);
5516 vpackssdw(dst, dst, dst, AVX_128bit);
5517 vpacksswb(dst, dst, dst, AVX_128bit);
5518 }
5519 break;
5520 }
5521 default: ShouldNotReachHere();
5522 }
5523 }
5524 }
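
// For illustration: casting a long mask down to a byte mask narrows each lane
// by a ratio of 8 (the case 8 branch above). Mask lanes only hold 0 or -1, and
// the signed pack instructions saturate -1 -> -1 and 0 -> 0, so e.g. a 256 bit
// long mask {-1, 0, -1, -1} becomes the byte mask {-1, 0, -1, -1} compacted
// into the low 4 bytes by the shuffle/permute/pack sequence.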
5525
5526 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
5527 bool merge, BasicType bt, int vlen_enc) {
5528 if (bt == T_INT) {
5529 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5530 } else {
5531 assert(bt == T_LONG, "");
5532 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5533 }
5534 }
5535
5536 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
5537 bool merge, BasicType bt, int vlen_enc) {
5538 if (bt == T_INT) {
5539 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5540 } else {
5541 assert(bt == T_LONG, "");
5542 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5543 }
5544 }
5545
5546 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
5547 Register rtmp2, XMMRegister xtmp, int mask_len,
5548 int vec_enc) {
5549 int index = 0;
5550 int vindex = 0;
5551 mov64(rtmp1, 0x0101010101010101L);
5552 pdepq(rtmp1, src, rtmp1);
5553 if (mask_len > 8) {
5554 movq(rtmp2, src);
5555 vpxor(xtmp, xtmp, xtmp, vec_enc);
5556 movq(xtmp, rtmp1);
5557 }
5558 movq(dst, rtmp1);
5559
5560 mask_len -= 8;
5561 while (mask_len > 0) {
    assert((mask_len & 0x7) == 0, "mask must be multiple of 8");
5563 index++;
5564 if ((index % 2) == 0) {
5565 pxor(xtmp, xtmp);
5566 }
5567 mov64(rtmp1, 0x0101010101010101L);
5568 shrq(rtmp2, 8);
5569 pdepq(rtmp1, rtmp2, rtmp1);
5570 pinsrq(xtmp, rtmp1, index % 2);
5571 vindex = index / 2;
5572 if (vindex) {
      // Write the entire 16 byte vector once both 64 bit
      // lanes are updated, to save redundant instructions.
5575 if (index % 2) {
5576 vinsertf128(dst, dst, xtmp, vindex);
5577 }
5578 } else {
5579 vmovdqu(dst, xtmp);
5580 }
5581 mask_len -= 8;
5582 }
5583 }
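
// For illustration: pdepq with the 0x0101010101010101 selector deposits the
// low 8 bits of the mask into the least significant bit of eight consecutive
// byte lanes, e.g. a mask of 0b00001011 expands to the 64 bit lane value
// 0x0000000001000101 (bytes 0, 1 and 3 hold 0x01, the others hold 0x00).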
5584
5585 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
5586 switch(opc) {
5587 case Op_VectorMaskTrueCount:
5588 popcntq(dst, tmp);
5589 break;
5590 case Op_VectorMaskLastTrue:
5591 if (VM_Version::supports_lzcnt()) {
5592 lzcntq(tmp, tmp);
5593 movl(dst, 63);
5594 subl(dst, tmp);
5595 } else {
5596 movl(dst, -1);
5597 bsrq(tmp, tmp);
5598 cmov32(Assembler::notZero, dst, tmp);
5599 }
5600 break;
5601 case Op_VectorMaskFirstTrue:
5602 if (VM_Version::supports_bmi1()) {
5603 if (masklen < 32) {
5604 orl(tmp, 1 << masklen);
5605 tzcntl(dst, tmp);
5606 } else if (masklen == 32) {
5607 tzcntl(dst, tmp);
5608 } else {
5609 assert(masklen == 64, "");
5610 tzcntq(dst, tmp);
5611 }
5612 } else {
5613 if (masklen < 32) {
5614 orl(tmp, 1 << masklen);
5615 bsfl(dst, tmp);
5616 } else {
5617 assert(masklen == 32 || masklen == 64, "");
5618 movl(dst, masklen);
5619 if (masklen == 32) {
5620 bsfl(tmp, tmp);
5621 } else {
5622 bsfq(tmp, tmp);
5623 }
5624 cmov32(Assembler::notZero, dst, tmp);
5625 }
5626 }
5627 break;
5628 case Op_VectorMaskToLong:
5629 assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
5630 break;
5631 default: assert(false, "Unhandled mask operation");
5632 }
5633 }
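
// Note on Op_VectorMaskFirstTrue above: for masklen < 32 a sentinel bit is set
// at position masklen before counting, so tzcnt/bsf return masklen when no
// mask bit is set, matching the Vector API convention of returning the vector
// length for an all-false mask. E.g. with masklen = 8 and tmp = 0: after
// orl(tmp, 1 << 8), tzcntl yields 8.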
5634
5635 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
5636 int masklen, int masksize, int vec_enc) {
5637 assert(VM_Version::supports_popcnt(), "");
5638
  if (VM_Version::supports_avx512bw()) {
5640 kmovql(tmp, mask);
5641 } else {
5642 assert(masklen <= 16, "");
5643 kmovwl(tmp, mask);
5644 }
5645
5646 // Mask generated out of partial vector comparisons/replicate/mask manipulation
5647 // operations needs to be clipped.
5648 if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
5649 andq(tmp, (1 << masklen) - 1);
5650 }
5651
5652 vector_mask_operation_helper(opc, dst, tmp, masklen);
5653 }
5654
5655 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
5656 Register tmp, int masklen, BasicType bt, int vec_enc) {
5657 assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) ||
5658 (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), "");
5659 assert(VM_Version::supports_popcnt(), "");
5660
5661 bool need_clip = false;
5662 switch(bt) {
5663 case T_BOOLEAN:
      // While masks of other types contain lane values of 0 / -1, boolean masks contain lane values of 0 / 1.
5665 vpxor(xtmp, xtmp, xtmp, vec_enc);
5666 vpsubb(xtmp, xtmp, mask, vec_enc);
5667 vpmovmskb(tmp, xtmp, vec_enc);
5668 need_clip = masklen < 16;
5669 break;
5670 case T_BYTE:
5671 vpmovmskb(tmp, mask, vec_enc);
5672 need_clip = masklen < 16;
5673 break;
5674 case T_SHORT:
5675 vpacksswb(xtmp, mask, mask, vec_enc);
5676 if (masklen >= 16) {
5677 vpermpd(xtmp, xtmp, 8, vec_enc);
5678 }
5679 vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
5680 need_clip = masklen < 16;
5681 break;
5682 case T_INT:
5683 case T_FLOAT:
5684 vmovmskps(tmp, mask, vec_enc);
5685 need_clip = masklen < 4;
5686 break;
5687 case T_LONG:
5688 case T_DOUBLE:
5689 vmovmskpd(tmp, mask, vec_enc);
5690 need_clip = masklen < 2;
5691 break;
5692 default: assert(false, "Unhandled type, %s", type2name(bt));
5693 }
5694
5695 // Mask generated out of partial vector comparisons/replicate/mask manipulation
5696 // operations needs to be clipped.
5697 if (need_clip && opc != Op_VectorMaskFirstTrue) {
5698 // need_clip implies masklen < 32
5699 andq(tmp, (1 << masklen) - 1);
5700 }
5701
5702 vector_mask_operation_helper(opc, dst, tmp, masklen);
5703 }
5704
5705 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
5706 Register rtmp2, int mask_len) {
5707 kmov(rtmp1, src);
5708 andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
5709 mov64(rtmp2, -1L);
5710 pextq(rtmp2, rtmp2, rtmp1);
5711 kmov(dst, rtmp2);
5712 }
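
// For illustration: pextq gathers the bits of the all-ones source at the
// positions selected by the clipped mask and packs them towards bit 0, so the
// result always has exactly popcount(mask) contiguous low bits set. E.g. a
// mask of 0b1010 compresses to 0b0011, i.e. the true lanes move down to the
// lowest lane positions.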
5713
5714 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src,
5715 XMMRegister mask, Register rtmp, Register rscratch,
5716 XMMRegister permv, XMMRegister xtmp, BasicType bt,
5717 int vec_enc) {
5718 assert(type2aelembytes(bt) >= 4, "");
5719 assert(opcode == Op_CompressV || opcode == Op_ExpandV, "");
5720 address compress_perm_table = nullptr;
5721 address expand_perm_table = nullptr;
5722 if (type2aelembytes(bt) == 8) {
5723 compress_perm_table = StubRoutines::x86::compress_perm_table64();
5724 expand_perm_table = StubRoutines::x86::expand_perm_table64();
5725 vmovmskpd(rtmp, mask, vec_enc);
5726 } else {
5727 compress_perm_table = StubRoutines::x86::compress_perm_table32();
5728 expand_perm_table = StubRoutines::x86::expand_perm_table32();
5729 vmovmskps(rtmp, mask, vec_enc);
5730 }
5731 shlq(rtmp, 5); // for 32 byte permute row.
5732 if (opcode == Op_CompressV) {
5733 lea(rscratch, ExternalAddress(compress_perm_table));
5734 } else {
5735 lea(rscratch, ExternalAddress(expand_perm_table));
5736 }
5737 addptr(rtmp, rscratch);
5738 vmovdqu(permv, Address(rtmp));
5739 vpermps(dst, permv, src, Assembler::AVX_256bit);
5740 vpxor(xtmp, xtmp, xtmp, vec_enc);
  // Blend the result with the zero vector using the permute mask. Each column
  // entry in a permute table row contains either a valid permute index or a -1
  // (default) value; the latter can be used as a blending mask after
  // compressing/expanding the source vector lanes.
5745 vblendvps(dst, dst, xtmp, permv, vec_enc, true, permv);
5746 }
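
// For illustration (assumed table layout; see the stub generator for the
// authoritative contents): for a 32 bit compress with mask 0b0101, the permute
// row would hold indices {0, 2, -1, -1, ...}, gathering lanes 0 and 2 into
// positions 0 and 1, while the -1 entries, whose sign bit is set, select the
// zero vector in the final blend.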
5747
5748 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
5749 bool merge, BasicType bt, int vec_enc) {
5750 if (opcode == Op_CompressV) {
5751 switch(bt) {
5752 case T_BYTE:
5753 evpcompressb(dst, mask, src, merge, vec_enc);
5754 break;
5755 case T_CHAR:
5756 case T_SHORT:
5757 evpcompressw(dst, mask, src, merge, vec_enc);
5758 break;
5759 case T_INT:
5760 evpcompressd(dst, mask, src, merge, vec_enc);
5761 break;
5762 case T_FLOAT:
5763 evcompressps(dst, mask, src, merge, vec_enc);
5764 break;
5765 case T_LONG:
5766 evpcompressq(dst, mask, src, merge, vec_enc);
5767 break;
5768 case T_DOUBLE:
5769 evcompresspd(dst, mask, src, merge, vec_enc);
5770 break;
5771 default:
5772 fatal("Unsupported type %s", type2name(bt));
5773 break;
5774 }
5775 } else {
5776 assert(opcode == Op_ExpandV, "");
5777 switch(bt) {
5778 case T_BYTE:
5779 evpexpandb(dst, mask, src, merge, vec_enc);
5780 break;
5781 case T_CHAR:
5782 case T_SHORT:
5783 evpexpandw(dst, mask, src, merge, vec_enc);
5784 break;
5785 case T_INT:
5786 evpexpandd(dst, mask, src, merge, vec_enc);
5787 break;
5788 case T_FLOAT:
5789 evexpandps(dst, mask, src, merge, vec_enc);
5790 break;
5791 case T_LONG:
5792 evpexpandq(dst, mask, src, merge, vec_enc);
5793 break;
5794 case T_DOUBLE:
5795 evexpandpd(dst, mask, src, merge, vec_enc);
5796 break;
5797 default:
5798 fatal("Unsupported type %s", type2name(bt));
5799 break;
5800 }
5801 }
5802 }
5803
5804 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5805 KRegister ktmp1, int vec_enc) {
5806 if (opcode == Op_SignumVD) {
5807 vsubpd(dst, zero, one, vec_enc);
5808 // if src < 0 ? -1 : 1
5809 evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5810 evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
5811 // if src == NaN, -0.0 or 0.0 return src.
5812 evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5813 evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
5814 } else {
5815 assert(opcode == Op_SignumVF, "");
5816 vsubps(dst, zero, one, vec_enc);
5817 // if src < 0 ? -1 : 1
5818 evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5819 evblendmps(dst, ktmp1, one, dst, true, vec_enc);
5820 // if src == NaN, -0.0 or 0.0 return src.
5821 evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5822 evblendmps(dst, ktmp1, dst, src, true, vec_enc);
5823 }
5824 }
5825
5826 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5827 XMMRegister xtmp1, int vec_enc) {
5828 if (opcode == Op_SignumVD) {
5829 vsubpd(dst, zero, one, vec_enc);
5830 // if src < 0 ? -1 : 1
5831 vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1);
5832 // if src == NaN, -0.0 or 0.0 return src.
5833 vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5834 vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5835 } else {
5836 assert(opcode == Op_SignumVF, "");
5837 vsubps(dst, zero, one, vec_enc);
5838 // if src < 0 ? -1 : 1
5839 vblendvps(dst, one, dst, src, vec_enc, true, xtmp1);
5840 // if src == NaN, -0.0 or 0.0 return src.
5841 vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5842 vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5843 }
5844 }
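
// Note on the AVX signum sequences above: vblendvps/vblendvpd select on the
// sign bit of the mask operand, so blending {1.0} against {-1.0} with src as
// the mask yields -1.0 exactly for the lanes where src is negative. The second
// blend then restores src itself for the lanes where src compared EQ_UQ to
// zero, which holds for +0.0, -0.0 and NaN.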
5845
5846 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
5847 if (VM_Version::supports_avx512bw()) {
5848 if (mask_len > 32) {
5849 kmovql(dst, src);
5850 } else {
5851 kmovdl(dst, src);
5852 if (mask_len != 32) {
5853 kshiftrdl(dst, dst, 32 - mask_len);
5854 }
5855 }
5856 } else {
5857 assert(mask_len <= 16, "");
5858 kmovwl(dst, src);
5859 if (mask_len != 16) {
5860 kshiftrwl(dst, dst, 16 - mask_len);
5861 }
5862 }
5863 }
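
// For illustration (assuming src holds the broadcast boolean as all ones or
// all zeros): for mask_len = 8 on AVX512BW, kmovdl loads 0xFFFFFFFF into the
// k-register and kshiftrdl by 24 leaves 0xFF, an 8 lane all-true mask.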
5864
5865 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5866 int lane_size = type2aelembytes(bt);
5867 if ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5868 (is_subword_type(bt) && VM_Version::supports_avx512vlbw())) {
5869 movptr(rtmp, imm32);
5870 switch(lane_size) {
5871 case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5872 case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5873 case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5874 case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
      default : fatal("Unsupported lane size %d", lane_size); break;
5877 }
5878 } else {
5879 movptr(rtmp, imm32);
5880 movq(dst, rtmp);
5881 switch(lane_size) {
5882 case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5883 case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5884 case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5885 case 8 : vpbroadcastq(dst, dst, vec_enc); break;
      default : fatal("Unsupported lane size %d", lane_size); break;
5888 }
5889 }
5890 }
5891
//
// The following is a lookup table based popcount computation algorithm:-
//            Index   Bit set count
//     [ 0000 ->  0,
//       0001 ->  1,
//       0010 ->  1,
//       0011 ->  2,
//       0100 ->  1,
//       0101 ->  2,
//       0110 ->  2,
//       0111 ->  3,
//       1000 ->  1,
//       1001 ->  2,
//       1010 ->  2,
//       1011 ->  3,
//       1100 ->  2,
//       1101 ->  3,
//       1110 ->  3,
//       1111 ->  4 ]
// a. Count the number of 1s in the 4 LSB bits of each byte. These bits are used
//    as shuffle indices for lookup table access.
// b. Right shift each byte of the vector lane by 4 positions.
// c. Count the number of 1s in the 4 MSB bits of each byte. These bits are used
//    as shuffle indices for lookup table access.
// d. Add the bitset counts of the upper and lower 4 bits of each byte.
// e. Unpack double words to quad words and compute the sum of absolute
//    differences against zero, which yields the bitset count of all the bytes
//    of each quadword.
// f. Perform step e. for the upper 128 bit vector lane.
// g. Pack the bitset counts of the quadwords back to double words.
// h. The unpacking and packing operations are not needed for 64 bit vector lanes.
5921
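// For illustration, the per-byte step for the src byte 0xB5 (0b10110101):
//   lower nibble 0101 -> LUT[5]  = 2
//   upper nibble 1011 -> LUT[11] = 3
//   vpaddb:             2 + 3    = 5 == popcount(0xB5)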
5922 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5923 XMMRegister xtmp2, Register rtmp, int vec_enc) {
5924 assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
5925 vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
5926 vpsrlw(dst, src, 4, vec_enc);
5927 vpand(dst, dst, xtmp1, vec_enc);
5928 vpand(xtmp1, src, xtmp1, vec_enc);
5929 vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
5930 vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);
5931 vpshufb(dst, xtmp2, dst, vec_enc);
5932 vpaddb(dst, dst, xtmp1, vec_enc);
5933 }
5934
5935 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5936 XMMRegister xtmp2, Register rtmp, int vec_enc) {
5937 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5938 // Following code is as per steps e,f,g and h of above algorithm.
5939 vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5940 vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
5941 vpsadbw(dst, dst, xtmp2, vec_enc);
5942 vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
5943 vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
5944 vpackuswb(dst, xtmp1, dst, vec_enc);
5945 }
5946
5947 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5948 XMMRegister xtmp2, Register rtmp, int vec_enc) {
5949 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5950 // Add the popcount of upper and lower bytes of word.
5951 vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
5952 vpsrlw(dst, xtmp1, 8, vec_enc);
5953 vpand(xtmp1, xtmp1, xtmp2, vec_enc);
5954 vpaddw(dst, dst, xtmp1, vec_enc);
5955 }
5956
5957 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5958 XMMRegister xtmp2, Register rtmp, int vec_enc) {
5959 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5960 vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5961 vpsadbw(dst, xtmp1, xtmp2, vec_enc);
5962 }
5963
5964 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5965 XMMRegister xtmp2, Register rtmp, int vec_enc) {
5966 switch(bt) {
5967 case T_LONG:
5968 vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5969 break;
5970 case T_INT:
5971 vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5972 break;
5973 case T_CHAR:
5974 case T_SHORT:
5975 vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5976 break;
5977 case T_BYTE:
5978 case T_BOOLEAN:
5979 vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5980 break;
5981 default:
5982 fatal("Unsupported type %s", type2name(bt));
5983 break;
5984 }
5985 }
5986
5987 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5988 KRegister mask, bool merge, int vec_enc) {
5989 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5990 switch(bt) {
5991 case T_LONG:
5992 assert(VM_Version::supports_avx512_vpopcntdq(), "");
5993 evpopcntq(dst, mask, src, merge, vec_enc);
5994 break;
5995 case T_INT:
5996 assert(VM_Version::supports_avx512_vpopcntdq(), "");
5997 evpopcntd(dst, mask, src, merge, vec_enc);
5998 break;
5999 case T_CHAR:
6000 case T_SHORT:
6001 assert(VM_Version::supports_avx512_bitalg(), "");
6002 evpopcntw(dst, mask, src, merge, vec_enc);
6003 break;
6004 case T_BYTE:
6005 case T_BOOLEAN:
6006 assert(VM_Version::supports_avx512_bitalg(), "");
6007 evpopcntb(dst, mask, src, merge, vec_enc);
6008 break;
6009 default:
6010 fatal("Unsupported type %s", type2name(bt));
6011 break;
6012 }
6013 }
6014
// The bit reversal algorithm first reverses the bits of each byte and then
// performs a byte level reversal for multi-byte primitive types (short/int/long).
// The algorithm uses a lookup table access to get the reverse bit sequence
// corresponding to a 4 bit value. Thus the reverse bit sequence for a byte
// is obtained by swapping the reversed upper and lower nibbles of the byte.
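// For illustration, reversing the byte 0xD2 (0b11010010):
//   lower nibble 0010 -> LUT reverse 0100, shifted left by 4 -> 0b01000000
//   upper nibble 1101 -> LUT reverse 1011, placed low        -> 0b00001011
//   OR of the two halves                                     -> 0b01001011 == 0x4B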
6021 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6022 XMMRegister xtmp2, Register rtmp, int vec_enc) {
6023 if (VM_Version::supports_avx512vlbw()) {
6024
6025 // Get the reverse bit sequence of lower nibble of each byte.
6026 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
6027 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
6028 evpandq(dst, xtmp2, src, vec_enc);
6029 vpshufb(dst, xtmp1, dst, vec_enc);
6030 vpsllq(dst, dst, 4, vec_enc);
6031
6032 // Get the reverse bit sequence of upper nibble of each byte.
6033 vpandn(xtmp2, xtmp2, src, vec_enc);
6034 vpsrlq(xtmp2, xtmp2, 4, vec_enc);
6035 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
6036
    // OR the left shifted reverse bit sequence of the lower nibble with the right
    // shifted reverse bit sequence of the upper nibble to obtain the reverse bit
    // sequence of each byte.
6039 evporq(xtmp2, dst, xtmp2, vec_enc);
6040 vector_reverse_byte(bt, dst, xtmp2, vec_enc);
6041
  } else if (vec_enc == Assembler::AVX_512bit) {
6043 // Shift based bit reversal.
6044 assert(bt == T_LONG || bt == T_INT, "");
6045
6046 // Swap lower and upper nibble of each byte.
6047 vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);
6048
6049 // Swap two least and most significant bits of each nibble.
6050 vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);
6051
6052 // Swap adjacent pair of bits.
6053 evmovdqul(xtmp1, k0, dst, true, vec_enc);
6054 vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);
6055
6056 evmovdqul(xtmp1, k0, dst, true, vec_enc);
6057 vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
6058 } else {
6059 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
6060 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
6061
6062 // Get the reverse bit sequence of lower nibble of each byte.
6063 vpand(dst, xtmp2, src, vec_enc);
6064 vpshufb(dst, xtmp1, dst, vec_enc);
6065 vpsllq(dst, dst, 4, vec_enc);
6066
6067 // Get the reverse bit sequence of upper nibble of each byte.
6068 vpandn(xtmp2, xtmp2, src, vec_enc);
6069 vpsrlq(xtmp2, xtmp2, 4, vec_enc);
6070 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
6071
    // OR the left shifted reverse bit sequence of the lower nibble with the right
    // shifted reverse bit sequence of the upper nibble to obtain the reverse bit
    // sequence of each byte.
6074 vpor(xtmp2, dst, xtmp2, vec_enc);
6075 vector_reverse_byte(bt, dst, xtmp2, vec_enc);
6076 }
6077 }
6078
6079 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
6080 XMMRegister xtmp, Register rscratch) {
6081 assert(VM_Version::supports_gfni(), "");
6082 assert(rscratch != noreg || always_reachable(mask), "missing");
6083
6084 // Galois field instruction based bit reversal based on following algorithm.
6085 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6086 vpbroadcastq(xtmp, mask, vec_enc, rscratch);
6087 vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
6088 vector_reverse_byte(bt, dst, xtmp, vec_enc);
6089 }
6090
6091 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
6092 XMMRegister xtmp1, Register rtmp, int vec_enc) {
6093 vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
6094 evpandq(dst, xtmp1, src, vec_enc);
6095 vpsllq(dst, dst, nbits, vec_enc);
6096 vpandn(xtmp1, xtmp1, src, vec_enc);
6097 vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
6098 evporq(dst, dst, xtmp1, vec_enc);
6099 }
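
// For illustration: vector_swap_nbits(4, 0x0F0F0F0F, ...) computes, per lane,
//   dst = ((src & 0x0F0F0F0F) << 4) | ((src & ~0x0F0F0F0F) >> 4)
// i.e. it swaps the two nibbles of every byte (0xD2 -> 0x2D); nbits = 2 / 1
// with masks 0x33333333 / 0x55555555 swap bit pairs and adjacent bits the
// same way.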
6100
6101 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6102 XMMRegister xtmp2, Register rtmp, int vec_enc) {
6103 // Shift based bit reversal.
6104 assert(VM_Version::supports_evex(), "");
6105 switch(bt) {
6106 case T_LONG:
6107 // Swap upper and lower double word of each quad word.
6108 evprorq(xtmp1, k0, src, 32, true, vec_enc);
6109 evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
6110 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
6111 break;
6112 case T_INT:
6113 // Swap upper and lower word of each double word.
6114 evprord(xtmp1, k0, src, 16, true, vec_enc);
6115 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
6116 break;
6117 case T_CHAR:
6118 case T_SHORT:
6119 // Swap upper and lower byte of each word.
6120 vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
6121 break;
6122 case T_BYTE:
6123 evmovdquq(dst, k0, src, true, vec_enc);
6124 break;
6125 default:
6126 fatal("Unsupported type %s", type2name(bt));
6127 break;
6128 }
6129 }
6130
6131 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
6132 if (bt == T_BYTE) {
6133 if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
6134 evmovdquq(dst, k0, src, true, vec_enc);
6135 } else {
6136 vmovdqu(dst, src);
6137 }
6138 return;
6139 }
6140 // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
6141 // pre-computed shuffle indices.
6142 switch(bt) {
6143 case T_LONG:
6144 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
6145 break;
6146 case T_INT:
6147 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
6148 break;
6149 case T_CHAR:
6150 case T_SHORT:
6151 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
6152 break;
6153 default:
6154 fatal("Unsupported type %s", type2name(bt));
6155 break;
6156 }
6157 vpshufb(dst, src, dst, vec_enc);
6158 }
6159
6160 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6161 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6162 KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
6163 assert(is_integral_type(bt), "");
6164 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
6165 assert(VM_Version::supports_avx512cd(), "");
6166 switch(bt) {
6167 case T_LONG:
6168 evplzcntq(dst, ktmp, src, merge, vec_enc);
6169 break;
6170 case T_INT:
6171 evplzcntd(dst, ktmp, src, merge, vec_enc);
6172 break;
6173 case T_SHORT:
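      // Widen each 16-bit element to 32 bits by interleaving it, as the upper half of a
      // double word, with an all-ones lower half; the dword lzcnt of the result then equals
      // the word's lzcnt (capped at 16 when the word is zero). The low and high halves are
      // counted separately and re-packed below.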
6174 vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
6175 vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
6176 evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
6177 vpunpckhwd(dst, xtmp1, src, vec_enc);
6178 evplzcntd(dst, ktmp, dst, merge, vec_enc);
6179 vpackusdw(dst, xtmp2, dst, vec_enc);
6180 break;
6181 case T_BYTE:
      // T1 = Compute leading zero counts of the lower 4 bits of each byte by
      // accessing the lookup table.
      // T2 = Compute leading zero counts of the upper 4 bits of each byte by
      // accessing the lookup table.
      // Add T1 to T2 if the upper 4 bits of the byte are all zeros.
6187 assert(VM_Version::supports_avx512bw(), "");
6188 evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
6189 vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
6190 vpand(xtmp2, dst, src, vec_enc);
6191 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
6192 vpsrlw(xtmp3, src, 4, vec_enc);
6193 vpand(xtmp3, dst, xtmp3, vec_enc);
6194 vpshufb(dst, xtmp1, xtmp3, vec_enc);
6195 vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6196 evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
6197 evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
6198 break;
6199 default:
6200 fatal("Unsupported type %s", type2name(bt));
6201 break;
6202 }
6203 }
6204
6205 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6206 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6207 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
6208 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
  // T1 = Compute leading zero counts of the lower 4 bits of each byte by
  // accessing the lookup table.
6211 vpand(dst, xtmp2, src, vec_enc);
6212 vpshufb(dst, xtmp1, dst, vec_enc);
  // T2 = Compute leading zero counts of the upper 4 bits of each byte by
  // accessing the lookup table.
6215 vpsrlw(xtmp3, src, 4, vec_enc);
6216 vpand(xtmp3, xtmp2, xtmp3, vec_enc);
6217 vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
  // Add T1 to T2 if the upper 4 bits of the byte are all zeros.
6219 vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6220 vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
6221 vpaddb(dst, dst, xtmp2, vec_enc);
6222 vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
6223 }
6224
6225 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6226 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6227 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6228 // Add zero counts of lower byte and upper byte of a word if
6229 // upper byte holds a zero value.
6230 vpsrlw(xtmp3, src, 8, vec_enc);
6231 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
6232 vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
6233 vpsllw(xtmp2, dst, 8, vec_enc);
6234 vpaddw(xtmp2, xtmp2, dst, vec_enc);
6235 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6236 vpsrlw(dst, dst, 8, vec_enc);
6237 }
6238
6239 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6240 XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
6241 // By converting the integer to a float, we can obtain the number of leading zeros based on the exponent of the float.
6242 // As the float exponent contains a bias of 127 for nonzero values, the bias must be removed before interpreting the
6243 // exponent as the leading zero count.
6244
  // Clear every bit that sits immediately below a set bit (in particular the bit just below the
  // highest set bit), ensuring that the conversion to float cannot round up to a higher power
  // of 2, which would have a higher exponent than the input. This transformation is valid as
  // only the highest set bit contributes to the number of leading zeros.
6248 vpsrld(dst, src, 1, vec_enc);
6249 vpandn(dst, dst, src, vec_enc);
6250
6251 vcvtdq2ps(dst, dst, vec_enc);
6252
6253 // By comparing the register to itself, all the bits in the destination are set.
6254 vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);
6255
6256 // Move the biased exponent to the low end of the lane and mask with 0xFF to discard the sign bit.
6257 vpsrld(xtmp2, xtmp1, 24, vec_enc);
6258 vpsrld(dst, dst, 23, vec_enc);
6259 vpand(dst, xtmp2, dst, vec_enc);
6260
6261 // Subtract 127 from the exponent, which removes the bias from the exponent.
6262 vpsrld(xtmp2, xtmp1, 25, vec_enc);
6263 vpsubd(dst, dst, xtmp2, vec_enc);
6264
6265 vpsrld(xtmp2, xtmp1, 27, vec_enc);
6266
  // If the original value is 0, the biased exponent field is also 0, so the subtraction creates
  // a negative number. If this is found in any of the lanes, replace the lane with -1 from xtmp1.
6269 vblendvps(dst, dst, xtmp1, dst, vec_enc, true, xtmp3);
6270
6271 // If the original value is negative, replace the lane with 31.
6272 vblendvps(dst, dst, xtmp2, src, vec_enc, true, xtmp3);
6273
6274 // Subtract the exponent from 31, giving the final result. For 0, the result is 32 as the exponent was replaced with -1,
6275 // and for negative numbers the result is 0 as the exponent was replaced with 31.
6276 vpsubd(dst, xtmp2, dst, vec_enc);
6277 }
6278
6279 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6280 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6281 // Find the leading zeros of the top and bottom halves of the long individually.
6282 vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
6283
6284 // Move the top half result to the bottom half of xtmp1, setting the top half to 0.
6285 vpsrlq(xtmp1, dst, 32, vec_enc);
6286 // By moving the top half result to the right by 6 bits, if the top half was empty (i.e. 32 is returned) the result bit will
6287 // be in the most significant position of the bottom half.
6288 vpsrlq(xtmp2, dst, 6, vec_enc);
6289
6290 // In the bottom half, add the top half and bottom half results.
6291 vpaddq(dst, xtmp1, dst, vec_enc);
6292
  // For the bottom half, choose between the values using the most significant bit of xtmp2.
  // If the MSB is set, then bottom+top in dst is the resulting value. If the top-half count
  // is less than 32, xtmp1 is chosen, which contains only the top half result.
  // In the top half the MSB is always zero, so the value in xtmp1 is always chosen. This
  // value is always 0, which clears the lane as required.
6298 vblendvps(dst, xtmp1, dst, xtmp2, vec_enc, true, xtmp3);
6299 }
6300
6301 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
6302 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6303 Register rtmp, int vec_enc) {
6304 assert(is_integral_type(bt), "unexpected type");
6305 assert(vec_enc < Assembler::AVX_512bit, "");
6306 switch(bt) {
6307 case T_LONG:
6308 vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6309 break;
6310 case T_INT:
6311 vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
6312 break;
6313 case T_SHORT:
6314 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6315 break;
6316 case T_BYTE:
6317 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6318 break;
6319 default:
6320 fatal("Unsupported type %s", type2name(bt));
6321 break;
6322 }
6323 }
6324
6325 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
6326 switch(bt) {
6327 case T_BYTE:
6328 vpsubb(dst, src1, src2, vec_enc);
6329 break;
6330 case T_SHORT:
6331 vpsubw(dst, src1, src2, vec_enc);
6332 break;
6333 case T_INT:
6334 vpsubd(dst, src1, src2, vec_enc);
6335 break;
6336 case T_LONG:
6337 vpsubq(dst, src1, src2, vec_enc);
6338 break;
6339 default:
6340 fatal("Unsupported type %s", type2name(bt));
6341 break;
6342 }
6343 }
6344
// Trailing zero count computation is based on the leading zero count operation as per the
// following identity. All AVX3 targets support the AVX512CD feature, which offers a
// direct vector instruction to compute the leading zero count.
// CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
6349 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6350 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6351 XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
6352 assert(is_integral_type(bt), "");
  // xtmp4 = -1
  vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
  // xtmp4 = xtmp4 + src, i.e. src - 1
  vpadd(bt, xtmp4, xtmp4, src, vec_enc);
  // xtmp4 = xtmp4 & ~src
  vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
6359 vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
6360 vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
6361 vpsub(bt, dst, xtmp4, dst, vec_enc);
6362 }
6363
// Trailing zero count computation for AVX2 targets is based on the popcount operation as per
// the following identity:
// CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
6366 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6367 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6368 assert(is_integral_type(bt), "");
  // xtmp3 = 0
  vpxor(xtmp3, xtmp3, xtmp3, vec_enc);
  // xtmp3 = 0 - src
  vpsub(bt, xtmp3, xtmp3, src, vec_enc);
  // xtmp3 = xtmp3 | src
  vpor(xtmp3, xtmp3, src, vec_enc);
6375 vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
6376 vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
6377 vpsub(bt, dst, xtmp1, dst, vec_enc);
6378 }
6379
6380 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
6381 Label done;
6382 Label neg_divisor_fastpath;
6383 cmpl(divisor, 0);
6384 jccb(Assembler::less, neg_divisor_fastpath);
6385 xorl(rdx, rdx);
6386 divl(divisor);
6387 jmpb(done);
6388 bind(neg_divisor_fastpath);
6389 // Fastpath for divisor < 0:
6390 // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6391 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
6392 movl(rdx, rax);
6393 subl(rdx, divisor);
6394 if (VM_Version::supports_bmi1()) {
6395 andnl(rax, rdx, rax);
6396 } else {
6397 notl(rdx);
6398 andl(rax, rdx);
6399 }
6400 shrl(rax, 31);
6401 bind(done);
6402 }
6403
6404 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
6405 Label done;
6406 Label neg_divisor_fastpath;
6407 cmpl(divisor, 0);
6408 jccb(Assembler::less, neg_divisor_fastpath);
6409 xorl(rdx, rdx);
6410 divl(divisor);
6411 jmpb(done);
6412 bind(neg_divisor_fastpath);
6413 // Fastpath when divisor < 0:
6414 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6415 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6416 movl(rdx, rax);
6417 subl(rax, divisor);
6418 if (VM_Version::supports_bmi1()) {
6419 andnl(rax, rax, rdx);
6420 } else {
6421 notl(rax);
6422 andl(rax, rdx);
6423 }
6424 sarl(rax, 31);
6425 andl(rax, divisor);
6426 subl(rdx, rax);
6427 bind(done);
6428 }
6429
6430 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
6431 Label done;
6432 Label neg_divisor_fastpath;
6433
6434 cmpl(divisor, 0);
6435 jccb(Assembler::less, neg_divisor_fastpath);
6436 xorl(rdx, rdx);
6437 divl(divisor);
6438 jmpb(done);
6439 bind(neg_divisor_fastpath);
6440 // Fastpath for divisor < 0:
6441 // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6442 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6443 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6444 // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6445 movl(rdx, rax);
6446 subl(rax, divisor);
6447 if (VM_Version::supports_bmi1()) {
6448 andnl(rax, rax, rdx);
6449 } else {
6450 notl(rax);
6451 andl(rax, rdx);
6452 }
6453 movl(tmp, rax);
6454 shrl(rax, 31); // quotient
6455 sarl(tmp, 31);
6456 andl(tmp, divisor);
6457 subl(rdx, tmp); // remainder
6458 bind(done);
6459 }
6460
6461 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
6462 XMMRegister xtmp2, Register rtmp) {
  if (VM_Version::supports_gfni()) {
    // Bit reversal using the Galois field affine instruction, based on the following algorithm:
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6466 mov64(rtmp, 0x8040201008040201L);
6467 movq(xtmp1, src);
6468 movq(xtmp2, rtmp);
6469 gf2p8affineqb(xtmp1, xtmp2, 0);
6470 movq(dst, xtmp1);
6471 } else {
6472 // Swap even and odd numbered bits.
6473 movl(rtmp, src);
6474 andl(rtmp, 0x55555555);
6475 shll(rtmp, 1);
6476 movl(dst, src);
6477 andl(dst, 0xAAAAAAAA);
6478 shrl(dst, 1);
6479 orl(dst, rtmp);
6480
    // Swap the lower and upper 2 bits of each nibble.
6482 movl(rtmp, dst);
6483 andl(rtmp, 0x33333333);
6484 shll(rtmp, 2);
6485 andl(dst, 0xCCCCCCCC);
6486 shrl(dst, 2);
6487 orl(dst, rtmp);
6488
    // Swap the lower and upper 4 bits of each byte.
6490 movl(rtmp, dst);
6491 andl(rtmp, 0x0F0F0F0F);
6492 shll(rtmp, 4);
6493 andl(dst, 0xF0F0F0F0);
6494 shrl(dst, 4);
6495 orl(dst, rtmp);
6496 }
6497 bswapl(dst);
6498 }
6499
6500 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
6501 XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
  if (VM_Version::supports_gfni()) {
    // Bit reversal using the Galois field affine instruction, based on the following algorithm:
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6505 mov64(rtmp1, 0x8040201008040201L);
6506 movq(xtmp1, src);
6507 movq(xtmp2, rtmp1);
6508 gf2p8affineqb(xtmp1, xtmp2, 0);
6509 movq(dst, xtmp1);
6510 } else {
6511 // Swap even and odd numbered bits.
6512 movq(rtmp1, src);
6513 mov64(rtmp2, 0x5555555555555555L);
6514 andq(rtmp1, rtmp2);
6515 shlq(rtmp1, 1);
6516 movq(dst, src);
6517 notq(rtmp2);
6518 andq(dst, rtmp2);
6519 shrq(dst, 1);
6520 orq(dst, rtmp1);
6521
    // Swap the lower and upper 2 bits of each nibble.
6523 movq(rtmp1, dst);
6524 mov64(rtmp2, 0x3333333333333333L);
6525 andq(rtmp1, rtmp2);
6526 shlq(rtmp1, 2);
6527 notq(rtmp2);
6528 andq(dst, rtmp2);
6529 shrq(dst, 2);
6530 orq(dst, rtmp1);
6531
    // Swap the lower and upper 4 bits of each byte.
6533 movq(rtmp1, dst);
6534 mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
6535 andq(rtmp1, rtmp2);
6536 shlq(rtmp1, 4);
6537 notq(rtmp2);
6538 andq(dst, rtmp2);
6539 shrq(dst, 4);
6540 orq(dst, rtmp1);
6541 }
6542 bswapq(dst);
6543 }
6544
6545 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
6546 Label done;
6547 Label neg_divisor_fastpath;
6548 cmpq(divisor, 0);
6549 jccb(Assembler::less, neg_divisor_fastpath);
6550 xorl(rdx, rdx);
6551 divq(divisor);
6552 jmpb(done);
6553 bind(neg_divisor_fastpath);
6554 // Fastpath for divisor < 0:
6555 // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6556 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
6557 movq(rdx, rax);
6558 subq(rdx, divisor);
6559 if (VM_Version::supports_bmi1()) {
6560 andnq(rax, rdx, rax);
6561 } else {
6562 notq(rdx);
6563 andq(rax, rdx);
6564 }
6565 shrq(rax, 63);
6566 bind(done);
6567 }
6568
6569 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
6570 Label done;
6571 Label neg_divisor_fastpath;
6572 cmpq(divisor, 0);
6573 jccb(Assembler::less, neg_divisor_fastpath);
6574 xorq(rdx, rdx);
6575 divq(divisor);
6576 jmp(done);
6577 bind(neg_divisor_fastpath);
6578 // Fastpath when divisor < 0:
6579 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6580 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6581 movq(rdx, rax);
6582 subq(rax, divisor);
6583 if (VM_Version::supports_bmi1()) {
6584 andnq(rax, rax, rdx);
6585 } else {
6586 notq(rax);
6587 andq(rax, rdx);
6588 }
6589 sarq(rax, 63);
6590 andq(rax, divisor);
6591 subq(rdx, rax);
6592 bind(done);
6593 }
6594
6595 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
6596 Label done;
6597 Label neg_divisor_fastpath;
6598 cmpq(divisor, 0);
6599 jccb(Assembler::less, neg_divisor_fastpath);
6600 xorq(rdx, rdx);
6601 divq(divisor);
6602 jmp(done);
6603 bind(neg_divisor_fastpath);
6604 // Fastpath for divisor < 0:
6605 // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6606 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6607 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6608 // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6609 movq(rdx, rax);
6610 subq(rax, divisor);
6611 if (VM_Version::supports_bmi1()) {
6612 andnq(rax, rax, rdx);
6613 } else {
6614 notq(rax);
6615 andq(rax, rdx);
6616 }
6617 movq(tmp, rax);
6618 shrq(rax, 63); // quotient
6619 sarq(tmp, 63);
6620 andq(tmp, divisor);
6621 subq(rdx, tmp); // remainder
6622 bind(done);
6623 }
6624
6625 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
6626 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
6627 int vlen_enc) {
6628 assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are in-lane operations, and indices are determined using the
  // lower 4 bits of each shuffle lane; thus all shuffle indices are effectively
  // normalized to the index range 0-15. This means indices that differ by a
  // multiple of 16 address the same relative position within a 128-bit lane,
  // i.e. elements corresponding to shuffle indices 16, 32 and 48 all map to
  // relative position 0 of their respective 128-bit lanes.
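  // For example, shuffle index 37 has the low nibble 5, so it selects relative element 5;
  // the range check 32 <= 37 < 48 in the third step below routes that selection to the
  // broadcast third 128-bit lane.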
6635 movl(rtmp, 16);
6636 evpbroadcastb(xtmp1, rtmp, vlen_enc);
6637
  // Compute a mask for the shuffle vector by comparing the indices against the expression
  // INDEX < 16. Broadcast the first 128-bit lane across the entire vector, shuffle the
  // vector lanes using the original shuffle indices, and move the shuffled lanes
  // corresponding to a true mask into the destination vector.
6642 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
6643 evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
6644 evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
6645
  // Repeat the above steps with the lane comparison expression INDEX >= 16 && INDEX < 32,
  // broadcasting the second 128-bit lane.
6648 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
6649 vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
6650 evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
6651 evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
6652 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6653
  // Repeat the above steps with the lane comparison expression INDEX >= 32 && INDEX < 48,
  // broadcasting the third 128-bit lane.
6656 evpcmpb(ktmp, k0, shuffle, xtmp2, Assembler::nlt, true, vlen_enc);
6657 vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
6658 evpcmpb(ktmp, ktmp, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
6659 evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
6660 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6661
  // Repeat the above steps with the lane comparison expression INDEX >= 48 && INDEX < 64,
  // broadcasting the fourth 128-bit lane.
6664 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
6665 vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
6666 evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
6667 evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
6668 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6669 }
6670
6671 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
6672 XMMRegister shuffle, XMMRegister src, int vlen_enc) {
6673 if (vlen_enc == AVX_128bit) {
6674 vpermilps(dst, src, shuffle, vlen_enc);
6675 } else if (bt == T_INT) {
6676 vpermd(dst, shuffle, src, vlen_enc);
6677 } else {
6678 assert(bt == T_FLOAT, "");
6679 vpermps(dst, shuffle, src, vlen_enc);
6680 }
6681 }
6682
6683 void C2_MacroAssembler::efp16sh(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
6684 switch(opcode) {
6685 case Op_AddHF: vaddsh(dst, src1, src2); break;
6686 case Op_SubHF: vsubsh(dst, src1, src2); break;
6687 case Op_MulHF: vmulsh(dst, src1, src2); break;
6688 case Op_DivHF: vdivsh(dst, src1, src2); break;
6689 default: assert(false, "%s", NodeClassNames[opcode]); break;
6690 }
6691 }
6692
6693 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6694 switch(elem_bt) {
6695 case T_BYTE:
6696 if (ideal_opc == Op_SaturatingAddV) {
6697 vpaddsb(dst, src1, src2, vlen_enc);
6698 } else {
6699 assert(ideal_opc == Op_SaturatingSubV, "");
6700 vpsubsb(dst, src1, src2, vlen_enc);
6701 }
6702 break;
6703 case T_SHORT:
6704 if (ideal_opc == Op_SaturatingAddV) {
6705 vpaddsw(dst, src1, src2, vlen_enc);
6706 } else {
6707 assert(ideal_opc == Op_SaturatingSubV, "");
6708 vpsubsw(dst, src1, src2, vlen_enc);
6709 }
6710 break;
6711 default:
6712 fatal("Unsupported type %s", type2name(elem_bt));
6713 break;
6714 }
6715 }
6716
6717 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6718 switch(elem_bt) {
6719 case T_BYTE:
6720 if (ideal_opc == Op_SaturatingAddV) {
6721 vpaddusb(dst, src1, src2, vlen_enc);
6722 } else {
6723 assert(ideal_opc == Op_SaturatingSubV, "");
6724 vpsubusb(dst, src1, src2, vlen_enc);
6725 }
6726 break;
6727 case T_SHORT:
6728 if (ideal_opc == Op_SaturatingAddV) {
6729 vpaddusw(dst, src1, src2, vlen_enc);
6730 } else {
6731 assert(ideal_opc == Op_SaturatingSubV, "");
6732 vpsubusw(dst, src1, src2, vlen_enc);
6733 }
6734 break;
6735 default:
6736 fatal("Unsupported type %s", type2name(elem_bt));
6737 break;
6738 }
6739 }
6740
6741 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6742 XMMRegister src2, KRegister ktmp, int vlen_enc) {
  // For unsigned subtraction, overflow happens when the magnitude of the second input is greater
  // than that of the first, and the saturated result is zero. Compute the complementary
  // no-overflow mask instead: ktmp = Inp2 <u Inp1.
  evpcmpu(elem_bt, ktmp, src2, src1, Assembler::lt, vlen_enc);
  // Res = ktmp ? INP1 - INP2 : Zero, via zeroing-masked subtraction (subtraction is
  // non-commutative, so operand order matters; the Inp1 == Inp2 lanes are zero either way).
  evmasked_op(elem_bt == T_INT ? Op_SubVI : Op_SubVL, elem_bt, ktmp, dst, src1, src2, false, vlen_enc, false);
6748 }
6749
6750 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6751 XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  // Emulate unsigned comparison using signed comparison:
  // Mask = Inp1 <u Inp2  <=>  (Inp1 + MIN_VALUE) <s (Inp2 + MIN_VALUE)
6754 vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc, true);
6755 vpadd(elem_bt, xtmp2, src1, xtmp1, vlen_enc);
6756 vpadd(elem_bt, xtmp1, src2, xtmp1, vlen_enc);
6757
6758 vpcmpgt(elem_bt, xtmp2, xtmp1, xtmp2, vlen_enc);
6759
6760 // Res = INP1 - INP2 (non-commutative and non-associative)
6761 vpsub(elem_bt, dst, src1, src2, vlen_enc);
6762 // Res = Mask ? Zero : Res
6763 vpxor(xtmp1, xtmp1, xtmp1, vlen_enc);
6764 vpblendvb(dst, dst, xtmp1, xtmp2, vlen_enc);
6765 }
6766
6767 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6768 XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp, int vlen_enc) {
  // Unsigned value ranges comprise only non-negative numbers, thus only upper-bound saturation can occur.
6770 // overflow_mask = (SRC1 + SRC2) <u (SRC1 | SRC2)
6771 // Res = Signed Add INP1, INP2
6772 vpadd(elem_bt, dst, src1, src2, vlen_enc);
6773 // T1 = SRC1 | SRC2
6774 vpor(xtmp1, src1, src2, vlen_enc);
6775 // Max_Unsigned = -1
6776 vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6777 // Unsigned compare: Mask = Res <u T1
6778 evpcmpu(elem_bt, ktmp, dst, xtmp1, Assembler::lt, vlen_enc);
6779 // res = Mask ? Max_Unsigned : Res
6780 evpblend(elem_bt, dst, ktmp, dst, xtmp2, true, vlen_enc);
6781 }
6782
6783 //
// Section 2-13 of Hacker's Delight lists the following overflow detection check for the
// saturating unsigned addition operation.
// overflow_mask = (((a & b) | ((a | b) & ~(a + b))) >>> 31) == 1
//
// We empirically determined its semantic equivalence to the following reduced expression
// overflow_mask = (a + b) <u (a | b)
//
// and also verified it through the Alive2 solver.
6792 // (https://alive2.llvm.org/ce/z/XDQ7dY)
6793 //
6794
6795 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6796 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, int vlen_enc) {
6797 // Res = Signed Add INP1, INP2
6798 vpadd(elem_bt, dst, src1, src2, vlen_enc);
6799 // Compute T1 = INP1 | INP2
6800 vpor(xtmp3, src1, src2, vlen_enc);
  // xtmp2 = Minimum signed value; as a side effect xtmp1 is left holding -1 (the unsigned
  // max), which is used as the saturating value in the final blend.
  vpgenmin_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
  // Bias T1 into signed range, T1<s> = T1 + MIN_VALUE
  vpadd(elem_bt, xtmp3, xtmp3, xtmp2, vlen_enc);
  // Bias Res into signed range, Res<s> = Res + MIN_VALUE
  vpadd(elem_bt, xtmp2, xtmp2, dst, vlen_enc);
  // Compute overflow detection mask = Res<s> <s T1<s> (i.e. Res <u T1)
6808 if (elem_bt == T_INT) {
6809 vpcmpgtd(xtmp3, xtmp3, xtmp2, vlen_enc);
6810 } else {
6811 assert(elem_bt == T_LONG, "");
6812 vpcmpgtq(xtmp3, xtmp3, xtmp2, vlen_enc);
6813 }
6814 vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
6815 }
6816
6817 void C2_MacroAssembler::evpmovq2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
6818 int vlen_enc, bool xtmp2_hold_M1) {
6819 if (VM_Version::supports_avx512dq()) {
6820 evpmovq2m(ktmp, src, vlen_enc);
6821 } else {
6822 assert(VM_Version::supports_evex(), "");
6823 if (!xtmp2_hold_M1) {
6824 vpternlogq(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6825 }
6826 evpsraq(xtmp1, src, 63, vlen_enc);
6827 evpcmpeqq(ktmp, k0, xtmp1, xtmp2, vlen_enc);
6828 }
6829 }
6830
6831 void C2_MacroAssembler::evpmovd2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
6832 int vlen_enc, bool xtmp2_hold_M1) {
6833 if (VM_Version::supports_avx512dq()) {
6834 evpmovd2m(ktmp, src, vlen_enc);
6835 } else {
6836 assert(VM_Version::supports_evex(), "");
6837 if (!xtmp2_hold_M1) {
6838 vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6839 }
6840 vpsrad(xtmp1, src, 31, vlen_enc);
6841 Assembler::evpcmpeqd(ktmp, k0, xtmp1, xtmp2, vlen_enc);
6842 }
6843 }
6844
6845
6846 void C2_MacroAssembler::vpsign_extend_dq(BasicType elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
6847 if (elem_bt == T_LONG) {
6848 if (VM_Version::supports_evex()) {
6849 evpsraq(dst, src, 63, vlen_enc);
6850 } else {
6851 vpsrad(dst, src, 31, vlen_enc);
6852 vpshufd(dst, dst, 0xF5, vlen_enc);
6853 }
6854 } else {
6855 assert(elem_bt == T_INT, "");
6856 vpsrad(dst, src, 31, vlen_enc);
6857 }
6858 }
6859
6860 void C2_MacroAssembler::vpgenmax_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
6861 if (compute_allones) {
6862 if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
6863 vpternlogd(allones, 0xff, allones, allones, vlen_enc);
6864 } else {
6865 vpcmpeqq(allones, allones, allones, vlen_enc);
6866 }
6867 }
6868 if (elem_bt == T_LONG) {
6869 vpsrlq(dst, allones, 1, vlen_enc);
6870 } else {
6871 assert(elem_bt == T_INT, "");
6872 vpsrld(dst, allones, 1, vlen_enc);
6873 }
6874 }
6875
6876 void C2_MacroAssembler::vpgenmin_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
6877 if (compute_allones) {
6878 if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
6879 vpternlogd(allones, 0xff, allones, allones, vlen_enc);
6880 } else {
6881 vpcmpeqq(allones, allones, allones, vlen_enc);
6882 }
6883 }
6884 if (elem_bt == T_LONG) {
6885 vpsllq(dst, allones, 63, vlen_enc);
6886 } else {
6887 assert(elem_bt == T_INT, "");
6888 vpslld(dst, allones, 31, vlen_enc);
6889 }
6890 }
6891
6892 void C2_MacroAssembler::evpcmpu(BasicType elem_bt, KRegister kmask, XMMRegister src1, XMMRegister src2,
6893 Assembler::ComparisonPredicate cond, int vlen_enc) {
6894 switch(elem_bt) {
6895 case T_LONG: evpcmpuq(kmask, src1, src2, cond, vlen_enc); break;
6896 case T_INT: evpcmpud(kmask, src1, src2, cond, vlen_enc); break;
6897 case T_SHORT: evpcmpuw(kmask, src1, src2, cond, vlen_enc); break;
6898 case T_BYTE: evpcmpub(kmask, src1, src2, cond, vlen_enc); break;
6899 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6900 }
6901 }
6902
6903 void C2_MacroAssembler::vpcmpgt(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6904 switch(elem_bt) {
6905 case T_LONG: vpcmpgtq(dst, src1, src2, vlen_enc); break;
6906 case T_INT: vpcmpgtd(dst, src1, src2, vlen_enc); break;
6907 case T_SHORT: vpcmpgtw(dst, src1, src2, vlen_enc); break;
6908 case T_BYTE: vpcmpgtb(dst, src1, src2, vlen_enc); break;
6909 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6910 }
6911 }
6912
6913 void C2_MacroAssembler::evpmov_vec_to_mask(BasicType elem_bt, KRegister ktmp, XMMRegister src, XMMRegister xtmp1,
6914 XMMRegister xtmp2, int vlen_enc, bool xtmp2_hold_M1) {
6915 if (elem_bt == T_LONG) {
6916 evpmovq2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6917 } else {
6918 assert(elem_bt == T_INT, "");
6919 evpmovd2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6920 }
6921 }
6922
6923 void C2_MacroAssembler::vector_addsub_dq_saturating_evex(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6924 XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
6925 KRegister ktmp1, KRegister ktmp2, int vlen_enc) {
6926 assert(elem_bt == T_INT || elem_bt == T_LONG, "");
  // Addition/subtraction happens over the two's complement representation of numbers and is agnostic to signedness.
  // Overflow detection is based on Hacker's Delight, section 2-13.
6929 if (ideal_opc == Op_SaturatingAddV) {
6930 // res = src1 + src2
6931 vpadd(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs if both inputs have the same sign and the result's sign differs from it.
    // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
6934 vpxor(xtmp1, dst, src1, vlen_enc);
6935 vpxor(xtmp2, dst, src2, vlen_enc);
6936 vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6937 } else {
6938 assert(ideal_opc == Op_SaturatingSubV, "");
6939 // res = src1 - src2
6940 vpsub(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs when the inputs have opposite signs and the result's sign
    // differs from the first input's sign.
    // overflow = (((src1 ^ src2) & (res ^ src1)) >>> 31(I)/63(L)) == 1
6944 vpxor(xtmp1, src1, src2, vlen_enc);
6945 vpxor(xtmp2, dst, src1, vlen_enc);
6946 vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6947 }
6948
6949 // Compute overflow detection mask.
6950 evpmov_vec_to_mask(elem_bt, ktmp1, xtmp2, xtmp2, xtmp1, vlen_enc);
  // Note: xtmp1 holds -1 in all its lanes after the above call.
6952
6953 // Compute mask based on first input polarity.
6954 evpmov_vec_to_mask(elem_bt, ktmp2, src1, xtmp2, xtmp1, vlen_enc, true);
6955
6956 vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
6957 vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);
6958
6959 // Compose a vector of saturating (MAX/MIN) values, where lanes corresponding to
6960 // set bits in first input polarity mask holds a min value.
6961 evpblend(elem_bt, xtmp2, ktmp2, xtmp2, xtmp1, true, vlen_enc);
6962 // Blend destination lanes with saturated values using overflow detection mask.
6963 evpblend(elem_bt, dst, ktmp1, dst, xtmp2, true, vlen_enc);
6964 }
6965
6966
6967 void C2_MacroAssembler::vector_addsub_dq_saturating_avx(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6968 XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
6969 XMMRegister xtmp3, XMMRegister xtmp4, int vlen_enc) {
6970 assert(elem_bt == T_INT || elem_bt == T_LONG, "");
  // Addition/subtraction happens over the two's complement representation of numbers and is agnostic to signedness.
  // Overflow detection is based on Hacker's Delight, section 2-13.
6973 if (ideal_opc == Op_SaturatingAddV) {
6974 // res = src1 + src2
6975 vpadd(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs if both inputs have the same sign and the result's sign differs from it.
    // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
6978 vpxor(xtmp1, dst, src1, vlen_enc);
6979 vpxor(xtmp2, dst, src2, vlen_enc);
6980 vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6981 } else {
6982 assert(ideal_opc == Op_SaturatingSubV, "");
6983 // res = src1 - src2
6984 vpsub(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs when the inputs have opposite signs and the result's sign
    // differs from the first input's sign.
    // overflow = (((src1 ^ src2) & (res ^ src1)) >>> 31(I)/63(L)) == 1
6988 vpxor(xtmp1, src1, src2, vlen_enc);
6989 vpxor(xtmp2, dst, src1, vlen_enc);
6990 vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6991 }
6992
6993 // Sign-extend to compute overflow detection mask.
6994 vpsign_extend_dq(elem_bt, xtmp3, xtmp2, vlen_enc);
6995
6996 vpcmpeqd(xtmp1, xtmp1, xtmp1, vlen_enc);
6997 vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc);
6998 vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);
6999
7000 // Compose saturating min/max vector using first input polarity mask.
7001 vpsign_extend_dq(elem_bt, xtmp4, src1, vlen_enc);
7002 vpblendvb(xtmp1, xtmp2, xtmp1, xtmp4, vlen_enc);
7003
7004 // Blend result with saturating vector using overflow detection mask.
7005 vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
7006 }
7007
7008 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
7009 switch(elem_bt) {
7010 case T_BYTE:
7011 if (ideal_opc == Op_SaturatingAddV) {
7012 vpaddsb(dst, src1, src2, vlen_enc);
7013 } else {
7014 assert(ideal_opc == Op_SaturatingSubV, "");
7015 vpsubsb(dst, src1, src2, vlen_enc);
7016 }
7017 break;
7018 case T_SHORT:
7019 if (ideal_opc == Op_SaturatingAddV) {
7020 vpaddsw(dst, src1, src2, vlen_enc);
7021 } else {
7022 assert(ideal_opc == Op_SaturatingSubV, "");
7023 vpsubsw(dst, src1, src2, vlen_enc);
7024 }
7025 break;
7026 default:
7027 fatal("Unsupported type %s", type2name(elem_bt));
7028 break;
7029 }
7030 }
7031
7032 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
7033 switch(elem_bt) {
7034 case T_BYTE:
7035 if (ideal_opc == Op_SaturatingAddV) {
7036 vpaddusb(dst, src1, src2, vlen_enc);
7037 } else {
7038 assert(ideal_opc == Op_SaturatingSubV, "");
7039 vpsubusb(dst, src1, src2, vlen_enc);
7040 }
7041 break;
7042 case T_SHORT:
7043 if (ideal_opc == Op_SaturatingAddV) {
7044 vpaddusw(dst, src1, src2, vlen_enc);
7045 } else {
7046 assert(ideal_opc == Op_SaturatingSubV, "");
7047 vpsubusw(dst, src1, src2, vlen_enc);
7048 }
7049 break;
7050 default:
7051 fatal("Unsupported type %s", type2name(elem_bt));
7052 break;
7053 }
7054 }
7055
7056 void C2_MacroAssembler::select_from_two_vectors_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
7057 XMMRegister src2, int vlen_enc) {
7058 switch(elem_bt) {
7059 case T_BYTE:
7060 evpermi2b(dst, src1, src2, vlen_enc);
7061 break;
7062 case T_SHORT:
7063 evpermi2w(dst, src1, src2, vlen_enc);
7064 break;
7065 case T_INT:
7066 evpermi2d(dst, src1, src2, vlen_enc);
7067 break;
7068 case T_LONG:
7069 evpermi2q(dst, src1, src2, vlen_enc);
7070 break;
7071 case T_FLOAT:
7072 evpermi2ps(dst, src1, src2, vlen_enc);
7073 break;
7074 case T_DOUBLE:
7075 evpermi2pd(dst, src1, src2, vlen_enc);
7076 break;
7077 default:
7078 fatal("Unsupported type %s", type2name(elem_bt));
7079 break;
7080 }
7081 }
7082
7083 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, bool is_unsigned, int vlen_enc) {
7084 if (is_unsigned) {
7085 vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7086 } else {
7087 vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7088 }
7089 }
7090
7091 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, bool is_unsigned, int vlen_enc) {
7092 if (is_unsigned) {
7093 vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7094 } else {
7095 vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7096 }
7097 }
7098
7099 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
7100 switch(opcode) {
7101 case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
7102 case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
7103 case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
7104 case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
7105 default: assert(false, "%s", NodeClassNames[opcode]); break;
7106 }
7107 }
7108
7109 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
7110 switch(opcode) {
7111 case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
7112 case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
7113 case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
7114 case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
7115 default: assert(false, "%s", NodeClassNames[opcode]); break;
7116 }
7117 }
7118
7119 void C2_MacroAssembler::sminmax_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
7120 KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2) {
7121 vminmax_fp16(opcode, dst, src1, src2, ktmp, xtmp1, xtmp2, Assembler::AVX_128bit);
7122 }
7123
7124 void C2_MacroAssembler::sminmax_fp16_avx10_2(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
7125 KRegister ktmp) {
7126 if (opcode == Op_MaxHF) {
7127 // dst = max(src1, src2)
7128 evminmaxsh(dst, ktmp, src1, src2, true, AVX10_2_MINMAX_MAX_COMPARE_SIGN);
7129 } else {
7130 assert(opcode == Op_MinHF, "");
7131 // dst = min(src1, src2)
7132 evminmaxsh(dst, ktmp, src1, src2, true, AVX10_2_MINMAX_MIN_COMPARE_SIGN);
7133 }
7134 }
7135
7136 void C2_MacroAssembler::vminmax_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
7137 KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
7138 if (opcode == Op_MaxVHF || opcode == Op_MaxHF) {
7139 // Move sign bits of src2 to mask register.
7140 evpmovw2m(ktmp, src2, vlen_enc);
7141 // xtmp1 = src2 < 0 ? src2 : src1
7142 evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
    // xtmp2 = src2 < 0 ? src1 : src2
7144 evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
    // The idea behind the above swapping is to make the second source operand a non-negative value.
    // As per the instruction semantics, if the values being compared are both 0.0 (of either sign), the value in
    // the second source operand is returned. If only one value is a NaN (SNaN or QNaN) for this instruction,
    // the second source operand, either a NaN or a valid floating-point value, is returned.
7149 // dst = max(xtmp1, xtmp2)
7150 evmaxph(dst, xtmp1, xtmp2, vlen_enc);
7151 // isNaN = is_unordered_quiet(xtmp1)
7152 evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
    // The final result is the same as the first source if it is a NaN;
    // if the second operand holds a NaN then, as per the above semantics,
    // the result is already the same as the second operand.
7156 Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
7157 } else {
7158 assert(opcode == Op_MinVHF || opcode == Op_MinHF, "");
7159 // Move sign bits of src1 to mask register.
7160 evpmovw2m(ktmp, src1, vlen_enc);
7161 // xtmp1 = src1 < 0 ? src2 : src1
7162 evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
7163 // xtmp2 = src1 < 0 ? src1 : src2
7164 evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
    // The idea behind the above swapping is to make the second source operand a negative value.
7166 // As per instruction semantics, if the values being compared are both 0.0s (of either sign), the value in
7167 // the second source operand is returned.
7168 // If only one value is a NaN (SNaN or QNaN) for this instruction, the second source operand, either a NaN
7169 // or a valid floating-point value, is written to the result.
7170 // dst = min(xtmp1, xtmp2)
7171 evminph(dst, xtmp1, xtmp2, vlen_enc);
7172 // isNaN = is_unordered_quiet(xtmp1)
7173 evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
    // The final result is the same as the first source if it is a NaN;
    // if the second operand holds a NaN then, as per the above semantics,
    // the result is already the same as the second operand.
7177 Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
7178 }
7179 }
7180
7181 void C2_MacroAssembler::vminmax_fp16_avx10_2(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
7182 KRegister ktmp, int vlen_enc) {
7183 if (opcode == Op_MaxVHF) {
7184 // dst = max(src1, src2)
7185 evminmaxph(dst, ktmp, src1, src2, true, AVX10_2_MINMAX_MAX_COMPARE_SIGN, vlen_enc);
7186 } else {
7187 assert(opcode == Op_MinVHF, "");
7188 // dst = min(src1, src2)
7189 evminmaxph(dst, ktmp, src1, src2, true, AVX10_2_MINMAX_MIN_COMPARE_SIGN, vlen_enc);
7190 }
7191 }
7192
7193 void C2_MacroAssembler::vminmax_fp16_avx10_2(int opcode, XMMRegister dst, XMMRegister src1, Address src2,
7194 KRegister ktmp, int vlen_enc) {
7195 if (opcode == Op_MaxVHF) {
7196 // dst = max(src1, src2)
7197 evminmaxph(dst, ktmp, src1, src2, true, AVX10_2_MINMAX_MAX_COMPARE_SIGN, vlen_enc);
7198 } else {
7199 assert(opcode == Op_MinVHF, "");
7200 // dst = min(src1, src2)
7201 evminmaxph(dst, ktmp, src1, src2, true, AVX10_2_MINMAX_MIN_COMPARE_SIGN, vlen_enc);
7202 }
7203 }
7204
7205 int C2_MacroAssembler::vector_iota_entry_index(BasicType bt) {
7206 // The vector iota entries array is ordered by type B/S/I/L/F/D, and
7207 // the offset between two types is 16.
7208 switch(bt) {
7209 case T_BYTE:
7210 return 0;
7211 case T_SHORT:
7212 return 1;
7213 case T_INT:
7214 return 2;
7215 case T_LONG:
7216 return 3;
7217 case T_FLOAT:
7218 return 4;
7219 case T_DOUBLE:
7220 return 5;
7221 default:
7222 ShouldNotReachHere();
7223 }
7224 }