/*
 * Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "oops/methodData.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/output.hpp"
#include "opto/opcodes.hpp"
#include "opto/subnode.hpp"
#include "runtime/globals.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/checkedCast.hpp"
#include "utilities/globalDefinitions.hpp"
#include "utilities/powerOfTwo.hpp"
#include "utilities/sizes.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

// C2 compiled method's prolog code.
void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {
  assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");

  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  // Remove word for return addr
  framesize -= wordSize;
  stack_bang_size -= wordSize;

  // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers must bang for them. But be careful, because
  // some VM calls (such as call site linkage) can use several kilobytes of
  // stack. But the stack safety zone should account for that.
  // See bugs 4446381, 4468289, 4497237.
  if (stack_bang_size > 0) {
    generate_stack_overflow_check(stack_bang_size);

    // We always push rbp, so that on return to interpreter rbp, will be
    // restored correctly and we can correct the stack.
    push(rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      mov(rbp, rsp);
    }
    // Remove word for ebp
    framesize -= wordSize;

    // Create frame
    if (framesize) {
      subptr(rsp, framesize);
    }
  } else {
    subptr(rsp, framesize);

    // Save RBP register now.
    framesize -= wordSize;
    movptr(Address(rsp, framesize), rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      movptr(rbp, rsp);
      if (framesize > 0) {
        addptr(rbp, framesize);
      }
    }
  }

  if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
    framesize -= wordSize;
    movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
  }

#ifdef ASSERT
  if (VerifyStackAtCalls) {
    Label L;
    push(rax);
    mov(rax, rsp);
    andptr(rax, StackAlignmentInBytes-1);
    cmpptr(rax, StackAlignmentInBytes-wordSize);
    pop(rax);
    jcc(Assembler::equal, L);
    STOP("Stack is not properly aligned!");
    bind(L);
  }
#endif

  if (!is_stub) {
    BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
    // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
    Label dummy_slow_path;
    Label dummy_continuation;
    Label* slow_path = &dummy_slow_path;
    Label* continuation = &dummy_continuation;
    if (!Compile::current()->output()->in_scratch_emit_size()) {
      // Use real labels from actual stub when not emitting code for the purpose of measuring its size
      C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
      Compile::current()->output()->add_stub(stub);
      slow_path = &stub->entry();
      continuation = &stub->continuation();
    }
    bs->nmethod_entry_barrier(this, slow_path, continuation);
  }
}

inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
  switch (vlen_in_bytes) {
    case  4: // fall-through
    case  8: // fall-through
    case 16: return Assembler::AVX_128bit;
    case 32: return Assembler::AVX_256bit;
    case 64: return Assembler::AVX_512bit;

    default: {
      ShouldNotReachHere();
      return Assembler::AVX_NoVec;
    }
  }
}

// fast_lock and fast_unlock used by C2

// Because the transitions from emitted code to the runtime
// monitorenter/exit helper stubs are so slow it's critical that
// we inline both the stack-locking fast path and the inflated fast path.
//
// See also: cmpFastLock and cmpFastUnlock.
//
// What follows is a specialized inline transliteration of the code
// in enter() and exit(). If we're concerned about I$ bloat another
// option would be to emit TrySlowEnter and TrySlowExit methods
// at startup-time. These methods would accept arguments as
// (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
// indications in the icc.ZFlag. fast_lock and fast_unlock would simply
// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
// In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
//
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods. That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads. We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) explicit barriers or fence operations.
//
// TODO:
//
// *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
//    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
//    Given TLAB allocation, Self is usually manifested in a register, so passing it into
//    the lock operators would typically be faster than reifying Self.
//
// *  Ideally I'd define the primitives as:
//       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
//       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
//    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with the rather awkward and brittle register assignments below.
//    Furthermore the register assignments are overconstrained, possibly resulting in
//    sub-optimal code near the synchronization site.
//
// *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
//    Alternately, use a better sp-proximity test.
//
// *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
//    Either one is sufficient to uniquely identify a thread.
//    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
//
// *  Intrinsify notify() and notifyAll() for the common cases where the
//    object is locked by the calling thread but the waitlist is empty.
//    Avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
//
// *  Use jccb and jmpb instead of jcc and jmp to improve code density.
//    But beware of excessive branch density on AMD Opterons.
//
// *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
//    or failure of the fast path. If the fast path fails then we pass
//    control to the slow path, typically in C. In fast_lock and
//    fast_unlock we often branch to DONE_LABEL, just to find that C2
//    will emit a conditional branch immediately after the node.
//    So we have branches to branches and lots of ICC.ZF games.
//    Instead, it might be better to have C2 pass a "FailureLabel"
//    into fast_lock and fast_unlock. In the case of success, control
//    will drop through the node. ICC.ZF is undefined at exit.
//    In the case of failure, the node will branch directly to the
//    FailureLabel.


// obj: object to lock
// box: on-stack box address -- KILLED
// rax: tmp -- KILLED
// t  : tmp -- KILLED
void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register rax_reg,
                                              Register t, Register thread) {
  assert(rax_reg == rax, "Used for CAS");
  assert_different_registers(obj, box, rax_reg, t, thread);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. ZF value is irrelevant.
  Label locked;
  // Finish fast lock unsuccessfully. MUST jump with ZF == 0
  Label slow_path;

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds or we need to take the slow-path.
    movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0);
  }

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(rax_reg, obj, t);
    testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
    jcc(Assembler::notZero, slow_path);
  }

  const Register mark = t;

  { // Lightweight Lock

    Label push;

    const Register top = UseObjectMonitorTable ? rax_reg : box;

    // Load the mark.
    movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Prefetch top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    // Check for monitor (0b10).
    testptr(mark, markWord::monitor_value);
    jcc(Assembler::notZero, inflated);

    // Check if lock-stack is full.
    cmpl(top, LockStack::end_offset() - 1);
    jcc(Assembler::greater, slow_path);

    // Check if recursive.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    jccb(Assembler::equal, push);

    // Try to lock. Transition lock bits 0b01 => 0b00
    movptr(rax_reg, mark);
    orptr(rax_reg, markWord::unlocked_value);
    andptr(mark, ~(int32_t)markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    jcc(Assembler::notEqual, slow_path);

    if (UseObjectMonitorTable) {
      // Need to reload top, clobbered by CAS.
      movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
    }
    bind(push);
    // After successful lock, push object on lock-stack.
    movptr(Address(thread, top), obj);
    addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
    jmpb(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    const Register monitor = t;

    if (!UseObjectMonitorTable) {
      assert(mark == monitor, "should be the same here");
    } else {
      // Uses ObjectMonitorTable. Look for the monitor in the om_cache.
      // Fetch ObjectMonitor* from the cache or take the slow-path.
      Label monitor_found;

      // Load cache address
      lea(t, Address(thread, JavaThread::om_cache_oops_offset()));

      const int num_unrolled = 2;
      for (int i = 0; i < num_unrolled; i++) {
        cmpptr(obj, Address(t));
        jccb(Assembler::equal, monitor_found);
        increment(t, in_bytes(OMCache::oop_to_oop_difference()));
      }

      Label loop;

      // Search for obj in cache.
      bind(loop);

      // Check for match.
      cmpptr(obj, Address(t));
      jccb(Assembler::equal, monitor_found);

      // Search until null encountered, guaranteed _null_sentinel at end.
      cmpptr(Address(t), 1);
      jcc(Assembler::below, slow_path); // 0 check, but with ZF=0 when *t == 0
      increment(t, in_bytes(OMCache::oop_to_oop_difference()));
      jmpb(loop);

      // Cache hit.
      bind(monitor_found);
      movptr(monitor, Address(t, OMCache::oop_to_monitor_difference()));
    }
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag);
    const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag);

    Label monitor_locked;
    // Lock the monitor.

    if (UseObjectMonitorTable) {
      // Cache the monitor for unlock before trashing box. On failure to acquire
      // the lock, the slow path will reset the entry accordingly (see CacheSetter).
      movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor);
    }

    // Try to CAS owner (no owner => current thread's _monitor_owner_id).
    xorptr(rax_reg, rax_reg);
    movptr(box, Address(thread, JavaThread::monitor_owner_id_offset()));
    lock(); cmpxchgptr(box, owner_address);
    jccb(Assembler::equal, monitor_locked);

    // Check if recursive.
    cmpptr(box, rax_reg);
    jccb(Assembler::notEqual, slow_path);

    // Recursive.
    increment(recursions_address);

    bind(monitor_locked);
  }

  bind(locked);
  // Set ZF = 1
  xorl(rax_reg, rax_reg);

#ifdef ASSERT
  // Check that locked label is reached with ZF set.
  Label zf_correct;
  Label zf_bad_zero;
  jcc(Assembler::zero, zf_correct);
  jmp(zf_bad_zero);
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with ZF not set.
  jcc(Assembler::notZero, zf_correct);
  stop("Fast Lock ZF != 0");
  bind(zf_bad_zero);
  stop("Fast Lock ZF != 1");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}

// obj: object to unlock
// rax: tmp -- KILLED
// t  : tmp - cannot be obj nor rax -- KILLED
//
// Some commentary on balanced locking:
//
// fast_lock and fast_unlock are emitted only for provably balanced lock sites.
// Methods that don't have provably balanced locking are forced to run in the
// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
// The interpreter provides two properties:
// I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired in the current activation (frame). Recall that the
//      interpreter maintains an on-stack list of locks currently held by
//      a frame.
// I2:  If a method attempts to unlock an object that is not held by the
//      frame the interpreter throws IMSX.
//
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
// B() doesn't have provably balanced locking so it runs in the interpreter.
// Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
// is still locked by A().
//
// The only other source of unbalanced locking would be JNI. The "Java Native Interface
// Specification" states that an object locked by JNI's MonitorEnter should not be
// unlocked by "normal" java-level locking and vice-versa. The specification doesn't
// specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably given that the spec legislates the JNI case as undefined our implementation
// could reasonably *avoid* checking owner in fast_unlock().
// In the interest of performance we elide the m->Owner == Self check in unlock.
// A perfectly viable alternative is to elide the owner check except when
// Xcheck:jni is enabled.
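//
// Editor's note (illustrative only, not emitted code): fast_lock_lightweight above
// and fast_unlock_lightweight below implement the lightweight-locking fast paths
// (per-thread lock-stack plus mark-word lock bits). A rough sketch of the logic,
// with hypothetical helper names used purely for exposition:
//
//   // fast lock:
//   if (mark & monitor_value)   goto inflated;       // object already has an ObjectMonitor
//   if (lock_stack.is_full())   goto slow_path;
//   if (lock_stack.top() != obj) {                   // not a recursive acquire
//     if (!CAS(obj->mark, mark | unlocked_value, mark & ~unlocked_value)) goto slow_path;
//   }
//   lock_stack.push(obj);                            // recursive case pushes obj again
//
//   // fast unlock (mirror image):
//   if (lock_stack.top() != obj)  goto inflated;
//   lock_stack.pop();
//   if (lock_stack.top() == obj)  goto unlocked;     // recursive release
//   if (!CAS(obj->mark, mark & ~lock_mask, mark | unlocked_value)) {
//     lock_stack.push(obj); goto slow_path;          // re-push and take the slow path
//   }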
void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register reg_rax, Register t, Register thread) {
  assert(reg_rax == rax, "Used for CAS");
  assert_different_registers(obj, reg_rax, t);

  // Handle inflated monitor.
  Label inflated, inflated_check_lock_stack;
  // Finish fast unlock successfully. MUST jump with ZF == 1
  Label unlocked, slow_path;

  const Register mark = t;
  const Register monitor = t;
  const Register top = UseObjectMonitorTable ? t : reg_rax;
  const Register box = reg_rax;

  Label dummy;
  C2FastUnlockLightweightStub* stub = nullptr;

  if (!Compile::current()->output()->in_scratch_emit_size()) {
    stub = new (Compile::current()->comp_arena()) C2FastUnlockLightweightStub(obj, mark, reg_rax, thread);
    Compile::current()->output()->add_stub(stub);
  }

  Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();

  { // Lightweight Unlock

    // Load top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    if (!UseObjectMonitorTable) {
      // Prefetch mark.
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }

    // Check if obj is top of lock-stack.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    // Top of lock stack was not obj. Must be monitor.
    jcc(Assembler::notEqual, inflated_check_lock_stack);

    // Pop lock-stack.
    DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
    subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);

    // Check if recursive.
    cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
    jcc(Assembler::equal, unlocked);

    // We elide the monitor check, let the CAS fail instead.

    if (UseObjectMonitorTable) {
      // Load mark.
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }

    // Try to unlock. Transition lock bits 0b00 => 0b01
    movptr(reg_rax, mark);
    andptr(reg_rax, ~(int32_t)markWord::lock_mask);
    orptr(mark, markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    jcc(Assembler::notEqual, push_and_slow_path);
    jmp(unlocked);
  }


  { // Handle inflated monitor.
    bind(inflated_check_lock_stack);
#ifdef ASSERT
    Label check_done;
    subl(top, oopSize);
    cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
    jcc(Assembler::below, check_done);
    cmpptr(obj, Address(thread, top));
    jccb(Assembler::notEqual, inflated_check_lock_stack);
    stop("Fast Unlock lock on stack");
    bind(check_done);
    if (UseObjectMonitorTable) {
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }
    testptr(mark, markWord::monitor_value);
    jccb(Assembler::notZero, inflated);
    stop("Fast Unlock not monitor");
#endif

    bind(inflated);

    if (!UseObjectMonitorTable) {
      assert(mark == monitor, "should be the same here");
    } else {
      // Uses ObjectMonitorTable. Look for the monitor in our BasicLock on the stack.
      movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
      // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*)
      cmpptr(monitor, alignof(ObjectMonitor*));
      jcc(Assembler::below, slow_path);
    }
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag};
    const Address succ_address{monitor, ObjectMonitor::succ_offset() - monitor_tag};
    const Address entry_list_address{monitor, ObjectMonitor::entry_list_offset() - monitor_tag};
    const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag};

    Label recursive;

    // Check if recursive.
    cmpptr(recursions_address, 0);
    jccb(Assembler::notZero, recursive);

    // Set owner to null.
    // Release to satisfy the JMM
    movptr(owner_address, NULL_WORD);
    // We need a full fence after clearing owner to avoid stranding.
    // StoreLoad achieves this.
    membar(StoreLoad);

    // Check if the entry_list is empty.
    cmpptr(entry_list_address, NULL_WORD);
    jccb(Assembler::zero, unlocked);    // If so we are done.

    // Check if there is a successor.
    cmpptr(succ_address, NULL_WORD);
    jccb(Assembler::notZero, unlocked); // If so we are done.

    // Save the monitor pointer in the current thread, so we can try to
    // reacquire the lock in SharedRuntime::monitor_exit_helper().
    if (!UseObjectMonitorTable) {
      andptr(monitor, ~(int32_t)markWord::monitor_value);
    }
    movptr(Address(thread, JavaThread::unlocked_inflated_monitor_offset()), monitor);

    orl(t, 1); // Fast Unlock ZF = 0
    jmpb(slow_path);

    // Recursive unlock.
    bind(recursive);
    decrement(recursions_address);
  }

  bind(unlocked);
  xorl(t, t); // Fast Unlock ZF = 1

#ifdef ASSERT
  // Check that unlocked label is reached with ZF set.
  Label zf_correct;
  Label zf_bad_zero;
  jcc(Assembler::zero, zf_correct);
  jmp(zf_bad_zero);
#endif

  bind(slow_path);
  if (stub != nullptr) {
    bind(stub->slow_path_continuation());
  }
#ifdef ASSERT
  // Check that stub->continuation() label is reached with ZF not set.
  jcc(Assembler::notZero, zf_correct);
  stop("Fast Unlock ZF != 0");
  bind(zf_bad_zero);
  stop("Fast Unlock ZF != 1");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}

static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) {
  fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi);
}

static void reconstruct_frame_pointer_helper(MacroAssembler* masm, Register dst) {
  const int framesize = Compile::current()->output()->frame_size_in_bytes();
  masm->movptr(dst, rsp);
  if (framesize > 2 * wordSize) {
    masm->addptr(dst, framesize - 2 * wordSize);
  }
}
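// Editor's note (illustrative): assuming the standard C2 frame layout on x86_64,
// where frame_size_in_bytes covers both the return address and the saved rbp, the
// helper above recomputes the frame pointer purely from rsp:
//
//   rbp = rsp + frame_size_in_bytes - 2 * wordSize
//
// i.e. the value rbp would hold after the usual "push(rbp); mov(rbp, rsp);
// subptr(rsp, framesize)" prolog sequence emitted by verified_entry.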
void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) {
  if (PreserveFramePointer) {
    // frame pointer is valid
#ifdef ASSERT
    // Verify frame pointer value in rbp.
    reconstruct_frame_pointer_helper(this, rtmp);
    Label L_success;
    cmpq(rbp, rtmp);
    jccb(Assembler::equal, L_success);
    STOP("frame pointer mismatch");
    bind(L_success);
#endif // ASSERT
  } else {
    reconstruct_frame_pointer_helper(this, rbp);
  }
}

void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register val) {
  jint lo = t->_lo;
  jint hi = t->_hi;
  assert(lo < hi, "type should not be empty or constant, idx: %u, lo: %d, hi: %d", idx, lo, hi);
  if (t == TypeInt::INT) {
    return;
  }

  BLOCK_COMMENT("CastII {");
  Label fail;
  Label succeed;
  if (hi == max_jint) {
    cmpl(val, lo);
    jccb(Assembler::greaterEqual, succeed);
  } else {
    if (lo != min_jint) {
      cmpl(val, lo);
      jccb(Assembler::less, fail);
    }
    cmpl(val, hi);
    jccb(Assembler::lessEqual, succeed);
  }

  bind(fail);
  movl(c_rarg0, idx);
  movl(c_rarg1, val);
  movl(c_rarg2, lo);
  movl(c_rarg3, hi);
  reconstruct_frame_pointer(rscratch1);
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_int_in_range)));
  hlt();
  bind(succeed);
  BLOCK_COMMENT("} // CastII");
}

static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) {
  fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi);
}

void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register val, Register tmp) {
  jlong lo = t->_lo;
  jlong hi = t->_hi;
  assert(lo < hi, "type should not be empty or constant, idx: %u, lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, lo, hi);
  if (t == TypeLong::LONG) {
    return;
  }

  BLOCK_COMMENT("CastLL {");
  Label fail;
  Label succeed;

  auto cmp_val = [&](jlong bound) {
    if (is_simm32(bound)) {
      cmpq(val, checked_cast<int>(bound));
    } else {
      mov64(tmp, bound);
      cmpq(val, tmp);
    }
  };

  if (hi == max_jlong) {
    cmp_val(lo);
    jccb(Assembler::greaterEqual, succeed);
  } else {
    if (lo != min_jlong) {
      cmp_val(lo);
      jccb(Assembler::less, fail);
    }
    cmp_val(hi);
    jccb(Assembler::lessEqual, succeed);
  }

  bind(fail);
  movl(c_rarg0, idx);
  movq(c_rarg1, val);
  mov64(c_rarg2, lo);
  mov64(c_rarg3, hi);
  reconstruct_frame_pointer(rscratch1);
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_long_in_range)));
  hlt();
  bind(succeed);
  BLOCK_COMMENT("} // CastLL");
}

//-------------------------------------------------------------------------------------------
// Generic instructions support for use in .ad files C2 code generation

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVD) {
    andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
  } else {
    assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
  }
}

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
  if (opcode == Op_AbsVD) {
    vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
  } else {
    assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
    vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
  }
}
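// Editor's note (illustrative): abs/neg of IEEE-754 values above is pure bit
// manipulation: AND with a mask that clears the sign bit yields |x|, XOR with a
// mask that has only the sign bit set flips the sign. A scalar sketch of the same
// idea, shown for doubles only:
//
//   uint64_t bits;                      // bit image of the double
//   bits &= 0x7fffffffffffffffULL;      // abs: clear the sign bit
//   bits ^= 0x8000000000000000ULL;      // neg: flip the sign bit
//
// vector_double_sign_mask / vector_double_sign_flip hold those constants
// replicated across every lane, so andpd/xorpd apply the trick lane-wise.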
void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVF) {
    andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
  } else {
    assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
  if (opcode == Op_AbsVF) {
    vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
  } else {
    assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
    vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
  }
}

void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
  assert(tmp == xnoreg || elem_bt == T_LONG, "unused");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      pminsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pminsw(dst, src);
    } else if (elem_bt == T_INT) {
      pminsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, dst);
      pcmpgtq(xmm0, src);
      blendvpd(dst, src); // xmm0 as mask
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      pmaxsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pmaxsw(dst, src);
    } else if (elem_bt == T_INT) {
      pmaxsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, src);
      pcmpgtq(xmm0, dst);
      blendvpd(dst, src); // xmm0 as mask
    }
  }
}

void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
                                  XMMRegister src1, Address src2, int vlen_enc) {
  assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
  if (opcode == Op_UMinV) {
    switch(elem_bt) {
      case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
      case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
      case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
      case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
      default: fatal("Unsupported type %s", type2name(elem_bt)); break;
    }
  } else {
    assert(opcode == Op_UMaxV, "required");
    switch(elem_bt) {
      case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
      case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
      case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
      case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
      default: fatal("Unsupported type %s", type2name(elem_bt)); break;
    }
  }
}
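// Editor's note (illustrative): SSE/AVX2 have no unsigned 64-bit vector compare,
// so the fallback in vpuminmaxq below maps the unsigned order onto the signed
// order by biasing both operands with 2^63 before the signed vpcmpgtq. A scalar
// sketch of the same trick:
//
//   // a <u b   <==>   (int64_t)(a + 0x8000000000000000ULL) < (int64_t)(b + 0x8000000000000000ULL)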
void C2_MacroAssembler::vpuminmaxq(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  // For optimality, leverage a full vector width of 512 bits
  // for operations over smaller vector sizes on AVX512 targets.
  if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
    if (opcode == Op_UMaxV) {
      evpmaxuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
    } else {
      assert(opcode == Op_UMinV, "required");
      evpminuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
    }
  } else {
    // T1 = -1
    vpcmpeqq(xtmp1, xtmp1, xtmp1, vlen_enc);
    // T1 = -1 << 63
    vpsllq(xtmp1, xtmp1, 63, vlen_enc);
    // Convert SRC2 to signed value i.e. T2 = T1 + SRC2
    vpaddq(xtmp2, xtmp1, src2, vlen_enc);
    // Convert SRC1 to signed value i.e. T1 = T1 + SRC1
    vpaddq(xtmp1, xtmp1, src1, vlen_enc);
    // Mask = T2 > T1
    vpcmpgtq(xtmp1, xtmp2, xtmp1, vlen_enc);
    if (opcode == Op_UMaxV) {
      // Res = Mask ? Src2 : Src1
      vpblendvb(dst, src1, src2, xtmp1, vlen_enc);
    } else {
      // Res = Mask ? Src1 : Src2
      vpblendvb(dst, src2, src1, xtmp1, vlen_enc);
    }
  }
}

void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
                                  XMMRegister src1, XMMRegister src2, int vlen_enc) {
  assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
  if (opcode == Op_UMinV) {
    switch(elem_bt) {
      case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
      case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
      case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
      case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
      default: fatal("Unsupported type %s", type2name(elem_bt)); break;
    }
  } else {
    assert(opcode == Op_UMaxV, "required");
    switch(elem_bt) {
      case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
      case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
      case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
      case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
      default: fatal("Unsupported type %s", type2name(elem_bt)); break;
    }
  }
}

void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
                                 XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                 int vlen_enc) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      vpminsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpminsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpminsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpminsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src1, src2, dst, vlen_enc);
      }
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      vpmaxsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpmaxsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpmaxsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpmaxsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src2, src1, dst, vlen_enc);
      }
    }
  }
}

// Float/Double min max

void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
                                   XMMRegister dst, XMMRegister a, XMMRegister b,
                                   XMMRegister tmp,
                                   XMMRegister atmp, XMMRegister btmp,
                                   int vlen_enc) {
  assert(UseAVX > 0, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(a, tmp, atmp, btmp);
  assert_different_registers(b, tmp, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);

  /* Note on 'non-obvious' assembly sequence:
   *
   * While there are vminps/vmaxps instructions, there are two important differences between hardware
   * and Java on how they handle floats:
   *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
   *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
   *
   * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
   *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
   *     (only useful when signs differ, noop otherwise)
   *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
   *
   * Following pseudo code describes the algorithm for max[FD] (Min algorithm is on similar lines):
   *   btmp = (b < +0.0) ? a : b
   *   atmp = (b < +0.0) ? b : a
   *   Tmp  = Max_Float(atmp, btmp)
   *   Res  = (atmp == NaN) ? atmp : Tmp
   */

  void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
  void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
  void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
  XMMRegister mask;

  if (!is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vminps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (!is_double_word && !is_min) {
    mask = b;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vmaxps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vminpd;
    vcmp = &MacroAssembler::vcmppd;
  } else {
    assert(is_double_word && !is_min, "sanity");
    mask = b;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vmaxpd;
    vcmp = &MacroAssembler::vcmppd;
  }

  // Make sure EnableX86ECoreOpts isn't disabled on register overlaps
  XMMRegister maxmin, scratch;
  if (dst == btmp) {
    maxmin = btmp;
    scratch = tmp;
  } else {
    maxmin = tmp;
    scratch = btmp;
  }

  bool precompute_mask = EnableX86ECoreOpts && UseAVX > 1;
  if (precompute_mask && !is_double_word) {
    vpsrad(tmp, mask, 32, vlen_enc);
    mask = tmp;
  } else if (precompute_mask && is_double_word) {
    vpxor(tmp, tmp, tmp, vlen_enc);
    vpcmpgtq(tmp, tmp, mask, vlen_enc);
    mask = tmp;
  }

  (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
  (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
  (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
  (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
  (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
}
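// Editor's note (illustrative): the AVX-512 variant below follows the same
// algorithm as the pseudo code above, but keeps the blend mask in a k-register:
// evpmovd2m/evpmovq2m copy each lane's sign bit into the mask, which then drives
// the per-lane selects. Roughly, per reduction step:
//
//   ktmp       = per-lane sign bit of the biased operand           // evpmovd2m / evpmovq2m
//   atmp, btmp = a and b, swapped in the lanes selected by ktmp    // evblendmps / evblendmpd
//   dst        = vminps/vmaxps(atmp, btmp)
//   ktmp       = lanes where atmp is NaN                           // evcmpps/evcmppd UNORD_Q
//   dst        = atmp in those lanes, dst elsewhere                // evmovdqul/evmovdquq, merge masking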
void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
                                    XMMRegister dst, XMMRegister a, XMMRegister b,
                                    KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
                                    int vlen_enc) {
  assert(UseAVX > 2, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(dst, a, atmp, btmp);
  assert_different_registers(dst, b, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);
  bool merge = true;

  if (!is_double_word && is_min) {
    evpmovd2m(ktmp, a, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vminps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (!is_double_word && !is_min) {
    evpmovd2m(ktmp, b, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (is_double_word && is_min) {
    evpmovq2m(ktmp, a, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vminpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    evpmovq2m(ktmp, b, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  }
}

void C2_MacroAssembler::vminmax_fp(int opc, BasicType elem_bt, XMMRegister dst, KRegister mask,
                                   XMMRegister src1, XMMRegister src2, int vlen_enc) {
  assert(opc == Op_MinV || opc == Op_MinReductionV ||
         opc == Op_MaxV || opc == Op_MaxReductionV, "sanity");

  int imm8 = (opc == Op_MinV || opc == Op_MinReductionV) ?
             AVX10_MINMAX_MIN_COMPARE_SIGN : AVX10_MINMAX_MAX_COMPARE_SIGN;
  if (elem_bt == T_FLOAT) {
    evminmaxps(dst, mask, src1, src2, true, imm8, vlen_enc);
  } else {
    assert(elem_bt == T_DOUBLE, "");
    evminmaxpd(dst, mask, src1, src2, true, imm8, vlen_enc);
  }
}

// Float/Double signum
void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
  assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");

  Label DONE_LABEL;

  if (opcode == Op_SignumF) {
    ucomiss(dst, zero);
    jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
    jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
    movflt(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
  } else if (opcode == Op_SignumD) {
    ucomisd(dst, zero);
    jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
    jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
    movdbl(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
  }

  bind(DONE_LABEL);
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
  if (sign) {
    pmovsxbw(dst, src);
  } else {
    pmovzxbw(dst, src);
  }
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbw(dst, src, vector_len);
  } else {
    vpmovzxbw(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbd(dst, src, vector_len);
  } else {
    vpmovzxbd(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxwd(dst, src, vector_len);
  } else {
    vpmovzxwd(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
                                     int shift, int vector_len) {
  if (opcode == Op_RotateLeftV) {
    if (etype == T_INT) {
      evprold(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprolq(dst, src, shift, vector_len);
    }
  } else {
    assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
    if (etype == T_INT) {
      evprord(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprorq(dst, src, shift, vector_len);
    }
  }
}

void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
                                     XMMRegister shift, int vector_len) {
  if (opcode == Op_RotateLeftV) {
    if (etype == T_INT) {
      evprolvd(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprolvq(dst, src, shift, vector_len);
    }
  } else {
    assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
    if (etype == T_INT) {
      evprorvd(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
T_LONG"); 1138 evprorvq(dst, src, shift, vector_len); 1139 } 1140 } 1141 } 1142 1143 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) { 1144 if (opcode == Op_RShiftVI) { 1145 psrad(dst, shift); 1146 } else if (opcode == Op_LShiftVI) { 1147 pslld(dst, shift); 1148 } else { 1149 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI"); 1150 psrld(dst, shift); 1151 } 1152 } 1153 1154 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) { 1155 switch (opcode) { 1156 case Op_RShiftVI: psrad(dst, shift); break; 1157 case Op_LShiftVI: pslld(dst, shift); break; 1158 case Op_URShiftVI: psrld(dst, shift); break; 1159 1160 default: assert(false, "%s", NodeClassNames[opcode]); 1161 } 1162 } 1163 1164 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 1165 if (opcode == Op_RShiftVI) { 1166 vpsrad(dst, nds, shift, vector_len); 1167 } else if (opcode == Op_LShiftVI) { 1168 vpslld(dst, nds, shift, vector_len); 1169 } else { 1170 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI"); 1171 vpsrld(dst, nds, shift, vector_len); 1172 } 1173 } 1174 1175 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1176 switch (opcode) { 1177 case Op_RShiftVI: vpsrad(dst, src, shift, vlen_enc); break; 1178 case Op_LShiftVI: vpslld(dst, src, shift, vlen_enc); break; 1179 case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break; 1180 1181 default: assert(false, "%s", NodeClassNames[opcode]); 1182 } 1183 } 1184 1185 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) { 1186 switch (opcode) { 1187 case Op_RShiftVB: // fall-through 1188 case Op_RShiftVS: psraw(dst, shift); break; 1189 1190 case Op_LShiftVB: // fall-through 1191 case Op_LShiftVS: psllw(dst, shift); break; 1192 1193 case Op_URShiftVS: // fall-through 1194 case Op_URShiftVB: psrlw(dst, shift); break; 1195 1196 default: assert(false, "%s", NodeClassNames[opcode]); 1197 } 1198 } 1199 1200 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1201 switch (opcode) { 1202 case Op_RShiftVB: // fall-through 1203 case Op_RShiftVS: vpsraw(dst, src, shift, vlen_enc); break; 1204 1205 case Op_LShiftVB: // fall-through 1206 case Op_LShiftVS: vpsllw(dst, src, shift, vlen_enc); break; 1207 1208 case Op_URShiftVS: // fall-through 1209 case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break; 1210 1211 default: assert(false, "%s", NodeClassNames[opcode]); 1212 } 1213 } 1214 1215 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) { 1216 switch (opcode) { 1217 case Op_RShiftVL: psrlq(dst, shift); break; // using srl to implement sra on pre-avs512 systems 1218 case Op_LShiftVL: psllq(dst, shift); break; 1219 case Op_URShiftVL: psrlq(dst, shift); break; 1220 1221 default: assert(false, "%s", NodeClassNames[opcode]); 1222 } 1223 } 1224 1225 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) { 1226 if (opcode == Op_RShiftVL) { 1227 psrlq(dst, shift); // using srl to implement sra on pre-avs512 systems 1228 } else if (opcode == Op_LShiftVL) { 1229 psllq(dst, shift); 1230 } else { 1231 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL"); 1232 psrlq(dst, shift); 1233 } 1234 } 1235 1236 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1237 switch (opcode) { 1238 case 
    case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
    case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
  if (opcode == Op_RShiftVL) {
    evpsraq(dst, nds, shift, vector_len);
  } else if (opcode == Op_LShiftVL) {
    vpsllq(dst, nds, shift, vector_len);
  } else {
    assert((opcode == Op_URShiftVL), "opcode should be Op_URShiftVL");
    vpsrlq(dst, nds, shift, vector_len);
  }
}

void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  // fall-through
    case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  // fall-through
    case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;

    case Op_URShiftVB: // fall-through
    case Op_URShiftVS: // fall-through
    case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;

    case Op_URShiftVB: // fall-through
    case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
  assert(UseAVX >= 2, "required");
  switch (opcode) {
    case Op_RShiftVL: {
      if (UseAVX > 2) {
        assert(tmp == xnoreg, "not used");
        if (!VM_Version::supports_avx512vl()) {
          vlen_enc = Assembler::AVX_512bit;
        }
        evpsravq(dst, src, shift, vlen_enc);
      } else {
        vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
        vpsrlvq(dst, src, shift, vlen_enc);
        vpsrlvq(tmp, tmp, shift, vlen_enc);
        vpxor(dst, dst, tmp, vlen_enc);
        vpsubq(dst, dst, tmp, vlen_enc);
      }
      break;
    }
    case Op_LShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsllvq(dst, src, shift, vlen_enc);
      break;
    }
    case Op_URShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsrlvq(dst, src, shift, vlen_enc);
      break;
    }
    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

// Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst
void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
  assert(opcode == Op_LShiftVB ||
         opcode == Op_RShiftVB ||
         opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
  bool sign = (opcode != Op_URShiftVB);
  assert(vector_len == 0, "required");
  vextendbd(sign, dst, src, 1);
  vpmovzxbd(vtmp, shift, 1);
  varshiftd(opcode, dst, dst, vtmp, 1);
  vpand(dst, dst,
        ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
  vextracti128_high(vtmp, dst);
  vpackusdw(dst, dst, vtmp, 0);
}

// Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst
void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
  assert(opcode == Op_LShiftVB ||
         opcode == Op_RShiftVB ||
         opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
  bool sign = (opcode != Op_URShiftVB);
  int ext_vector_len = vector_len + 1;
  vextendbw(sign, dst, src, ext_vector_len);
  vpmovzxbw(vtmp, shift, ext_vector_len);
  varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
  if (vector_len == 0) {
    vextracti128_high(vtmp, dst);
    vpackuswb(dst, dst, vtmp, vector_len);
  } else {
    vextracti64x4_high(vtmp, dst);
    vpackuswb(dst, dst, vtmp, vector_len);
    vpermq(dst, dst, 0xD8, vector_len);
  }
}

void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
  switch(typ) {
    case T_BYTE:
      pinsrb(dst, val, idx);
      break;
    case T_SHORT:
      pinsrw(dst, val, idx);
      break;
    case T_INT:
      pinsrd(dst, val, idx);
      break;
    case T_LONG:
      pinsrq(dst, val, idx);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}

void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
  switch(typ) {
    case T_BYTE:
      vpinsrb(dst, src, val, idx);
      break;
    case T_SHORT:
      vpinsrw(dst, src, val, idx);
      break;
    case T_INT:
      vpinsrd(dst, src, val, idx);
      break;
    case T_LONG:
      vpinsrq(dst, src, val, idx);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}

void C2_MacroAssembler::vgather8b_masked(BasicType elem_bt, XMMRegister dst,
                                         Register base, Register idx_base,
                                         Register mask, Register mask_idx,
                                         Register rtmp, int vlen_enc) {
  vpxor(dst, dst, dst, vlen_enc);
  if (elem_bt == T_SHORT) {
    for (int i = 0; i < 4; i++) {
      // dst[i] = mask[i] ? src[idx_base[i]] : 0
      Label skip_load;
      btq(mask, mask_idx);
      jccb(Assembler::carryClear, skip_load);
      movl(rtmp, Address(idx_base, i * 4));
      pinsrw(dst, Address(base, rtmp, Address::times_2), i);
      bind(skip_load);
      incq(mask_idx);
    }
  } else {
    assert(elem_bt == T_BYTE, "");
    for (int i = 0; i < 8; i++) {
      // dst[i] = mask[i] ? src[idx_base[i]] : 0
      Label skip_load;
      btq(mask, mask_idx);
      jccb(Assembler::carryClear, skip_load);
      movl(rtmp, Address(idx_base, i * 4));
      pinsrb(dst, Address(base, rtmp), i);
      bind(skip_load);
      incq(mask_idx);
    }
  }
}

void C2_MacroAssembler::vgather8b(BasicType elem_bt, XMMRegister dst,
                                  Register base, Register idx_base,
                                  Register rtmp, int vlen_enc) {
  vpxor(dst, dst, dst, vlen_enc);
  if (elem_bt == T_SHORT) {
    for (int i = 0; i < 4; i++) {
      // dst[i] = src[idx_base[i]]
      movl(rtmp, Address(idx_base, i * 4));
      pinsrw(dst, Address(base, rtmp, Address::times_2), i);
    }
  } else {
    assert(elem_bt == T_BYTE, "");
    for (int i = 0; i < 8; i++) {
      // dst[i] = src[idx_base[i]]
      movl(rtmp, Address(idx_base, i * 4));
      pinsrb(dst, Address(base, rtmp), i);
    }
  }
}

/*
 * Gather using hybrid algorithm, first partially unroll scalar loop
 * to accumulate values from gather indices into a quad-word (64 bit) slice.
 * A slice may hold 8 bytes or 4 short values. This is followed by a vector
 * permutation to place the slice into appropriate vector lane
 * locations in destination vector. Following pseudo code describes the
 * algorithm in detail:
 *
 * DST_VEC = ZERO_VEC
 * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..}
 * TWO_VEC    = {2, 2, 2, 2, 2, 2, 2, 2, 2..}
 * FOREACH_ITER:
 *   TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES
 *   TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX
 *   DST_VEC = DST_VEC OR TEMP_PERM_VEC
 *   PERM_INDEX = PERM_INDEX - TWO_VEC
 *
 * With each iteration, the doubleword permute indices (0, 1) corresponding
 * to the gathered quadword get right-shifted by two lane positions.
 *
 */
void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst,
                                        Register base, Register idx_base,
                                        Register mask, XMMRegister xtmp1,
                                        XMMRegister xtmp2, XMMRegister temp_dst,
                                        Register rtmp, Register mask_idx,
                                        Register length, int vector_len, int vlen_enc) {
  Label GATHER8_LOOP;
  assert(is_subword_type(elem_ty), "");
  movl(length, vector_len);
  vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...}
  vpxor(dst, dst, dst, vlen_enc);       // dst = {0, ...}
  vallones(xtmp2, vlen_enc);
  vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc);
  vpslld(xtmp2, xtmp2, 1, vlen_enc);    // xtmp2 = {2, 2, ...}
  load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...}

  bind(GATHER8_LOOP);
    // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES
    if (mask == noreg) {
      vgather8b(elem_ty, temp_dst, base, idx_base, rtmp, vlen_enc);
    } else {
      vgather8b_masked(elem_ty, temp_dst, base, idx_base, mask, mask_idx, rtmp, vlen_enc);
    }
    // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1)
    vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ?
                                      vlen_enc : Assembler::AVX_256bit);
    // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2)
    vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc);
    // DST_VEC = DST_VEC OR TEMP_PERM_VEC
    vpor(dst, dst, temp_dst, vlen_enc);
    addptr(idx_base, 32 >> (type2aelembytes(elem_ty) - 1));
    subl(length, 8 >> (type2aelembytes(elem_ty) - 1));
    jcc(Assembler::notEqual, GATHER8_LOOP);
}

void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
  switch(typ) {
    case T_INT:
      vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
      break;
    case T_FLOAT:
      vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
      break;
    case T_LONG:
      vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
      break;
    case T_DOUBLE:
      vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}

void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
  switch(typ) {
    case T_INT:
      evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
      break;
    case T_FLOAT:
      evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
      break;
    case T_LONG:
      evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
      break;
    case T_DOUBLE:
      evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}

void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
  switch(typ) {
    case T_INT:
      evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
      break;
    case T_FLOAT:
      evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
      break;
    case T_LONG:
      evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
      break;
    case T_DOUBLE:
      evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}

void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
  if (vlen_in_bytes <= 16) {
    pxor (dst, dst);
    psubb(dst, src);
    switch (elem_bt) {
      case T_BYTE:   /* nothing to do */ break;
      case T_SHORT:  pmovsxbw(dst, dst); break;
      case T_INT:    pmovsxbd(dst, dst); break;
      case T_FLOAT:  pmovsxbd(dst, dst); break;
      case T_LONG:   pmovsxbq(dst, dst); break;
      case T_DOUBLE: pmovsxbq(dst, dst); break;

      default: assert(false, "%s", type2name(elem_bt));
    }
  } else {
    assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
    int vlen_enc = vector_length_encoding(vlen_in_bytes);

    vpxor (dst, dst, dst, vlen_enc);
    vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);

    switch (elem_bt) {
      case T_BYTE:   /* nothing to do */ break;
      case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
      case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
      case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
      case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
      case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;

      default: assert(false, "%s", type2name(elem_bt));
    }
  }
}
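// Editor's note (illustrative): a Java boolean vector arrives as one byte per
// lane holding 0 or 1. The conversion above turns that into the all-zeros /
// all-ones lane mask the blend instructions expect: 0 - x maps 1 to 0xFF and
// leaves 0 unchanged, and the sign-extending pmovsx widens 0xFF to a full-width
// -1 for wider element types. Roughly, per lane:
//
//   mask_lane = (element_type)(int8_t)(0 - bool_byte);   // 0 -> 0, 1 -> -1 (all bits set)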
AVX_256bit : vlen_enc); 1585 1586 switch (elem_bt) { 1587 case T_BYTE: /* nothing to do */ break; 1588 case T_SHORT: vpmovsxbw(dst, dst, vlen_enc); break; 1589 case T_INT: vpmovsxbd(dst, dst, vlen_enc); break; 1590 case T_FLOAT: vpmovsxbd(dst, dst, vlen_enc); break; 1591 case T_LONG: vpmovsxbq(dst, dst, vlen_enc); break; 1592 case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break; 1593 1594 default: assert(false, "%s", type2name(elem_bt)); 1595 } 1596 } 1597 } 1598 1599 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) { 1600 if (novlbwdq) { 1601 vpmovsxbd(xtmp, src, vlen_enc); 1602 evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()), 1603 Assembler::eq, true, vlen_enc, noreg); 1604 } else { 1605 vpxor(xtmp, xtmp, xtmp, vlen_enc); 1606 vpsubb(xtmp, xtmp, src, vlen_enc); 1607 evpmovb2m(dst, xtmp, vlen_enc); 1608 } 1609 } 1610 1611 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, Address src, int vlen_in_bytes) { 1612 if (is_integral_type(bt)) { 1613 switch (vlen_in_bytes) { 1614 case 4: movdl(dst, src); break; 1615 case 8: movq(dst, src); break; 1616 case 16: movdqu(dst, src); break; 1617 case 32: vmovdqu(dst, src); break; 1618 case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break; 1619 default: ShouldNotReachHere(); 1620 } 1621 } else { 1622 switch (vlen_in_bytes) { 1623 case 4: movflt(dst, src); break; 1624 case 8: movdbl(dst, src); break; 1625 case 16: movups(dst, src); break; 1626 case 32: vmovups(dst, src, Assembler::AVX_256bit); break; 1627 case 64: vmovups(dst, src, Assembler::AVX_512bit); break; 1628 default: ShouldNotReachHere(); 1629 } 1630 } 1631 } 1632 1633 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) { 1634 assert(rscratch != noreg || always_reachable(src), "missing"); 1635 1636 if (reachable(src)) { 1637 load_vector(bt, dst, as_Address(src), vlen_in_bytes); 1638 } else { 1639 lea(rscratch, src); 1640 load_vector(bt, dst, Address(rscratch, 0), vlen_in_bytes); 1641 } 1642 } 1643 1644 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) { 1645 int vlen_enc = vector_length_encoding(vlen); 1646 if (VM_Version::supports_avx()) { 1647 if (bt == T_LONG) { 1648 if (VM_Version::supports_avx2()) { 1649 vpbroadcastq(dst, src, vlen_enc); 1650 } else { 1651 vmovddup(dst, src, vlen_enc); 1652 } 1653 } else if (bt == T_DOUBLE) { 1654 if (vlen_enc != Assembler::AVX_128bit) { 1655 vbroadcastsd(dst, src, vlen_enc, noreg); 1656 } else { 1657 vmovddup(dst, src, vlen_enc); 1658 } 1659 } else { 1660 if (VM_Version::supports_avx2() && is_integral_type(bt)) { 1661 vpbroadcastd(dst, src, vlen_enc); 1662 } else { 1663 vbroadcastss(dst, src, vlen_enc); 1664 } 1665 } 1666 } else if (VM_Version::supports_sse3()) { 1667 movddup(dst, src); 1668 } else { 1669 load_vector(bt, dst, src, vlen); 1670 } 1671 } 1672 1673 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) { 1674 // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64. 1675 int offset = exact_log2(type2aelembytes(bt)) << 6; 1676 if (is_floating_point_type(bt)) { 1677 offset += 128; 1678 } 1679 ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset); 1680 load_vector(T_BYTE, dst, addr, vlen_in_bytes); 1681 } 1682 1683 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles. 
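//
// The reduce* entry points below fold every lane of a source vector into a
// single scalar result, combined with an incoming accumulator (src1 for the
// integral variants, dst for the FP variants). A minimal scalar sketch of the
// intended semantics, illustrative only and assuming an integer add reduction
// (the helper name below is hypothetical and not part of the VM):
//
//   int add_reduce_ref(int acc, const int* lane, int nlanes) {
//     for (int i = 0; i < nlanes; i++) {
//       acc += lane[i];
//     }
//     return acc;
//   }
//
// The integral variants reach the same result without a loop by repeatedly
// folding the upper half of the vector onto the lower half (log2(#lanes)
// shuffle-and-combine steps). The ordered FP variants (reduceF/reduceD)
// accumulate the lanes strictly in order with scalar adds/muls, while the
// unordered FP variants are free to re-associate and fold halves pairwise.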
1684 1685 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 1686 int vector_len = Assembler::AVX_128bit; 1687 1688 switch (opcode) { 1689 case Op_AndReductionV: pand(dst, src); break; 1690 case Op_OrReductionV: por (dst, src); break; 1691 case Op_XorReductionV: pxor(dst, src); break; 1692 case Op_MinReductionV: 1693 switch (typ) { 1694 case T_BYTE: pminsb(dst, src); break; 1695 case T_SHORT: pminsw(dst, src); break; 1696 case T_INT: pminsd(dst, src); break; 1697 case T_LONG: assert(UseAVX > 2, "required"); 1698 vpminsq(dst, dst, src, Assembler::AVX_128bit); break; 1699 default: assert(false, "wrong type"); 1700 } 1701 break; 1702 case Op_MaxReductionV: 1703 switch (typ) { 1704 case T_BYTE: pmaxsb(dst, src); break; 1705 case T_SHORT: pmaxsw(dst, src); break; 1706 case T_INT: pmaxsd(dst, src); break; 1707 case T_LONG: assert(UseAVX > 2, "required"); 1708 vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break; 1709 default: assert(false, "wrong type"); 1710 } 1711 break; 1712 case Op_AddReductionVF: addss(dst, src); break; 1713 case Op_AddReductionVD: addsd(dst, src); break; 1714 case Op_AddReductionVI: 1715 switch (typ) { 1716 case T_BYTE: paddb(dst, src); break; 1717 case T_SHORT: paddw(dst, src); break; 1718 case T_INT: paddd(dst, src); break; 1719 default: assert(false, "wrong type"); 1720 } 1721 break; 1722 case Op_AddReductionVL: paddq(dst, src); break; 1723 case Op_MulReductionVF: mulss(dst, src); break; 1724 case Op_MulReductionVD: mulsd(dst, src); break; 1725 case Op_MulReductionVI: 1726 switch (typ) { 1727 case T_SHORT: pmullw(dst, src); break; 1728 case T_INT: pmulld(dst, src); break; 1729 default: assert(false, "wrong type"); 1730 } 1731 break; 1732 case Op_MulReductionVL: assert(UseAVX > 2, "required"); 1733 evpmullq(dst, dst, src, vector_len); break; 1734 default: assert(false, "wrong opcode"); 1735 } 1736 } 1737 1738 void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 1739 switch (opcode) { 1740 case Op_AddReductionVF: addps(dst, src); break; 1741 case Op_AddReductionVD: addpd(dst, src); break; 1742 case Op_MulReductionVF: mulps(dst, src); break; 1743 case Op_MulReductionVD: mulpd(dst, src); break; 1744 default: assert(false, "%s", NodeClassNames[opcode]); 1745 } 1746 } 1747 1748 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 1749 int vector_len = Assembler::AVX_256bit; 1750 1751 switch (opcode) { 1752 case Op_AndReductionV: vpand(dst, src1, src2, vector_len); break; 1753 case Op_OrReductionV: vpor (dst, src1, src2, vector_len); break; 1754 case Op_XorReductionV: vpxor(dst, src1, src2, vector_len); break; 1755 case Op_MinReductionV: 1756 switch (typ) { 1757 case T_BYTE: vpminsb(dst, src1, src2, vector_len); break; 1758 case T_SHORT: vpminsw(dst, src1, src2, vector_len); break; 1759 case T_INT: vpminsd(dst, src1, src2, vector_len); break; 1760 case T_LONG: assert(UseAVX > 2, "required"); 1761 vpminsq(dst, src1, src2, vector_len); break; 1762 default: assert(false, "wrong type"); 1763 } 1764 break; 1765 case Op_MaxReductionV: 1766 switch (typ) { 1767 case T_BYTE: vpmaxsb(dst, src1, src2, vector_len); break; 1768 case T_SHORT: vpmaxsw(dst, src1, src2, vector_len); break; 1769 case T_INT: vpmaxsd(dst, src1, src2, vector_len); break; 1770 case T_LONG: assert(UseAVX > 2, "required"); 1771 vpmaxsq(dst, src1, src2, vector_len); break; 1772 default: assert(false, "wrong type"); 1773 } 
1774 break; 1775 case Op_AddReductionVI: 1776 switch (typ) { 1777 case T_BYTE: vpaddb(dst, src1, src2, vector_len); break; 1778 case T_SHORT: vpaddw(dst, src1, src2, vector_len); break; 1779 case T_INT: vpaddd(dst, src1, src2, vector_len); break; 1780 default: assert(false, "wrong type"); 1781 } 1782 break; 1783 case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break; 1784 case Op_MulReductionVI: 1785 switch (typ) { 1786 case T_SHORT: vpmullw(dst, src1, src2, vector_len); break; 1787 case T_INT: vpmulld(dst, src1, src2, vector_len); break; 1788 default: assert(false, "wrong type"); 1789 } 1790 break; 1791 case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break; 1792 default: assert(false, "wrong opcode"); 1793 } 1794 } 1795 1796 void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 1797 int vector_len = Assembler::AVX_256bit; 1798 1799 switch (opcode) { 1800 case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break; 1801 case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break; 1802 case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break; 1803 case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break; 1804 default: assert(false, "%s", NodeClassNames[opcode]); 1805 } 1806 } 1807 1808 void C2_MacroAssembler::reduce_fp(int opcode, int vlen, 1809 XMMRegister dst, XMMRegister src, 1810 XMMRegister vtmp1, XMMRegister vtmp2) { 1811 switch (opcode) { 1812 case Op_AddReductionVF: 1813 case Op_MulReductionVF: 1814 reduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 1815 break; 1816 1817 case Op_AddReductionVD: 1818 case Op_MulReductionVD: 1819 reduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 1820 break; 1821 1822 default: assert(false, "wrong opcode"); 1823 } 1824 } 1825 1826 void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen, 1827 XMMRegister dst, XMMRegister src, 1828 XMMRegister vtmp1, XMMRegister vtmp2) { 1829 switch (opcode) { 1830 case Op_AddReductionVF: 1831 case Op_MulReductionVF: 1832 unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 1833 break; 1834 1835 case Op_AddReductionVD: 1836 case Op_MulReductionVD: 1837 unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 1838 break; 1839 1840 default: assert(false, "%s", NodeClassNames[opcode]); 1841 } 1842 } 1843 1844 void C2_MacroAssembler::reduceB(int opcode, int vlen, 1845 Register dst, Register src1, XMMRegister src2, 1846 XMMRegister vtmp1, XMMRegister vtmp2) { 1847 switch (vlen) { 1848 case 8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1849 case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1850 case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1851 case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1852 1853 default: assert(false, "wrong vector length"); 1854 } 1855 } 1856 1857 void C2_MacroAssembler::mulreduceB(int opcode, int vlen, 1858 Register dst, Register src1, XMMRegister src2, 1859 XMMRegister vtmp1, XMMRegister vtmp2) { 1860 switch (vlen) { 1861 case 8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1862 case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1863 case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1864 case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1865 1866 default: assert(false, "wrong vector length"); 1867 } 1868 } 1869 1870 void C2_MacroAssembler::reduceS(int opcode, int vlen, 1871 Register dst, Register src1, XMMRegister src2, 
1872 XMMRegister vtmp1, XMMRegister vtmp2) { 1873 switch (vlen) { 1874 case 4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1875 case 8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1876 case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1877 case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1878 1879 default: assert(false, "wrong vector length"); 1880 } 1881 } 1882 1883 void C2_MacroAssembler::reduceI(int opcode, int vlen, 1884 Register dst, Register src1, XMMRegister src2, 1885 XMMRegister vtmp1, XMMRegister vtmp2) { 1886 switch (vlen) { 1887 case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1888 case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1889 case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1890 case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1891 1892 default: assert(false, "wrong vector length"); 1893 } 1894 } 1895 1896 void C2_MacroAssembler::reduceL(int opcode, int vlen, 1897 Register dst, Register src1, XMMRegister src2, 1898 XMMRegister vtmp1, XMMRegister vtmp2) { 1899 switch (vlen) { 1900 case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1901 case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1902 case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1903 1904 default: assert(false, "wrong vector length"); 1905 } 1906 } 1907 1908 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 1909 switch (vlen) { 1910 case 2: 1911 assert(vtmp2 == xnoreg, ""); 1912 reduce2F(opcode, dst, src, vtmp1); 1913 break; 1914 case 4: 1915 assert(vtmp2 == xnoreg, ""); 1916 reduce4F(opcode, dst, src, vtmp1); 1917 break; 1918 case 8: 1919 reduce8F(opcode, dst, src, vtmp1, vtmp2); 1920 break; 1921 case 16: 1922 reduce16F(opcode, dst, src, vtmp1, vtmp2); 1923 break; 1924 default: assert(false, "wrong vector length"); 1925 } 1926 } 1927 1928 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 1929 switch (vlen) { 1930 case 2: 1931 assert(vtmp2 == xnoreg, ""); 1932 reduce2D(opcode, dst, src, vtmp1); 1933 break; 1934 case 4: 1935 reduce4D(opcode, dst, src, vtmp1, vtmp2); 1936 break; 1937 case 8: 1938 reduce8D(opcode, dst, src, vtmp1, vtmp2); 1939 break; 1940 default: assert(false, "wrong vector length"); 1941 } 1942 } 1943 1944 void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 1945 switch (vlen) { 1946 case 2: 1947 assert(vtmp1 == xnoreg, ""); 1948 assert(vtmp2 == xnoreg, ""); 1949 unorderedReduce2F(opcode, dst, src); 1950 break; 1951 case 4: 1952 assert(vtmp2 == xnoreg, ""); 1953 unorderedReduce4F(opcode, dst, src, vtmp1); 1954 break; 1955 case 8: 1956 unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2); 1957 break; 1958 case 16: 1959 unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2); 1960 break; 1961 default: assert(false, "wrong vector length"); 1962 } 1963 } 1964 1965 void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 1966 switch (vlen) { 1967 case 2: 1968 assert(vtmp1 == xnoreg, ""); 1969 assert(vtmp2 == xnoreg, ""); 1970 unorderedReduce2D(opcode, dst, src); 1971 break; 1972 case 4: 1973 assert(vtmp2 == xnoreg, ""); 1974 unorderedReduce4D(opcode, dst, src, vtmp1); 1975 break; 1976 case 8: 1977 unorderedReduce8D(opcode, dst, 
src, vtmp1, vtmp2); 1978 break; 1979 default: assert(false, "wrong vector length"); 1980 } 1981 } 1982 1983 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1984 if (opcode == Op_AddReductionVI) { 1985 if (vtmp1 != src2) { 1986 movdqu(vtmp1, src2); 1987 } 1988 phaddd(vtmp1, vtmp1); 1989 } else { 1990 pshufd(vtmp1, src2, 0x1); 1991 reduce_operation_128(T_INT, opcode, vtmp1, src2); 1992 } 1993 movdl(vtmp2, src1); 1994 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 1995 movdl(dst, vtmp1); 1996 } 1997 1998 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1999 if (opcode == Op_AddReductionVI) { 2000 if (vtmp1 != src2) { 2001 movdqu(vtmp1, src2); 2002 } 2003 phaddd(vtmp1, src2); 2004 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2005 } else { 2006 pshufd(vtmp2, src2, 0xE); 2007 reduce_operation_128(T_INT, opcode, vtmp2, src2); 2008 reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2009 } 2010 } 2011 2012 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2013 if (opcode == Op_AddReductionVI) { 2014 vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit); 2015 vextracti128_high(vtmp2, vtmp1); 2016 vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit); 2017 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2018 } else { 2019 vextracti128_high(vtmp1, src2); 2020 reduce_operation_128(T_INT, opcode, vtmp1, src2); 2021 reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2022 } 2023 } 2024 2025 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2026 vextracti64x4_high(vtmp2, src2); 2027 reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2); 2028 reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2029 } 2030 2031 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2032 pshufd(vtmp2, src2, 0x1); 2033 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2034 movdqu(vtmp1, vtmp2); 2035 psrldq(vtmp1, 2); 2036 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2037 movdqu(vtmp2, vtmp1); 2038 psrldq(vtmp2, 1); 2039 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2040 movdl(vtmp2, src1); 2041 pmovsxbd(vtmp1, vtmp1); 2042 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2043 pextrb(dst, vtmp1, 0x0); 2044 movsbl(dst, dst); 2045 } 2046 2047 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2048 pshufd(vtmp1, src2, 0xE); 2049 reduce_operation_128(T_BYTE, opcode, vtmp1, src2); 2050 reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2051 } 2052 2053 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2054 vextracti128_high(vtmp2, src2); 2055 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2056 reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2057 } 2058 2059 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2060 vextracti64x4_high(vtmp1, src2); 2061 reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2); 2062 reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2063 } 2064 2065 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister 
src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2066 pmovsxbw(vtmp2, src2); 2067 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2068 } 2069 2070 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2071 if (UseAVX > 1) { 2072 int vector_len = Assembler::AVX_256bit; 2073 vpmovsxbw(vtmp1, src2, vector_len); 2074 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2075 } else { 2076 pmovsxbw(vtmp2, src2); 2077 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2078 pshufd(vtmp2, src2, 0x1); 2079 pmovsxbw(vtmp2, src2); 2080 reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2081 } 2082 } 2083 2084 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2085 if (UseAVX > 2 && VM_Version::supports_avx512bw()) { 2086 int vector_len = Assembler::AVX_512bit; 2087 vpmovsxbw(vtmp1, src2, vector_len); 2088 reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2089 } else { 2090 assert(UseAVX >= 2,"Should not reach here."); 2091 mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); 2092 vextracti128_high(vtmp2, src2); 2093 mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2094 } 2095 } 2096 2097 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2098 mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); 2099 vextracti64x4_high(vtmp2, src2); 2100 mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2101 } 2102 2103 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2104 if (opcode == Op_AddReductionVI) { 2105 if (vtmp1 != src2) { 2106 movdqu(vtmp1, src2); 2107 } 2108 phaddw(vtmp1, vtmp1); 2109 phaddw(vtmp1, vtmp1); 2110 } else { 2111 pshufd(vtmp2, src2, 0x1); 2112 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2113 movdqu(vtmp1, vtmp2); 2114 psrldq(vtmp1, 2); 2115 reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2); 2116 } 2117 movdl(vtmp2, src1); 2118 pmovsxwd(vtmp1, vtmp1); 2119 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2120 pextrw(dst, vtmp1, 0x0); 2121 movswl(dst, dst); 2122 } 2123 2124 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2125 if (opcode == Op_AddReductionVI) { 2126 if (vtmp1 != src2) { 2127 movdqu(vtmp1, src2); 2128 } 2129 phaddw(vtmp1, src2); 2130 } else { 2131 pshufd(vtmp1, src2, 0xE); 2132 reduce_operation_128(T_SHORT, opcode, vtmp1, src2); 2133 } 2134 reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2135 } 2136 2137 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2138 if (opcode == Op_AddReductionVI) { 2139 int vector_len = Assembler::AVX_256bit; 2140 vphaddw(vtmp2, src2, src2, vector_len); 2141 vpermq(vtmp2, vtmp2, 0xD8, vector_len); 2142 } else { 2143 vextracti128_high(vtmp2, src2); 2144 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2145 } 2146 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2147 } 2148 2149 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2150 int vector_len = Assembler::AVX_256bit; 2151 vextracti64x4_high(vtmp1, src2); 2152 reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2); 2153 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2154 } 2155 2156 void 
C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2157 pshufd(vtmp2, src2, 0xE); 2158 reduce_operation_128(T_LONG, opcode, vtmp2, src2); 2159 movdq(vtmp1, src1); 2160 reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2); 2161 movdq(dst, vtmp1); 2162 } 2163 2164 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2165 vextracti128_high(vtmp1, src2); 2166 reduce_operation_128(T_LONG, opcode, vtmp1, src2); 2167 reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2168 } 2169 2170 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2171 vextracti64x4_high(vtmp2, src2); 2172 reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2); 2173 reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2174 } 2175 2176 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) { 2177 mov64(temp, -1L); 2178 bzhiq(temp, temp, len); 2179 kmovql(dst, temp); 2180 } 2181 2182 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2183 reduce_operation_128(T_FLOAT, opcode, dst, src); 2184 pshufd(vtmp, src, 0x1); 2185 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2186 } 2187 2188 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2189 reduce2F(opcode, dst, src, vtmp); 2190 pshufd(vtmp, src, 0x2); 2191 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2192 pshufd(vtmp, src, 0x3); 2193 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2194 } 2195 2196 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2197 reduce4F(opcode, dst, src, vtmp2); 2198 vextractf128_high(vtmp2, src); 2199 reduce4F(opcode, dst, vtmp2, vtmp1); 2200 } 2201 2202 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2203 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2204 vextracti64x4_high(vtmp1, src); 2205 reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2); 2206 } 2207 2208 void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) { 2209 pshufd(dst, src, 0x1); 2210 reduce_operation_128(T_FLOAT, opcode, dst, src); 2211 } 2212 2213 void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2214 pshufd(vtmp, src, 0xE); 2215 unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src); 2216 unorderedReduce2F(opcode, dst, vtmp); 2217 } 2218 2219 void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2220 vextractf128_high(vtmp1, src); 2221 unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src); 2222 unorderedReduce4F(opcode, dst, vtmp1, vtmp2); 2223 } 2224 2225 void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2226 vextractf64x4_high(vtmp2, src); 2227 unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src); 2228 unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2); 2229 } 2230 2231 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2232 reduce_operation_128(T_DOUBLE, opcode, dst, src); 2233 pshufd(vtmp, src, 0xE); 2234 reduce_operation_128(T_DOUBLE, opcode, dst, vtmp); 2235 } 2236 2237 void 
C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2238 reduce2D(opcode, dst, src, vtmp2); 2239 vextractf128_high(vtmp2, src); 2240 reduce2D(opcode, dst, vtmp2, vtmp1); 2241 } 2242 2243 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2244 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2245 vextracti64x4_high(vtmp1, src); 2246 reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2); 2247 } 2248 2249 void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) { 2250 pshufd(dst, src, 0xE); 2251 reduce_operation_128(T_DOUBLE, opcode, dst, src); 2252 } 2253 2254 void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2255 vextractf128_high(vtmp, src); 2256 unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src); 2257 unorderedReduce2D(opcode, dst, vtmp); 2258 } 2259 2260 void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2261 vextractf64x4_high(vtmp2, src); 2262 unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src); 2263 unorderedReduce4D(opcode, dst, vtmp2, vtmp1); 2264 } 2265 2266 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) { 2267 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2268 } 2269 2270 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) { 2271 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2272 } 2273 2274 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len) { 2275 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2276 } 2277 2278 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask, 2279 int vec_enc) { 2280 switch(elem_bt) { 2281 case T_INT: 2282 case T_FLOAT: 2283 vmaskmovps(dst, src, mask, vec_enc); 2284 break; 2285 case T_LONG: 2286 case T_DOUBLE: 2287 vmaskmovpd(dst, src, mask, vec_enc); 2288 break; 2289 default: 2290 fatal("Unsupported type %s", type2name(elem_bt)); 2291 break; 2292 } 2293 } 2294 2295 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask, 2296 int vec_enc) { 2297 switch(elem_bt) { 2298 case T_INT: 2299 case T_FLOAT: 2300 vmaskmovps(dst, src, mask, vec_enc); 2301 break; 2302 case T_LONG: 2303 case T_DOUBLE: 2304 vmaskmovpd(dst, src, mask, vec_enc); 2305 break; 2306 default: 2307 fatal("Unsupported type %s", type2name(elem_bt)); 2308 break; 2309 } 2310 } 2311 2312 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid, 2313 XMMRegister dst, XMMRegister src, 2314 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2315 XMMRegister xmm_0, XMMRegister xmm_1) { 2316 const int permconst[] = {1, 14}; 2317 XMMRegister wsrc = src; 2318 XMMRegister wdst = xmm_0; 2319 XMMRegister wtmp = (xmm_1 == xnoreg) ? 
xmm_0: xmm_1; 2320 2321 int vlen_enc = Assembler::AVX_128bit; 2322 if (vlen == 16) { 2323 vlen_enc = Assembler::AVX_256bit; 2324 } 2325 2326 for (int i = log2(vlen) - 1; i >=0; i--) { 2327 if (i == 0 && !is_dst_valid) { 2328 wdst = dst; 2329 } 2330 if (i == 3) { 2331 vextracti64x4_high(wtmp, wsrc); 2332 } else if (i == 2) { 2333 vextracti128_high(wtmp, wsrc); 2334 } else { // i = [0,1] 2335 vpermilps(wtmp, wsrc, permconst[i], vlen_enc); 2336 } 2337 2338 if (VM_Version::supports_avx10_2()) { 2339 vminmax_fp(opcode, T_FLOAT, wdst, k0, wtmp, wsrc, vlen_enc); 2340 } else { 2341 vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2342 } 2343 wsrc = wdst; 2344 vlen_enc = Assembler::AVX_128bit; 2345 } 2346 if (is_dst_valid) { 2347 if (VM_Version::supports_avx10_2()) { 2348 vminmax_fp(opcode, T_FLOAT, dst, k0, wdst, dst, Assembler::AVX_128bit); 2349 } else { 2350 vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2351 } 2352 } 2353 } 2354 2355 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src, 2356 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2357 XMMRegister xmm_0, XMMRegister xmm_1) { 2358 XMMRegister wsrc = src; 2359 XMMRegister wdst = xmm_0; 2360 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1; 2361 int vlen_enc = Assembler::AVX_128bit; 2362 if (vlen == 8) { 2363 vlen_enc = Assembler::AVX_256bit; 2364 } 2365 for (int i = log2(vlen) - 1; i >=0; i--) { 2366 if (i == 0 && !is_dst_valid) { 2367 wdst = dst; 2368 } 2369 if (i == 1) { 2370 vextracti128_high(wtmp, wsrc); 2371 } else if (i == 2) { 2372 vextracti64x4_high(wtmp, wsrc); 2373 } else { 2374 assert(i == 0, "%d", i); 2375 vpermilpd(wtmp, wsrc, 1, vlen_enc); 2376 } 2377 2378 if (VM_Version::supports_avx10_2()) { 2379 vminmax_fp(opcode, T_DOUBLE, wdst, k0, wtmp, wsrc, vlen_enc); 2380 } else { 2381 vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2382 } 2383 2384 wsrc = wdst; 2385 vlen_enc = Assembler::AVX_128bit; 2386 } 2387 2388 if (is_dst_valid) { 2389 if (VM_Version::supports_avx10_2()) { 2390 vminmax_fp(opcode, T_DOUBLE, dst, k0, wdst, dst, Assembler::AVX_128bit); 2391 } else { 2392 vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2393 } 2394 } 2395 } 2396 2397 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) { 2398 switch (bt) { 2399 case T_BYTE: pextrb(dst, src, idx); break; 2400 case T_SHORT: pextrw(dst, src, idx); break; 2401 case T_INT: pextrd(dst, src, idx); break; 2402 case T_LONG: pextrq(dst, src, idx); break; 2403 2404 default: 2405 assert(false,"Should not reach here."); 2406 break; 2407 } 2408 } 2409 2410 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) { 2411 int esize = type2aelembytes(typ); 2412 int elem_per_lane = 16/esize; 2413 int lane = elemindex / elem_per_lane; 2414 int eindex = elemindex % elem_per_lane; 2415 2416 if (lane >= 2) { 2417 assert(UseAVX > 2, "required"); 2418 vextractf32x4(dst, src, lane & 3); 2419 return dst; 2420 } else if (lane > 0) { 2421 assert(UseAVX > 0, "required"); 2422 vextractf128(dst, src, lane); 2423 return dst; 2424 } else { 2425 return src; 2426 } 2427 } 2428 2429 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) { 2430 if (typ == T_BYTE) { 2431 movsbl(dst, dst); 2432 } else if (typ == T_SHORT) { 2433 movswl(dst, dst); 2434 } 2435 } 2436 2437 void C2_MacroAssembler::get_elem(BasicType typ, Register 
dst, XMMRegister src, int elemindex) { 2438 int esize = type2aelembytes(typ); 2439 int elem_per_lane = 16/esize; 2440 int eindex = elemindex % elem_per_lane; 2441 assert(is_integral_type(typ),"required"); 2442 2443 if (eindex == 0) { 2444 if (typ == T_LONG) { 2445 movq(dst, src); 2446 } else { 2447 movdl(dst, src); 2448 movsxl(typ, dst); 2449 } 2450 } else { 2451 extract(typ, dst, src, eindex); 2452 movsxl(typ, dst); 2453 } 2454 } 2455 2456 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) { 2457 int esize = type2aelembytes(typ); 2458 int elem_per_lane = 16/esize; 2459 int eindex = elemindex % elem_per_lane; 2460 assert((typ == T_FLOAT || typ == T_DOUBLE),"required"); 2461 2462 if (eindex == 0) { 2463 movq(dst, src); 2464 } else { 2465 if (typ == T_FLOAT) { 2466 if (UseAVX == 0) { 2467 movdqu(dst, src); 2468 shufps(dst, dst, eindex); 2469 } else { 2470 vshufps(dst, src, src, eindex, Assembler::AVX_128bit); 2471 } 2472 } else { 2473 if (UseAVX == 0) { 2474 movdqu(dst, src); 2475 psrldq(dst, eindex*esize); 2476 } else { 2477 vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit); 2478 } 2479 movq(dst, dst); 2480 } 2481 } 2482 // Zero upper bits 2483 if (typ == T_FLOAT) { 2484 if (UseAVX == 0) { 2485 assert(vtmp != xnoreg, "required."); 2486 movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg); 2487 pand(dst, vtmp); 2488 } else { 2489 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg); 2490 } 2491 } 2492 } 2493 2494 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) { 2495 switch(typ) { 2496 case T_BYTE: 2497 case T_BOOLEAN: 2498 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2499 break; 2500 case T_SHORT: 2501 case T_CHAR: 2502 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2503 break; 2504 case T_INT: 2505 case T_FLOAT: 2506 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2507 break; 2508 case T_LONG: 2509 case T_DOUBLE: 2510 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2511 break; 2512 default: 2513 assert(false,"Should not reach here."); 2514 break; 2515 } 2516 } 2517 2518 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) { 2519 assert(rscratch != noreg || always_reachable(src2), "missing"); 2520 2521 switch(typ) { 2522 case T_BOOLEAN: 2523 case T_BYTE: 2524 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2525 break; 2526 case T_CHAR: 2527 case T_SHORT: 2528 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2529 break; 2530 case T_INT: 2531 case T_FLOAT: 2532 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2533 break; 2534 case T_LONG: 2535 case T_DOUBLE: 2536 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2537 break; 2538 default: 2539 assert(false,"Should not reach here."); 2540 break; 2541 } 2542 } 2543 2544 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) { 2545 switch(typ) { 2546 case T_BYTE: 2547 evpblendmb(dst, kmask, src1, src2, merge, vector_len); 
2548 break; 2549 case T_SHORT: 2550 evpblendmw(dst, kmask, src1, src2, merge, vector_len); 2551 break; 2552 case T_INT: 2553 case T_FLOAT: 2554 evpblendmd(dst, kmask, src1, src2, merge, vector_len); 2555 break; 2556 case T_LONG: 2557 case T_DOUBLE: 2558 evpblendmq(dst, kmask, src1, src2, merge, vector_len); 2559 break; 2560 default: 2561 assert(false,"Should not reach here."); 2562 break; 2563 } 2564 } 2565 2566 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) { 2567 assert(vlen_in_bytes <= 32, ""); 2568 int esize = type2aelembytes(bt); 2569 if (vlen_in_bytes == 32) { 2570 assert(vtmp == xnoreg, "required."); 2571 if (esize >= 4) { 2572 vtestps(src1, src2, AVX_256bit); 2573 } else { 2574 vptest(src1, src2, AVX_256bit); 2575 } 2576 return; 2577 } 2578 if (vlen_in_bytes < 16) { 2579 // Duplicate the lower part to fill the whole register, 2580 // Don't need to do so for src2 2581 assert(vtmp != xnoreg, "required"); 2582 int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04; 2583 pshufd(vtmp, src1, shuffle_imm); 2584 } else { 2585 assert(vtmp == xnoreg, "required"); 2586 vtmp = src1; 2587 } 2588 if (esize >= 4 && VM_Version::supports_avx()) { 2589 vtestps(vtmp, src2, AVX_128bit); 2590 } else { 2591 ptest(vtmp, src2); 2592 } 2593 } 2594 2595 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { 2596 #ifdef ASSERT 2597 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2598 bool is_bw_supported = VM_Version::supports_avx512bw(); 2599 if (is_bw && !is_bw_supported) { 2600 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2601 assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16), 2602 "XMM register should be 0-15"); 2603 } 2604 #endif // ASSERT 2605 switch (elem_bt) { 2606 case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return; 2607 case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return; 2608 case T_INT: vpaddd(dst, src1, src2, vlen_enc); return; 2609 case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return; 2610 case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return; 2611 case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return; 2612 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2613 } 2614 } 2615 2616 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) { 2617 assert(UseAVX >= 2, "required"); 2618 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2619 bool is_vl = vlen_enc != Assembler::AVX_512bit; 2620 if ((UseAVX > 2) && 2621 (!is_bw || VM_Version::supports_avx512bw()) && 2622 (!is_vl || VM_Version::supports_avx512vl())) { 2623 switch (elem_bt) { 2624 case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return; 2625 case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return; 2626 case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return; 2627 case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return; 2628 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2629 } 2630 } else { 2631 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2632 assert((dst->encoding() < 16),"XMM register should be 0-15"); 2633 switch (elem_bt) { 2634 case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return; 2635 case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return; 2636 case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return; 2637 case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, 
vlen_enc); return; 2638 case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return; 2639 case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return; 2640 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2641 } 2642 } 2643 } 2644 2645 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) { 2646 switch (to_elem_bt) { 2647 case T_SHORT: 2648 vpmovsxbw(dst, src, vlen_enc); 2649 break; 2650 case T_INT: 2651 vpmovsxbd(dst, src, vlen_enc); 2652 break; 2653 case T_FLOAT: 2654 vpmovsxbd(dst, src, vlen_enc); 2655 vcvtdq2ps(dst, dst, vlen_enc); 2656 break; 2657 case T_LONG: 2658 vpmovsxbq(dst, src, vlen_enc); 2659 break; 2660 case T_DOUBLE: { 2661 int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit; 2662 vpmovsxbd(dst, src, mid_vlen_enc); 2663 vcvtdq2pd(dst, dst, vlen_enc); 2664 break; 2665 } 2666 default: 2667 fatal("Unsupported type %s", type2name(to_elem_bt)); 2668 break; 2669 } 2670 } 2671 2672 //------------------------------------------------------------------------------------------- 2673 2674 // IndexOf for constant substrings with size >= 8 chars 2675 // which don't need to be loaded through stack. 2676 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2, 2677 Register cnt1, Register cnt2, 2678 int int_cnt2, Register result, 2679 XMMRegister vec, Register tmp, 2680 int ae) { 2681 ShortBranchVerifier sbv(this); 2682 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2683 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2684 2685 // This method uses the pcmpestri instruction with bound registers 2686 // inputs: 2687 // xmm - substring 2688 // rax - substring length (elements count) 2689 // mem - scanned string 2690 // rdx - string length (elements count) 2691 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2692 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2693 // outputs: 2694 // rcx - matched index in string 2695 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2696 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2697 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2698 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2699 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1; 2700 2701 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, 2702 RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR, 2703 MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE; 2704 2705 // Note, inline_string_indexOf() generates checks: 2706 // if (substr.count > string.count) return -1; 2707 // if (substr.count == 0) return 0; 2708 assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars"); 2709 2710 // Load substring. 
2711 if (ae == StrIntrinsicNode::UL) { 2712 pmovzxbw(vec, Address(str2, 0)); 2713 } else { 2714 movdqu(vec, Address(str2, 0)); 2715 } 2716 movl(cnt2, int_cnt2); 2717 movptr(result, str1); // string addr 2718 2719 if (int_cnt2 > stride) { 2720 jmpb(SCAN_TO_SUBSTR); 2721 2722 // Reload substr for rescan, this code 2723 // is executed only for large substrings (> 8 chars) 2724 bind(RELOAD_SUBSTR); 2725 if (ae == StrIntrinsicNode::UL) { 2726 pmovzxbw(vec, Address(str2, 0)); 2727 } else { 2728 movdqu(vec, Address(str2, 0)); 2729 } 2730 negptr(cnt2); // Jumped here with negative cnt2, convert to positive 2731 2732 bind(RELOAD_STR); 2733 // We came here after the beginning of the substring was 2734 // matched but the rest of it was not so we need to search 2735 // again. Start from the next element after the previous match. 2736 2737 // cnt2 is number of substring reminding elements and 2738 // cnt1 is number of string reminding elements when cmp failed. 2739 // Restored cnt1 = cnt1 - cnt2 + int_cnt2 2740 subl(cnt1, cnt2); 2741 addl(cnt1, int_cnt2); 2742 movl(cnt2, int_cnt2); // Now restore cnt2 2743 2744 decrementl(cnt1); // Shift to next element 2745 cmpl(cnt1, cnt2); 2746 jcc(Assembler::negative, RET_NOT_FOUND); // Left less then substring 2747 2748 addptr(result, (1<<scale1)); 2749 2750 } // (int_cnt2 > 8) 2751 2752 // Scan string for start of substr in 16-byte vectors 2753 bind(SCAN_TO_SUBSTR); 2754 pcmpestri(vec, Address(result, 0), mode); 2755 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 2756 subl(cnt1, stride); 2757 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 2758 cmpl(cnt1, cnt2); 2759 jccb(Assembler::negative, RET_NOT_FOUND); // Left less then substring 2760 addptr(result, 16); 2761 jmpb(SCAN_TO_SUBSTR); 2762 2763 // Found a potential substr 2764 bind(FOUND_CANDIDATE); 2765 // Matched whole vector if first element matched (tmp(rcx) == 0). 2766 if (int_cnt2 == stride) { 2767 jccb(Assembler::overflow, RET_FOUND); // OF == 1 2768 } else { // int_cnt2 > 8 2769 jccb(Assembler::overflow, FOUND_SUBSTR); 2770 } 2771 // After pcmpestri tmp(rcx) contains matched element index 2772 // Compute start addr of substr 2773 lea(result, Address(result, tmp, scale1)); 2774 2775 // Make sure string is still long enough 2776 subl(cnt1, tmp); 2777 cmpl(cnt1, cnt2); 2778 if (int_cnt2 == stride) { 2779 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 2780 } else { // int_cnt2 > 8 2781 jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD); 2782 } 2783 // Left less then substring. 2784 2785 bind(RET_NOT_FOUND); 2786 movl(result, -1); 2787 jmp(EXIT); 2788 2789 if (int_cnt2 > stride) { 2790 // This code is optimized for the case when whole substring 2791 // is matched if its head is matched. 2792 bind(MATCH_SUBSTR_HEAD); 2793 pcmpestri(vec, Address(result, 0), mode); 2794 // Reload only string if does not match 2795 jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0 2796 2797 Label CONT_SCAN_SUBSTR; 2798 // Compare the rest of substring (> 8 chars). 2799 bind(FOUND_SUBSTR); 2800 // First 8 chars are already matched. 
2801 negptr(cnt2); 2802 addptr(cnt2, stride); 2803 2804 bind(SCAN_SUBSTR); 2805 subl(cnt1, stride); 2806 cmpl(cnt2, -stride); // Do not read beyond substring 2807 jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR); 2808 // Back-up strings to avoid reading beyond substring: 2809 // cnt1 = cnt1 - cnt2 + 8 2810 addl(cnt1, cnt2); // cnt2 is negative 2811 addl(cnt1, stride); 2812 movl(cnt2, stride); negptr(cnt2); 2813 bind(CONT_SCAN_SUBSTR); 2814 if (int_cnt2 < (int)G) { 2815 int tail_off1 = int_cnt2<<scale1; 2816 int tail_off2 = int_cnt2<<scale2; 2817 if (ae == StrIntrinsicNode::UL) { 2818 pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2)); 2819 } else { 2820 movdqu(vec, Address(str2, cnt2, scale2, tail_off2)); 2821 } 2822 pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode); 2823 } else { 2824 // calculate index in register to avoid integer overflow (int_cnt2*2) 2825 movl(tmp, int_cnt2); 2826 addptr(tmp, cnt2); 2827 if (ae == StrIntrinsicNode::UL) { 2828 pmovzxbw(vec, Address(str2, tmp, scale2, 0)); 2829 } else { 2830 movdqu(vec, Address(str2, tmp, scale2, 0)); 2831 } 2832 pcmpestri(vec, Address(result, tmp, scale1, 0), mode); 2833 } 2834 // Need to reload strings pointers if not matched whole vector 2835 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 2836 addptr(cnt2, stride); 2837 jcc(Assembler::negative, SCAN_SUBSTR); 2838 // Fall through if found full substring 2839 2840 } // (int_cnt2 > 8) 2841 2842 bind(RET_FOUND); 2843 // Found result if we matched full small substring. 2844 // Compute substr offset 2845 subptr(result, str1); 2846 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 2847 shrl(result, 1); // index 2848 } 2849 bind(EXIT); 2850 2851 } // string_indexofC8 2852 2853 // Small strings are loaded through stack if they cross page boundary. 2854 void C2_MacroAssembler::string_indexof(Register str1, Register str2, 2855 Register cnt1, Register cnt2, 2856 int int_cnt2, Register result, 2857 XMMRegister vec, Register tmp, 2858 int ae) { 2859 ShortBranchVerifier sbv(this); 2860 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2861 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2862 2863 // 2864 // int_cnt2 is length of small (< 8 chars) constant substring 2865 // or (-1) for non constant substring in which case its length 2866 // is in cnt2 register. 2867 // 2868 // Note, inline_string_indexOf() generates checks: 2869 // if (substr.count > string.count) return -1; 2870 // if (substr.count == 0) return 0; 2871 // 2872 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2873 assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0"); 2874 // This method uses the pcmpestri instruction with bound registers 2875 // inputs: 2876 // xmm - substring 2877 // rax - substring length (elements count) 2878 // mem - scanned string 2879 // rdx - string length (elements count) 2880 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2881 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2882 // outputs: 2883 // rcx - matched index in string 2884 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2885 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2886 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2887 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? 
Address::times_1 : scale1; 2888 2889 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR, 2890 RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR, 2891 FOUND_CANDIDATE; 2892 2893 { //======================================================== 2894 // We don't know where these strings are located 2895 // and we can't read beyond them. Load them through stack. 2896 Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR; 2897 2898 movptr(tmp, rsp); // save old SP 2899 2900 if (int_cnt2 > 0) { // small (< 8 chars) constant substring 2901 if (int_cnt2 == (1>>scale2)) { // One byte 2902 assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding"); 2903 load_unsigned_byte(result, Address(str2, 0)); 2904 movdl(vec, result); // move 32 bits 2905 } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes 2906 // Not enough header space in 32-bit VM: 12+3 = 15. 2907 movl(result, Address(str2, -1)); 2908 shrl(result, 8); 2909 movdl(vec, result); // move 32 bits 2910 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char 2911 load_unsigned_short(result, Address(str2, 0)); 2912 movdl(vec, result); // move 32 bits 2913 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars 2914 movdl(vec, Address(str2, 0)); // move 32 bits 2915 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars 2916 movq(vec, Address(str2, 0)); // move 64 bits 2917 } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7}) 2918 // Array header size is 12 bytes in 32-bit VM 2919 // + 6 bytes for 3 chars == 18 bytes, 2920 // enough space to load vec and shift. 2921 assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity"); 2922 if (ae == StrIntrinsicNode::UL) { 2923 int tail_off = int_cnt2-8; 2924 pmovzxbw(vec, Address(str2, tail_off)); 2925 psrldq(vec, -2*tail_off); 2926 } 2927 else { 2928 int tail_off = int_cnt2*(1<<scale2); 2929 movdqu(vec, Address(str2, tail_off-16)); 2930 psrldq(vec, 16-tail_off); 2931 } 2932 } 2933 } else { // not constant substring 2934 cmpl(cnt2, stride); 2935 jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough 2936 2937 // We can read beyond string if srt+16 does not cross page boundary 2938 // since heaps are aligned and mapped by pages. 2939 assert(os::vm_page_size() < (int)G, "default page should be small"); 2940 movl(result, str2); // We need only low 32 bits 2941 andl(result, ((int)os::vm_page_size()-1)); 2942 cmpl(result, ((int)os::vm_page_size()-16)); 2943 jccb(Assembler::belowEqual, CHECK_STR); 2944 2945 // Move small strings to stack to allow load 16 bytes into vec. 2946 subptr(rsp, 16); 2947 int stk_offset = wordSize-(1<<scale2); 2948 push(cnt2); 2949 2950 bind(COPY_SUBSTR); 2951 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) { 2952 load_unsigned_byte(result, Address(str2, cnt2, scale2, -1)); 2953 movb(Address(rsp, cnt2, scale2, stk_offset), result); 2954 } else if (ae == StrIntrinsicNode::UU) { 2955 load_unsigned_short(result, Address(str2, cnt2, scale2, -2)); 2956 movw(Address(rsp, cnt2, scale2, stk_offset), result); 2957 } 2958 decrement(cnt2); 2959 jccb(Assembler::notZero, COPY_SUBSTR); 2960 2961 pop(cnt2); 2962 movptr(str2, rsp); // New substring address 2963 } // non constant 2964 2965 bind(CHECK_STR); 2966 cmpl(cnt1, stride); 2967 jccb(Assembler::aboveEqual, BIG_STRINGS); 2968 2969 // Check cross page boundary. 
2970 movl(result, str1); // We need only low 32 bits 2971 andl(result, ((int)os::vm_page_size()-1)); 2972 cmpl(result, ((int)os::vm_page_size()-16)); 2973 jccb(Assembler::belowEqual, BIG_STRINGS); 2974 2975 subptr(rsp, 16); 2976 int stk_offset = -(1<<scale1); 2977 if (int_cnt2 < 0) { // not constant 2978 push(cnt2); 2979 stk_offset += wordSize; 2980 } 2981 movl(cnt2, cnt1); 2982 2983 bind(COPY_STR); 2984 if (ae == StrIntrinsicNode::LL) { 2985 load_unsigned_byte(result, Address(str1, cnt2, scale1, -1)); 2986 movb(Address(rsp, cnt2, scale1, stk_offset), result); 2987 } else { 2988 load_unsigned_short(result, Address(str1, cnt2, scale1, -2)); 2989 movw(Address(rsp, cnt2, scale1, stk_offset), result); 2990 } 2991 decrement(cnt2); 2992 jccb(Assembler::notZero, COPY_STR); 2993 2994 if (int_cnt2 < 0) { // not constant 2995 pop(cnt2); 2996 } 2997 movptr(str1, rsp); // New string address 2998 2999 bind(BIG_STRINGS); 3000 // Load substring. 3001 if (int_cnt2 < 0) { // -1 3002 if (ae == StrIntrinsicNode::UL) { 3003 pmovzxbw(vec, Address(str2, 0)); 3004 } else { 3005 movdqu(vec, Address(str2, 0)); 3006 } 3007 push(cnt2); // substr count 3008 push(str2); // substr addr 3009 push(str1); // string addr 3010 } else { 3011 // Small (< 8 chars) constant substrings are loaded already. 3012 movl(cnt2, int_cnt2); 3013 } 3014 push(tmp); // original SP 3015 3016 } // Finished loading 3017 3018 //======================================================== 3019 // Start search 3020 // 3021 3022 movptr(result, str1); // string addr 3023 3024 if (int_cnt2 < 0) { // Only for non constant substring 3025 jmpb(SCAN_TO_SUBSTR); 3026 3027 // SP saved at sp+0 3028 // String saved at sp+1*wordSize 3029 // Substr saved at sp+2*wordSize 3030 // Substr count saved at sp+3*wordSize 3031 3032 // Reload substr for rescan, this code 3033 // is executed only for large substrings (> 8 chars) 3034 bind(RELOAD_SUBSTR); 3035 movptr(str2, Address(rsp, 2*wordSize)); 3036 movl(cnt2, Address(rsp, 3*wordSize)); 3037 if (ae == StrIntrinsicNode::UL) { 3038 pmovzxbw(vec, Address(str2, 0)); 3039 } else { 3040 movdqu(vec, Address(str2, 0)); 3041 } 3042 // We came here after the beginning of the substring was 3043 // matched but the rest of it was not so we need to search 3044 // again. Start from the next element after the previous match. 3045 subptr(str1, result); // Restore counter 3046 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3047 shrl(str1, 1); 3048 } 3049 addl(cnt1, str1); 3050 decrementl(cnt1); // Shift to next element 3051 cmpl(cnt1, cnt2); 3052 jcc(Assembler::negative, RET_NOT_FOUND); // Left less then substring 3053 3054 addptr(result, (1<<scale1)); 3055 } // non constant 3056 3057 // Scan string for start of substr in 16-byte vectors 3058 bind(SCAN_TO_SUBSTR); 3059 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 3060 pcmpestri(vec, Address(result, 0), mode); 3061 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 3062 subl(cnt1, stride); 3063 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 3064 cmpl(cnt1, cnt2); 3065 jccb(Assembler::negative, RET_NOT_FOUND); // Left less then substring 3066 addptr(result, 16); 3067 3068 bind(ADJUST_STR); 3069 cmpl(cnt1, stride); // Do not read beyond string 3070 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 3071 // Back-up string to avoid reading beyond string. 
3072 lea(result, Address(result, cnt1, scale1, -16)); 3073 movl(cnt1, stride); 3074 jmpb(SCAN_TO_SUBSTR); 3075 3076 // Found a potential substr 3077 bind(FOUND_CANDIDATE); 3078 // After pcmpestri tmp(rcx) contains matched element index 3079 3080 // Make sure string is still long enough 3081 subl(cnt1, tmp); 3082 cmpl(cnt1, cnt2); 3083 jccb(Assembler::greaterEqual, FOUND_SUBSTR); 3084 // Left less then substring. 3085 3086 bind(RET_NOT_FOUND); 3087 movl(result, -1); 3088 jmp(CLEANUP); 3089 3090 bind(FOUND_SUBSTR); 3091 // Compute start addr of substr 3092 lea(result, Address(result, tmp, scale1)); 3093 if (int_cnt2 > 0) { // Constant substring 3094 // Repeat search for small substring (< 8 chars) 3095 // from new point without reloading substring. 3096 // Have to check that we don't read beyond string. 3097 cmpl(tmp, stride-int_cnt2); 3098 jccb(Assembler::greater, ADJUST_STR); 3099 // Fall through if matched whole substring. 3100 } else { // non constant 3101 assert(int_cnt2 == -1, "should be != 0"); 3102 3103 addl(tmp, cnt2); 3104 // Found result if we matched whole substring. 3105 cmpl(tmp, stride); 3106 jcc(Assembler::lessEqual, RET_FOUND); 3107 3108 // Repeat search for small substring (<= 8 chars) 3109 // from new point 'str1' without reloading substring. 3110 cmpl(cnt2, stride); 3111 // Have to check that we don't read beyond string. 3112 jccb(Assembler::lessEqual, ADJUST_STR); 3113 3114 Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG; 3115 // Compare the rest of substring (> 8 chars). 3116 movptr(str1, result); 3117 3118 cmpl(tmp, cnt2); 3119 // First 8 chars are already matched. 3120 jccb(Assembler::equal, CHECK_NEXT); 3121 3122 bind(SCAN_SUBSTR); 3123 pcmpestri(vec, Address(str1, 0), mode); 3124 // Need to reload strings pointers if not matched whole vector 3125 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 3126 3127 bind(CHECK_NEXT); 3128 subl(cnt2, stride); 3129 jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring 3130 addptr(str1, 16); 3131 if (ae == StrIntrinsicNode::UL) { 3132 addptr(str2, 8); 3133 } else { 3134 addptr(str2, 16); 3135 } 3136 subl(cnt1, stride); 3137 cmpl(cnt2, stride); // Do not read beyond substring 3138 jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR); 3139 // Back-up strings to avoid reading beyond substring. 
3140 3141 if (ae == StrIntrinsicNode::UL) { 3142 lea(str2, Address(str2, cnt2, scale2, -8)); 3143 lea(str1, Address(str1, cnt2, scale1, -16)); 3144 } else { 3145 lea(str2, Address(str2, cnt2, scale2, -16)); 3146 lea(str1, Address(str1, cnt2, scale1, -16)); 3147 } 3148 subl(cnt1, cnt2); 3149 movl(cnt2, stride); 3150 addl(cnt1, stride); 3151 bind(CONT_SCAN_SUBSTR); 3152 if (ae == StrIntrinsicNode::UL) { 3153 pmovzxbw(vec, Address(str2, 0)); 3154 } else { 3155 movdqu(vec, Address(str2, 0)); 3156 } 3157 jmp(SCAN_SUBSTR); 3158 3159 bind(RET_FOUND_LONG); 3160 movptr(str1, Address(rsp, wordSize)); 3161 } // non constant 3162 3163 bind(RET_FOUND); 3164 // Compute substr offset 3165 subptr(result, str1); 3166 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3167 shrl(result, 1); // index 3168 } 3169 bind(CLEANUP); 3170 pop(rsp); // restore SP 3171 3172 } // string_indexof 3173 3174 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3175 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3176 ShortBranchVerifier sbv(this); 3177 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3178 3179 int stride = 8; 3180 3181 Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP, 3182 SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP, 3183 RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT, 3184 FOUND_SEQ_CHAR, DONE_LABEL; 3185 3186 movptr(result, str1); 3187 if (UseAVX >= 2) { 3188 cmpl(cnt1, stride); 3189 jcc(Assembler::less, SCAN_TO_CHAR); 3190 cmpl(cnt1, 2*stride); 3191 jcc(Assembler::less, SCAN_TO_8_CHAR_INIT); 3192 movdl(vec1, ch); 3193 vpbroadcastw(vec1, vec1, Assembler::AVX_256bit); 3194 vpxor(vec2, vec2); 3195 movl(tmp, cnt1); 3196 andl(tmp, 0xFFFFFFF0); //vector count (in chars) 3197 andl(cnt1,0x0000000F); //tail count (in chars) 3198 3199 bind(SCAN_TO_16_CHAR_LOOP); 3200 vmovdqu(vec3, Address(result, 0)); 3201 vpcmpeqw(vec3, vec3, vec1, 1); 3202 vptest(vec2, vec3); 3203 jcc(Assembler::carryClear, FOUND_CHAR); 3204 addptr(result, 32); 3205 subl(tmp, 2*stride); 3206 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); 3207 jmp(SCAN_TO_8_CHAR); 3208 bind(SCAN_TO_8_CHAR_INIT); 3209 movdl(vec1, ch); 3210 pshuflw(vec1, vec1, 0x00); 3211 pshufd(vec1, vec1, 0); 3212 pxor(vec2, vec2); 3213 } 3214 bind(SCAN_TO_8_CHAR); 3215 cmpl(cnt1, stride); 3216 jcc(Assembler::less, SCAN_TO_CHAR); 3217 if (UseAVX < 2) { 3218 movdl(vec1, ch); 3219 pshuflw(vec1, vec1, 0x00); 3220 pshufd(vec1, vec1, 0); 3221 pxor(vec2, vec2); 3222 } 3223 movl(tmp, cnt1); 3224 andl(tmp, 0xFFFFFFF8); //vector count (in chars) 3225 andl(cnt1,0x00000007); //tail count (in chars) 3226 3227 bind(SCAN_TO_8_CHAR_LOOP); 3228 movdqu(vec3, Address(result, 0)); 3229 pcmpeqw(vec3, vec1); 3230 ptest(vec2, vec3); 3231 jcc(Assembler::carryClear, FOUND_CHAR); 3232 addptr(result, 16); 3233 subl(tmp, stride); 3234 jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP); 3235 bind(SCAN_TO_CHAR); 3236 testl(cnt1, cnt1); 3237 jcc(Assembler::zero, RET_NOT_FOUND); 3238 bind(SCAN_TO_CHAR_LOOP); 3239 load_unsigned_short(tmp, Address(result, 0)); 3240 cmpl(ch, tmp); 3241 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3242 addptr(result, 2); 3243 subl(cnt1, 1); 3244 jccb(Assembler::zero, RET_NOT_FOUND); 3245 jmp(SCAN_TO_CHAR_LOOP); 3246 3247 bind(RET_NOT_FOUND); 3248 movl(result, -1); 3249 jmpb(DONE_LABEL); 3250 3251 bind(FOUND_CHAR); 3252 if (UseAVX >= 2) { 3253 vpmovmskb(tmp, vec3); 3254 } else { 3255 pmovmskb(tmp, vec3); 3256 } 3257 bsfl(ch, tmp); 3258 addptr(result, ch); 3259 3260 bind(FOUND_SEQ_CHAR); 3261 
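  // 'result' holds the address of the matching char; subtracting the base and
  // halving converts it into a char index (2 bytes per UTF-16 char).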
subptr(result, str1); 3262 shrl(result, 1); 3263 3264 bind(DONE_LABEL); 3265 } // string_indexof_char 3266 3267 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3268 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3269 ShortBranchVerifier sbv(this); 3270 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3271 3272 int stride = 16; 3273 3274 Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP, 3275 SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP, 3276 RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT, 3277 FOUND_SEQ_CHAR, DONE_LABEL; 3278 3279 movptr(result, str1); 3280 if (UseAVX >= 2) { 3281 cmpl(cnt1, stride); 3282 jcc(Assembler::less, SCAN_TO_CHAR_INIT); 3283 cmpl(cnt1, stride*2); 3284 jcc(Assembler::less, SCAN_TO_16_CHAR_INIT); 3285 movdl(vec1, ch); 3286 vpbroadcastb(vec1, vec1, Assembler::AVX_256bit); 3287 vpxor(vec2, vec2); 3288 movl(tmp, cnt1); 3289 andl(tmp, 0xFFFFFFE0); //vector count (in chars) 3290 andl(cnt1,0x0000001F); //tail count (in chars) 3291 3292 bind(SCAN_TO_32_CHAR_LOOP); 3293 vmovdqu(vec3, Address(result, 0)); 3294 vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit); 3295 vptest(vec2, vec3); 3296 jcc(Assembler::carryClear, FOUND_CHAR); 3297 addptr(result, 32); 3298 subl(tmp, stride*2); 3299 jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP); 3300 jmp(SCAN_TO_16_CHAR); 3301 3302 bind(SCAN_TO_16_CHAR_INIT); 3303 movdl(vec1, ch); 3304 pxor(vec2, vec2); 3305 pshufb(vec1, vec2); 3306 } 3307 3308 bind(SCAN_TO_16_CHAR); 3309 cmpl(cnt1, stride); 3310 jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left 3311 if (UseAVX < 2) { 3312 movdl(vec1, ch); 3313 pxor(vec2, vec2); 3314 pshufb(vec1, vec2); 3315 } 3316 movl(tmp, cnt1); 3317 andl(tmp, 0xFFFFFFF0); //vector count (in bytes) 3318 andl(cnt1,0x0000000F); //tail count (in bytes) 3319 3320 bind(SCAN_TO_16_CHAR_LOOP); 3321 movdqu(vec3, Address(result, 0)); 3322 pcmpeqb(vec3, vec1); 3323 ptest(vec2, vec3); 3324 jcc(Assembler::carryClear, FOUND_CHAR); 3325 addptr(result, 16); 3326 subl(tmp, stride); 3327 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items... 
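  // Vector scan done; fall through to a byte-at-a-time scan of the remaining
  // tail (cnt1 holds the 0..15 leftover bytes).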
3328 3329 bind(SCAN_TO_CHAR_INIT); 3330 testl(cnt1, cnt1); 3331 jcc(Assembler::zero, RET_NOT_FOUND); 3332 bind(SCAN_TO_CHAR_LOOP); 3333 load_unsigned_byte(tmp, Address(result, 0)); 3334 cmpl(ch, tmp); 3335 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3336 addptr(result, 1); 3337 subl(cnt1, 1); 3338 jccb(Assembler::zero, RET_NOT_FOUND); 3339 jmp(SCAN_TO_CHAR_LOOP); 3340 3341 bind(RET_NOT_FOUND); 3342 movl(result, -1); 3343 jmpb(DONE_LABEL); 3344 3345 bind(FOUND_CHAR); 3346 if (UseAVX >= 2) { 3347 vpmovmskb(tmp, vec3); 3348 } else { 3349 pmovmskb(tmp, vec3); 3350 } 3351 bsfl(ch, tmp); 3352 addptr(result, ch); 3353 3354 bind(FOUND_SEQ_CHAR); 3355 subptr(result, str1); 3356 3357 bind(DONE_LABEL); 3358 } // stringL_indexof_char 3359 3360 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) { 3361 switch (eltype) { 3362 case T_BOOLEAN: return sizeof(jboolean); 3363 case T_BYTE: return sizeof(jbyte); 3364 case T_SHORT: return sizeof(jshort); 3365 case T_CHAR: return sizeof(jchar); 3366 case T_INT: return sizeof(jint); 3367 default: 3368 ShouldNotReachHere(); 3369 return -1; 3370 } 3371 } 3372 3373 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) { 3374 switch (eltype) { 3375 // T_BOOLEAN used as surrogate for unsigned byte 3376 case T_BOOLEAN: movzbl(dst, src); break; 3377 case T_BYTE: movsbl(dst, src); break; 3378 case T_SHORT: movswl(dst, src); break; 3379 case T_CHAR: movzwl(dst, src); break; 3380 case T_INT: movl(dst, src); break; 3381 default: 3382 ShouldNotReachHere(); 3383 } 3384 } 3385 3386 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) { 3387 load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8); 3388 } 3389 3390 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) { 3391 load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8); 3392 } 3393 3394 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) { 3395 const int vlen = Assembler::AVX_256bit; 3396 switch (eltype) { 3397 case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3398 case T_BYTE: vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3399 case T_SHORT: vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3400 case T_CHAR: vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3401 case T_INT: 3402 // do nothing 3403 break; 3404 default: 3405 ShouldNotReachHere(); 3406 } 3407 } 3408 3409 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result, 3410 Register index, Register tmp2, Register tmp3, XMMRegister vnext, 3411 XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3, 3412 XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3, 3413 XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3, 3414 BasicType eltype) { 3415 ShortBranchVerifier sbv(this); 3416 assert(UseAVX >= 2, "AVX2 intrinsics are required"); 3417 assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3); 3418 assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3); 3419 3420 Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN, 3421 SHORT_UNROLLED_LOOP_EXIT, 3422 UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME, 3423 UNROLLED_VECTOR_LOOP_BEGIN, 3424 END; 3425 switch (eltype) { 3426 case T_BOOLEAN: 
BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break; 3427 case T_CHAR: BLOCK_COMMENT("arrays_hashcode(char) {"); break; 3428 case T_BYTE: BLOCK_COMMENT("arrays_hashcode(byte) {"); break; 3429 case T_SHORT: BLOCK_COMMENT("arrays_hashcode(short) {"); break; 3430 case T_INT: BLOCK_COMMENT("arrays_hashcode(int) {"); break; 3431 default: BLOCK_COMMENT("arrays_hashcode {"); break; 3432 } 3433 3434 // For "renaming" for readibility of the code 3435 const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 }, 3436 vresult[] = { vresult0, vresult1, vresult2, vresult3 }, 3437 vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 }; 3438 3439 const int elsize = arrays_hashcode_elsize(eltype); 3440 3441 /* 3442 if (cnt1 >= 2) { 3443 if (cnt1 >= 32) { 3444 UNROLLED VECTOR LOOP 3445 } 3446 UNROLLED SCALAR LOOP 3447 } 3448 SINGLE SCALAR 3449 */ 3450 3451 cmpl(cnt1, 32); 3452 jcc(Assembler::less, SHORT_UNROLLED_BEGIN); 3453 3454 // cnt1 >= 32 && generate_vectorized_loop 3455 xorl(index, index); 3456 3457 // vresult = IntVector.zero(I256); 3458 for (int idx = 0; idx < 4; idx++) { 3459 vpxor(vresult[idx], vresult[idx]); 3460 } 3461 // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]); 3462 Register bound = tmp2; 3463 Register next = tmp3; 3464 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint)))); 3465 movl(next, Address(tmp2, 0)); 3466 movdl(vnext, next); 3467 vpbroadcastd(vnext, vnext, Assembler::AVX_256bit); 3468 3469 // index = 0; 3470 // bound = cnt1 & ~(32 - 1); 3471 movl(bound, cnt1); 3472 andl(bound, ~(32 - 1)); 3473 // for (; index < bound; index += 32) { 3474 bind(UNROLLED_VECTOR_LOOP_BEGIN); 3475 // result *= next; 3476 imull(result, next); 3477 // loop fission to upfront the cost of fetching from memory, OOO execution 3478 // can then hopefully do a better job of prefetching 3479 for (int idx = 0; idx < 4; idx++) { 3480 arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype); 3481 } 3482 // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7]; 3483 for (int idx = 0; idx < 4; idx++) { 3484 vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit); 3485 arrays_hashcode_elvcast(vtmp[idx], eltype); 3486 vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit); 3487 } 3488 // index += 32; 3489 addl(index, 32); 3490 // index < bound; 3491 cmpl(index, bound); 3492 jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN); 3493 // } 3494 3495 lea(ary1, Address(ary1, bound, Address::times(elsize))); 3496 subl(cnt1, bound); 3497 // release bound 3498 3499 // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1); 3500 for (int idx = 0; idx < 4; idx++) { 3501 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint)))); 3502 arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT); 3503 vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit); 3504 } 3505 // result += vresult.reduceLanes(ADD); 3506 for (int idx = 0; idx < 4; idx++) { 3507 reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]); 3508 } 3509 3510 // } else if (cnt1 < 32) { 3511 3512 bind(SHORT_UNROLLED_BEGIN); 3513 // int i = 1; 3514 movl(index, 1); 3515 cmpl(index, cnt1); 3516 jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT); 3517 3518 // for (; i < cnt1 ; i += 2) { 3519 bind(SHORT_UNROLLED_LOOP_BEGIN); 3520 movl(tmp3, 961); 3521 imull(result, tmp3); 3522 
arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3523 movl(tmp3, tmp2); 3524 shll(tmp3, 5); 3525 subl(tmp3, tmp2); 3526 addl(result, tmp3); 3527 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype); 3528 addl(result, tmp3); 3529 addl(index, 2); 3530 cmpl(index, cnt1); 3531 jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN); 3532 3533 // } 3534 // if (i >= cnt1) { 3535 bind(SHORT_UNROLLED_LOOP_EXIT); 3536 jccb(Assembler::greater, END); 3537 movl(tmp2, result); 3538 shll(result, 5); 3539 subl(result, tmp2); 3540 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3541 addl(result, tmp3); 3542 // } 3543 bind(END); 3544 3545 BLOCK_COMMENT("} // arrays_hashcode"); 3546 3547 } // arrays_hashcode 3548 3549 // helper function for string_compare 3550 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2, 3551 Address::ScaleFactor scale, Address::ScaleFactor scale1, 3552 Address::ScaleFactor scale2, Register index, int ae) { 3553 if (ae == StrIntrinsicNode::LL) { 3554 load_unsigned_byte(elem1, Address(str1, index, scale, 0)); 3555 load_unsigned_byte(elem2, Address(str2, index, scale, 0)); 3556 } else if (ae == StrIntrinsicNode::UU) { 3557 load_unsigned_short(elem1, Address(str1, index, scale, 0)); 3558 load_unsigned_short(elem2, Address(str2, index, scale, 0)); 3559 } else { 3560 load_unsigned_byte(elem1, Address(str1, index, scale1, 0)); 3561 load_unsigned_short(elem2, Address(str2, index, scale2, 0)); 3562 } 3563 } 3564 3565 // Compare strings, used for char[] and byte[]. 3566 void C2_MacroAssembler::string_compare(Register str1, Register str2, 3567 Register cnt1, Register cnt2, Register result, 3568 XMMRegister vec1, int ae, KRegister mask) { 3569 ShortBranchVerifier sbv(this); 3570 Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL; 3571 Label COMPARE_WIDE_VECTORS_LOOP_FAILED; // used only AVX3 3572 int stride, stride2, adr_stride, adr_stride1, adr_stride2; 3573 int stride2x2 = 0x40; 3574 Address::ScaleFactor scale = Address::no_scale; 3575 Address::ScaleFactor scale1 = Address::no_scale; 3576 Address::ScaleFactor scale2 = Address::no_scale; 3577 3578 if (ae != StrIntrinsicNode::LL) { 3579 stride2x2 = 0x20; 3580 } 3581 3582 if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) { 3583 shrl(cnt2, 1); 3584 } 3585 // Compute the minimum of the string lengths and the 3586 // difference of the string lengths (stack). 3587 // Do the conditional move stuff 3588 movl(result, cnt1); 3589 subl(cnt1, cnt2); 3590 push(cnt1); 3591 cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2) 3592 3593 // Is the minimum length zero? 
3594 testl(cnt2, cnt2); 3595 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3596 if (ae == StrIntrinsicNode::LL) { 3597 // Load first bytes 3598 load_unsigned_byte(result, Address(str1, 0)); // result = str1[0] 3599 load_unsigned_byte(cnt1, Address(str2, 0)); // cnt1 = str2[0] 3600 } else if (ae == StrIntrinsicNode::UU) { 3601 // Load first characters 3602 load_unsigned_short(result, Address(str1, 0)); 3603 load_unsigned_short(cnt1, Address(str2, 0)); 3604 } else { 3605 load_unsigned_byte(result, Address(str1, 0)); 3606 load_unsigned_short(cnt1, Address(str2, 0)); 3607 } 3608 subl(result, cnt1); 3609 jcc(Assembler::notZero, POP_LABEL); 3610 3611 if (ae == StrIntrinsicNode::UU) { 3612 // Divide length by 2 to get number of chars 3613 shrl(cnt2, 1); 3614 } 3615 cmpl(cnt2, 1); 3616 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3617 3618 // Check if the strings start at the same location and setup scale and stride 3619 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3620 cmpptr(str1, str2); 3621 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3622 if (ae == StrIntrinsicNode::LL) { 3623 scale = Address::times_1; 3624 stride = 16; 3625 } else { 3626 scale = Address::times_2; 3627 stride = 8; 3628 } 3629 } else { 3630 scale1 = Address::times_1; 3631 scale2 = Address::times_2; 3632 // scale not used 3633 stride = 8; 3634 } 3635 3636 if (UseAVX >= 2 && UseSSE42Intrinsics) { 3637 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR; 3638 Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR; 3639 Label COMPARE_WIDE_VECTORS_LOOP_AVX2; 3640 Label COMPARE_TAIL_LONG; 3641 Label COMPARE_WIDE_VECTORS_LOOP_AVX3; // used only AVX3 3642 3643 int pcmpmask = 0x19; 3644 if (ae == StrIntrinsicNode::LL) { 3645 pcmpmask &= ~0x01; 3646 } 3647 3648 // Setup to compare 16-chars (32-bytes) vectors, 3649 // start from first character again because it has aligned address. 3650 if (ae == StrIntrinsicNode::LL) { 3651 stride2 = 32; 3652 } else { 3653 stride2 = 16; 3654 } 3655 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3656 adr_stride = stride << scale; 3657 } else { 3658 adr_stride1 = 8; //stride << scale1; 3659 adr_stride2 = 16; //stride << scale2; 3660 } 3661 3662 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3663 // rax and rdx are used by pcmpestri as elements counters 3664 movl(result, cnt2); 3665 andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count 3666 jcc(Assembler::zero, COMPARE_TAIL_LONG); 3667 3668 // fast path : compare first 2 8-char vectors. 
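    // pcmpestri runs in 'string compare with negated result' mode here: CF is
    // set when some element in the compared window differs and rcx receives
    // the index of the first mismatch.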
3669 bind(COMPARE_16_CHARS); 3670 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3671 movdqu(vec1, Address(str1, 0)); 3672 } else { 3673 pmovzxbw(vec1, Address(str1, 0)); 3674 } 3675 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3676 jccb(Assembler::below, COMPARE_INDEX_CHAR); 3677 3678 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3679 movdqu(vec1, Address(str1, adr_stride)); 3680 pcmpestri(vec1, Address(str2, adr_stride), pcmpmask); 3681 } else { 3682 pmovzxbw(vec1, Address(str1, adr_stride1)); 3683 pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask); 3684 } 3685 jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS); 3686 addl(cnt1, stride); 3687 3688 // Compare the characters at index in cnt1 3689 bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character 3690 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3691 subl(result, cnt2); 3692 jmp(POP_LABEL); 3693 3694 // Setup the registers to start vector comparison loop 3695 bind(COMPARE_WIDE_VECTORS); 3696 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3697 lea(str1, Address(str1, result, scale)); 3698 lea(str2, Address(str2, result, scale)); 3699 } else { 3700 lea(str1, Address(str1, result, scale1)); 3701 lea(str2, Address(str2, result, scale2)); 3702 } 3703 subl(result, stride2); 3704 subl(cnt2, stride2); 3705 jcc(Assembler::zero, COMPARE_WIDE_TAIL); 3706 negptr(result); 3707 3708 // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest) 3709 bind(COMPARE_WIDE_VECTORS_LOOP); 3710 3711 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 3712 cmpl(cnt2, stride2x2); 3713 jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2); 3714 testl(cnt2, stride2x2-1); // cnt2 holds the vector count 3715 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // means we cannot subtract by 0x40 3716 3717 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 3718 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3719 evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit); 3720 evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3721 } else { 3722 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit); 3723 evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3724 } 3725 kortestql(mask, mask); 3726 jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare 3727 addptr(result, stride2x2); // update since we already compared at this addr 3728 subl(cnt2, stride2x2); // and sub the size too 3729 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3); 3730 3731 vpxor(vec1, vec1); 3732 jmpb(COMPARE_WIDE_TAIL); 3733 }//if (VM_Version::supports_avx512vlbw()) 3734 3735 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3736 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3737 vmovdqu(vec1, Address(str1, result, scale)); 3738 vpxor(vec1, Address(str2, result, scale)); 3739 } else { 3740 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit); 3741 vpxor(vec1, Address(str2, result, scale2)); 3742 } 3743 vptest(vec1, vec1); 3744 jcc(Assembler::notZero, VECTOR_NOT_EQUAL); 3745 addptr(result, stride2); 3746 subl(cnt2, stride2); 3747 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP); 3748 // clean upper bits of YMM registers 3749 vpxor(vec1, vec1); 3750 3751 // compare 
wide vectors tail 3752 bind(COMPARE_WIDE_TAIL); 3753 testptr(result, result); 3754 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3755 3756 movl(result, stride2); 3757 movl(cnt2, result); 3758 negptr(result); 3759 jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3760 3761 // Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors. 3762 bind(VECTOR_NOT_EQUAL); 3763 // clean upper bits of YMM registers 3764 vpxor(vec1, vec1); 3765 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3766 lea(str1, Address(str1, result, scale)); 3767 lea(str2, Address(str2, result, scale)); 3768 } else { 3769 lea(str1, Address(str1, result, scale1)); 3770 lea(str2, Address(str2, result, scale2)); 3771 } 3772 jmp(COMPARE_16_CHARS); 3773 3774 // Compare tail chars, length between 1 to 15 chars 3775 bind(COMPARE_TAIL_LONG); 3776 movl(cnt2, result); 3777 cmpl(cnt2, stride); 3778 jcc(Assembler::less, COMPARE_SMALL_STR); 3779 3780 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3781 movdqu(vec1, Address(str1, 0)); 3782 } else { 3783 pmovzxbw(vec1, Address(str1, 0)); 3784 } 3785 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3786 jcc(Assembler::below, COMPARE_INDEX_CHAR); 3787 subptr(cnt2, stride); 3788 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3789 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3790 lea(str1, Address(str1, result, scale)); 3791 lea(str2, Address(str2, result, scale)); 3792 } else { 3793 lea(str1, Address(str1, result, scale1)); 3794 lea(str2, Address(str2, result, scale2)); 3795 } 3796 negptr(cnt2); 3797 jmpb(WHILE_HEAD_LABEL); 3798 3799 bind(COMPARE_SMALL_STR); 3800 } else if (UseSSE42Intrinsics) { 3801 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL; 3802 int pcmpmask = 0x19; 3803 // Setup to compare 8-char (16-byte) vectors, 3804 // start from first character again because it has aligned address. 
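    // result keeps the full element count for the tail compare; cnt2 is rounded
    // down to a multiple of 'stride' and becomes the vector loop count.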
3805 movl(result, cnt2); 3806 andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count 3807 if (ae == StrIntrinsicNode::LL) { 3808 pcmpmask &= ~0x01; 3809 } 3810 jcc(Assembler::zero, COMPARE_TAIL); 3811 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3812 lea(str1, Address(str1, result, scale)); 3813 lea(str2, Address(str2, result, scale)); 3814 } else { 3815 lea(str1, Address(str1, result, scale1)); 3816 lea(str2, Address(str2, result, scale2)); 3817 } 3818 negptr(result); 3819 3820 // pcmpestri 3821 // inputs: 3822 // vec1- substring 3823 // rax - negative string length (elements count) 3824 // mem - scanned string 3825 // rdx - string length (elements count) 3826 // pcmpmask - cmp mode: 11000 (string compare with negated result) 3827 // + 00 (unsigned bytes) or + 01 (unsigned shorts) 3828 // outputs: 3829 // rcx - first mismatched element index 3830 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3831 3832 bind(COMPARE_WIDE_VECTORS); 3833 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3834 movdqu(vec1, Address(str1, result, scale)); 3835 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 3836 } else { 3837 pmovzxbw(vec1, Address(str1, result, scale1)); 3838 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 3839 } 3840 // After pcmpestri cnt1(rcx) contains mismatched element index 3841 3842 jccb(Assembler::below, VECTOR_NOT_EQUAL); // CF==1 3843 addptr(result, stride); 3844 subptr(cnt2, stride); 3845 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 3846 3847 // compare wide vectors tail 3848 testptr(result, result); 3849 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3850 3851 movl(cnt2, stride); 3852 movl(result, stride); 3853 negptr(result); 3854 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3855 movdqu(vec1, Address(str1, result, scale)); 3856 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 3857 } else { 3858 pmovzxbw(vec1, Address(str1, result, scale1)); 3859 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 3860 } 3861 jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL); 3862 3863 // Mismatched characters in the vectors 3864 bind(VECTOR_NOT_EQUAL); 3865 addptr(cnt1, result); 3866 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3867 subl(result, cnt2); 3868 jmpb(POP_LABEL); 3869 3870 bind(COMPARE_TAIL); // limit is zero 3871 movl(cnt2, result); 3872 // Fallthru to tail compare 3873 } 3874 // Shift str2 and str1 to the end of the arrays, negate min 3875 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3876 lea(str1, Address(str1, cnt2, scale)); 3877 lea(str2, Address(str2, cnt2, scale)); 3878 } else { 3879 lea(str1, Address(str1, cnt2, scale1)); 3880 lea(str2, Address(str2, cnt2, scale2)); 3881 } 3882 decrementl(cnt2); // first character was compared already 3883 negptr(cnt2); 3884 3885 // Compare the rest of the elements 3886 bind(WHILE_HEAD_LABEL); 3887 load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae); 3888 subl(result, cnt1); 3889 jccb(Assembler::notZero, POP_LABEL); 3890 increment(cnt2); 3891 jccb(Assembler::notZero, WHILE_HEAD_LABEL); 3892 3893 // Strings are equal up to min length. Return the length difference. 
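  // The (cnt1 - cnt2) length difference pushed in the prologue becomes the
  // result.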
3894 bind(LENGTH_DIFF_LABEL); 3895 pop(result); 3896 if (ae == StrIntrinsicNode::UU) { 3897 // Divide diff by 2 to get number of chars 3898 sarl(result, 1); 3899 } 3900 jmpb(DONE_LABEL); 3901 3902 if (VM_Version::supports_avx512vlbw()) { 3903 3904 bind(COMPARE_WIDE_VECTORS_LOOP_FAILED); 3905 3906 kmovql(cnt1, mask); 3907 notq(cnt1); 3908 bsfq(cnt2, cnt1); 3909 if (ae != StrIntrinsicNode::LL) { 3910 // Divide diff by 2 to get number of chars 3911 sarl(cnt2, 1); 3912 } 3913 addq(result, cnt2); 3914 if (ae == StrIntrinsicNode::LL) { 3915 load_unsigned_byte(cnt1, Address(str2, result)); 3916 load_unsigned_byte(result, Address(str1, result)); 3917 } else if (ae == StrIntrinsicNode::UU) { 3918 load_unsigned_short(cnt1, Address(str2, result, scale)); 3919 load_unsigned_short(result, Address(str1, result, scale)); 3920 } else { 3921 load_unsigned_short(cnt1, Address(str2, result, scale2)); 3922 load_unsigned_byte(result, Address(str1, result, scale1)); 3923 } 3924 subl(result, cnt1); 3925 jmpb(POP_LABEL); 3926 }//if (VM_Version::supports_avx512vlbw()) 3927 3928 // Discard the stored length difference 3929 bind(POP_LABEL); 3930 pop(cnt1); 3931 3932 // That's it 3933 bind(DONE_LABEL); 3934 if(ae == StrIntrinsicNode::UL) { 3935 negl(result); 3936 } 3937 3938 } 3939 3940 // Search for Non-ASCII character (Negative byte value) in a byte array, 3941 // return the index of the first such character, otherwise the length 3942 // of the array segment searched. 3943 // ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java 3944 // @IntrinsicCandidate 3945 // public static int countPositives(byte[] ba, int off, int len) { 3946 // for (int i = off; i < off + len; i++) { 3947 // if (ba[i] < 0) { 3948 // return i - off; 3949 // } 3950 // } 3951 // return len; 3952 // } 3953 void C2_MacroAssembler::count_positives(Register ary1, Register len, 3954 Register result, Register tmp1, 3955 XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) { 3956 // rsi: byte array 3957 // rcx: len 3958 // rax: result 3959 ShortBranchVerifier sbv(this); 3960 assert_different_registers(ary1, len, result, tmp1); 3961 assert_different_registers(vec1, vec2); 3962 Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE; 3963 3964 movl(result, len); // copy 3965 // len == 0 3966 testl(len, len); 3967 jcc(Assembler::zero, DONE); 3968 3969 if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512 3970 VM_Version::supports_avx512vlbw() && 3971 VM_Version::supports_bmi2()) { 3972 3973 Label test_64_loop, test_tail, BREAK_LOOP; 3974 movl(tmp1, len); 3975 vpxor(vec2, vec2, vec2, Assembler::AVX_512bit); 3976 3977 andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F 3978 andl(len, 0xffffffc0); // vector count (in chars) 3979 jccb(Assembler::zero, test_tail); 3980 3981 lea(ary1, Address(ary1, len, Address::times_1)); 3982 negptr(len); 3983 3984 bind(test_64_loop); 3985 // Check whether our 64 elements of size byte contain negatives 3986 evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit); 3987 kortestql(mask1, mask1); 3988 jcc(Assembler::notZero, BREAK_LOOP); 3989 3990 addptr(len, 64); 3991 jccb(Assembler::notZero, test_64_loop); 3992 3993 bind(test_tail); 3994 // bail out when there is nothing to be done 3995 testl(tmp1, -1); 3996 jcc(Assembler::zero, DONE); 3997 3998 3999 // check the tail for absense of negatives 4000 // ~(~0 << len) applied up to two times (for 32-bit scenario) 4001 { 4002 Register tmp3_aliased = len; 4003 mov64(tmp3_aliased, 
0xFFFFFFFFFFFFFFFF); 4004 shlxq(tmp3_aliased, tmp3_aliased, tmp1); 4005 notq(tmp3_aliased); 4006 kmovql(mask2, tmp3_aliased); 4007 } 4008 4009 evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit); 4010 ktestq(mask1, mask2); 4011 jcc(Assembler::zero, DONE); 4012 4013 // do a full check for negative registers in the tail 4014 movl(len, tmp1); // tmp1 holds low 6-bit from original len; 4015 // ary1 already pointing to the right place 4016 jmpb(TAIL_START); 4017 4018 bind(BREAK_LOOP); 4019 // At least one byte in the last 64 byte block was negative. 4020 // Set up to look at the last 64 bytes as if they were a tail 4021 lea(ary1, Address(ary1, len, Address::times_1)); 4022 addptr(result, len); 4023 // Ignore the very last byte: if all others are positive, 4024 // it must be negative, so we can skip right to the 2+1 byte 4025 // end comparison at this point 4026 orl(result, 63); 4027 movl(len, 63); 4028 // Fallthru to tail compare 4029 } else { 4030 4031 if (UseAVX >= 2) { 4032 // With AVX2, use 32-byte vector compare 4033 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 4034 4035 // Compare 32-byte vectors 4036 testl(len, 0xffffffe0); // vector count (in bytes) 4037 jccb(Assembler::zero, TAIL_START); 4038 4039 andl(len, 0xffffffe0); 4040 lea(ary1, Address(ary1, len, Address::times_1)); 4041 negptr(len); 4042 4043 movl(tmp1, 0x80808080); // create mask to test for Unicode chars in vector 4044 movdl(vec2, tmp1); 4045 vpbroadcastd(vec2, vec2, Assembler::AVX_256bit); 4046 4047 bind(COMPARE_WIDE_VECTORS); 4048 vmovdqu(vec1, Address(ary1, len, Address::times_1)); 4049 vptest(vec1, vec2); 4050 jccb(Assembler::notZero, BREAK_LOOP); 4051 addptr(len, 32); 4052 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4053 4054 testl(result, 0x0000001f); // any bytes remaining? 4055 jcc(Assembler::zero, DONE); 4056 4057 // Quick test using the already prepared vector mask 4058 movl(len, result); 4059 andl(len, 0x0000001f); 4060 vmovdqu(vec1, Address(ary1, len, Address::times_1, -32)); 4061 vptest(vec1, vec2); 4062 jcc(Assembler::zero, DONE); 4063 // There are zeros, jump to the tail to determine exactly where 4064 jmpb(TAIL_START); 4065 4066 bind(BREAK_LOOP); 4067 // At least one byte in the last 32-byte vector is negative. 4068 // Set up to look at the last 32 bytes as if they were a tail 4069 lea(ary1, Address(ary1, len, Address::times_1)); 4070 addptr(result, len); 4071 // Ignore the very last byte: if all others are positive, 4072 // it must be negative, so we can skip right to the 2+1 byte 4073 // end comparison at this point 4074 orl(result, 31); 4075 movl(len, 31); 4076 // Fallthru to tail compare 4077 } else if (UseSSE42Intrinsics) { 4078 // With SSE4.2, use double quad vector compare 4079 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 4080 4081 // Compare 16-byte vectors 4082 testl(len, 0xfffffff0); // vector count (in bytes) 4083 jcc(Assembler::zero, TAIL_START); 4084 4085 andl(len, 0xfffffff0); 4086 lea(ary1, Address(ary1, len, Address::times_1)); 4087 negptr(len); 4088 4089 movl(tmp1, 0x80808080); 4090 movdl(vec2, tmp1); 4091 pshufd(vec2, vec2, 0); 4092 4093 bind(COMPARE_WIDE_VECTORS); 4094 movdqu(vec1, Address(ary1, len, Address::times_1)); 4095 ptest(vec1, vec2); 4096 jccb(Assembler::notZero, BREAK_LOOP); 4097 addptr(len, 16); 4098 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4099 4100 testl(result, 0x0000000f); // len is zero, any bytes remaining? 
4101 jcc(Assembler::zero, DONE); 4102 4103 // Quick test using the already prepared vector mask 4104 movl(len, result); 4105 andl(len, 0x0000000f); // tail count (in bytes) 4106 movdqu(vec1, Address(ary1, len, Address::times_1, -16)); 4107 ptest(vec1, vec2); 4108 jcc(Assembler::zero, DONE); 4109 jmpb(TAIL_START); 4110 4111 bind(BREAK_LOOP); 4112 // At least one byte in the last 16-byte vector is negative. 4113 // Set up and look at the last 16 bytes as if they were a tail 4114 lea(ary1, Address(ary1, len, Address::times_1)); 4115 addptr(result, len); 4116 // Ignore the very last byte: if all others are positive, 4117 // it must be negative, so we can skip right to the 2+1 byte 4118 // end comparison at this point 4119 orl(result, 15); 4120 movl(len, 15); 4121 // Fallthru to tail compare 4122 } 4123 } 4124 4125 bind(TAIL_START); 4126 // Compare 4-byte vectors 4127 andl(len, 0xfffffffc); // vector count (in bytes) 4128 jccb(Assembler::zero, COMPARE_CHAR); 4129 4130 lea(ary1, Address(ary1, len, Address::times_1)); 4131 negptr(len); 4132 4133 bind(COMPARE_VECTORS); 4134 movl(tmp1, Address(ary1, len, Address::times_1)); 4135 andl(tmp1, 0x80808080); 4136 jccb(Assembler::notZero, TAIL_ADJUST); 4137 addptr(len, 4); 4138 jccb(Assembler::notZero, COMPARE_VECTORS); 4139 4140 // Compare trailing char (final 2-3 bytes), if any 4141 bind(COMPARE_CHAR); 4142 4143 testl(result, 0x2); // tail char 4144 jccb(Assembler::zero, COMPARE_BYTE); 4145 load_unsigned_short(tmp1, Address(ary1, 0)); 4146 andl(tmp1, 0x00008080); 4147 jccb(Assembler::notZero, CHAR_ADJUST); 4148 lea(ary1, Address(ary1, 2)); 4149 4150 bind(COMPARE_BYTE); 4151 testl(result, 0x1); // tail byte 4152 jccb(Assembler::zero, DONE); 4153 load_unsigned_byte(tmp1, Address(ary1, 0)); 4154 testl(tmp1, 0x00000080); 4155 jccb(Assembler::zero, DONE); 4156 subptr(result, 1); 4157 jmpb(DONE); 4158 4159 bind(TAIL_ADJUST); 4160 // there are negative bits in the last 4 byte block. 4161 // Adjust result and check the next three bytes 4162 addptr(result, len); 4163 orl(result, 3); 4164 lea(ary1, Address(ary1, len, Address::times_1)); 4165 jmpb(COMPARE_CHAR); 4166 4167 bind(CHAR_ADJUST); 4168 // We are looking at a char + optional byte tail, and found that one 4169 // of the bytes in the char is negative. Adjust the result, check the 4170 // first byte and readjust if needed. 4171 andl(result, 0xfffffffc); 4172 testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first 4173 jccb(Assembler::notZero, DONE); 4174 addptr(result, 1); 4175 4176 // That's it 4177 bind(DONE); 4178 if (UseAVX >= 2) { 4179 // clean upper bits of YMM registers 4180 vpxor(vec1, vec1); 4181 vpxor(vec2, vec2); 4182 } 4183 } 4184 4185 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings. 4186 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2, 4187 Register limit, Register result, Register chr, 4188 XMMRegister vec1, XMMRegister vec2, bool is_char, 4189 KRegister mask, bool expand_ary2) { 4190 // for expand_ary2, limit is the (smaller) size of the second array. 4191 ShortBranchVerifier sbv(this); 4192 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE; 4193 4194 assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)), 4195 "Expansion only implemented for AVX2"); 4196 4197 int length_offset = arrayOopDesc::length_offset_in_bytes(); 4198 int base_offset = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE); 4199 4200 Address::ScaleFactor scaleFactor = expand_ary2 ? 
Address::times_2 : Address::times_1; 4201 int scaleIncr = expand_ary2 ? 8 : 16; 4202 4203 if (is_array_equ) { 4204 // Check the input args 4205 cmpoop(ary1, ary2); 4206 jcc(Assembler::equal, TRUE_LABEL); 4207 4208 // Need additional checks for arrays_equals. 4209 testptr(ary1, ary1); 4210 jcc(Assembler::zero, FALSE_LABEL); 4211 testptr(ary2, ary2); 4212 jcc(Assembler::zero, FALSE_LABEL); 4213 4214 // Check the lengths 4215 movl(limit, Address(ary1, length_offset)); 4216 cmpl(limit, Address(ary2, length_offset)); 4217 jcc(Assembler::notEqual, FALSE_LABEL); 4218 } 4219 4220 // count == 0 4221 testl(limit, limit); 4222 jcc(Assembler::zero, TRUE_LABEL); 4223 4224 if (is_array_equ) { 4225 // Load array address 4226 lea(ary1, Address(ary1, base_offset)); 4227 lea(ary2, Address(ary2, base_offset)); 4228 } 4229 4230 if (is_array_equ && is_char) { 4231 // arrays_equals when used for char[]. 4232 shll(limit, 1); // byte count != 0 4233 } 4234 movl(result, limit); // copy 4235 4236 if (UseAVX >= 2) { 4237 // With AVX2, use 32-byte vector compare 4238 Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16; 4239 4240 // Compare 32-byte vectors 4241 if (expand_ary2) { 4242 andl(result, 0x0000000f); // tail count (in bytes) 4243 andl(limit, 0xfffffff0); // vector count (in bytes) 4244 jcc(Assembler::zero, COMPARE_TAIL); 4245 } else { 4246 andl(result, 0x0000001f); // tail count (in bytes) 4247 andl(limit, 0xffffffe0); // vector count (in bytes) 4248 jcc(Assembler::zero, COMPARE_TAIL_16); 4249 } 4250 4251 lea(ary1, Address(ary1, limit, scaleFactor)); 4252 lea(ary2, Address(ary2, limit, Address::times_1)); 4253 negptr(limit); 4254 4255 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 4256 Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3; 4257 4258 cmpl(limit, -64); 4259 jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2); 4260 4261 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 4262 4263 evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit); 4264 evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit); 4265 kortestql(mask, mask); 4266 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4267 addptr(limit, 64); // update since we already compared at this addr 4268 cmpl(limit, -64); 4269 jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3); 4270 4271 // At this point we may still need to compare -limit+result bytes. 4272 // We could execute the next two instruction and just continue via non-wide path: 4273 // cmpl(limit, 0); 4274 // jcc(Assembler::equal, COMPARE_TAIL); // true 4275 // But since we stopped at the points ary{1,2}+limit which are 4276 // not farther than 64 bytes from the ends of arrays ary{1,2}+result 4277 // (|limit| <= 32 and result < 32), 4278 // we may just compare the last 64 bytes. 
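      // Re-checking up to 64 bytes that the loop already proved equal is
      // harmless for an equality test.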
4279 // 4280 addptr(result, -64); // it is safe, bc we just came from this area 4281 evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit); 4282 evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit); 4283 kortestql(mask, mask); 4284 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4285 4286 jmp(TRUE_LABEL); 4287 4288 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 4289 4290 }//if (VM_Version::supports_avx512vlbw()) 4291 4292 bind(COMPARE_WIDE_VECTORS); 4293 vmovdqu(vec1, Address(ary1, limit, scaleFactor)); 4294 if (expand_ary2) { 4295 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit); 4296 } else { 4297 vmovdqu(vec2, Address(ary2, limit, Address::times_1)); 4298 } 4299 vpxor(vec1, vec2); 4300 4301 vptest(vec1, vec1); 4302 jcc(Assembler::notZero, FALSE_LABEL); 4303 addptr(limit, scaleIncr * 2); 4304 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4305 4306 testl(result, result); 4307 jcc(Assembler::zero, TRUE_LABEL); 4308 4309 vmovdqu(vec1, Address(ary1, result, scaleFactor, -32)); 4310 if (expand_ary2) { 4311 vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit); 4312 } else { 4313 vmovdqu(vec2, Address(ary2, result, Address::times_1, -32)); 4314 } 4315 vpxor(vec1, vec2); 4316 4317 vptest(vec1, vec1); 4318 jcc(Assembler::notZero, FALSE_LABEL); 4319 jmp(TRUE_LABEL); 4320 4321 bind(COMPARE_TAIL_16); // limit is zero 4322 movl(limit, result); 4323 4324 // Compare 16-byte chunks 4325 andl(result, 0x0000000f); // tail count (in bytes) 4326 andl(limit, 0xfffffff0); // vector count (in bytes) 4327 jcc(Assembler::zero, COMPARE_TAIL); 4328 4329 lea(ary1, Address(ary1, limit, scaleFactor)); 4330 lea(ary2, Address(ary2, limit, Address::times_1)); 4331 negptr(limit); 4332 4333 bind(COMPARE_WIDE_VECTORS_16); 4334 movdqu(vec1, Address(ary1, limit, scaleFactor)); 4335 if (expand_ary2) { 4336 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit); 4337 } else { 4338 movdqu(vec2, Address(ary2, limit, Address::times_1)); 4339 } 4340 pxor(vec1, vec2); 4341 4342 ptest(vec1, vec1); 4343 jcc(Assembler::notZero, FALSE_LABEL); 4344 addptr(limit, scaleIncr); 4345 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16); 4346 4347 bind(COMPARE_TAIL); // limit is zero 4348 movl(limit, result); 4349 // Fallthru to tail compare 4350 } else if (UseSSE42Intrinsics) { 4351 // With SSE4.2, use double quad vector compare 4352 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 4353 4354 // Compare 16-byte vectors 4355 andl(result, 0x0000000f); // tail count (in bytes) 4356 andl(limit, 0xfffffff0); // vector count (in bytes) 4357 jcc(Assembler::zero, COMPARE_TAIL); 4358 4359 lea(ary1, Address(ary1, limit, Address::times_1)); 4360 lea(ary2, Address(ary2, limit, Address::times_1)); 4361 negptr(limit); 4362 4363 bind(COMPARE_WIDE_VECTORS); 4364 movdqu(vec1, Address(ary1, limit, Address::times_1)); 4365 movdqu(vec2, Address(ary2, limit, Address::times_1)); 4366 pxor(vec1, vec2); 4367 4368 ptest(vec1, vec1); 4369 jcc(Assembler::notZero, FALSE_LABEL); 4370 addptr(limit, 16); 4371 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4372 4373 testl(result, result); 4374 jcc(Assembler::zero, TRUE_LABEL); 4375 4376 movdqu(vec1, Address(ary1, result, Address::times_1, -16)); 4377 movdqu(vec2, Address(ary2, result, Address::times_1, -16)); 4378 pxor(vec1, vec2); 4379 4380 ptest(vec1, vec1); 4381 jccb(Assembler::notZero, FALSE_LABEL); 4382 jmpb(TRUE_LABEL); 4383 4384 bind(COMPARE_TAIL); // limit is zero 4385 movl(limit, 
result); 4386 // Fallthru to tail compare 4387 } 4388 4389 // Compare 4-byte vectors 4390 if (expand_ary2) { 4391 testl(result, result); 4392 jccb(Assembler::zero, TRUE_LABEL); 4393 } else { 4394 andl(limit, 0xfffffffc); // vector count (in bytes) 4395 jccb(Assembler::zero, COMPARE_CHAR); 4396 } 4397 4398 lea(ary1, Address(ary1, limit, scaleFactor)); 4399 lea(ary2, Address(ary2, limit, Address::times_1)); 4400 negptr(limit); 4401 4402 bind(COMPARE_VECTORS); 4403 if (expand_ary2) { 4404 // There are no "vector" operations for bytes to shorts 4405 movzbl(chr, Address(ary2, limit, Address::times_1)); 4406 cmpw(Address(ary1, limit, Address::times_2), chr); 4407 jccb(Assembler::notEqual, FALSE_LABEL); 4408 addptr(limit, 1); 4409 jcc(Assembler::notZero, COMPARE_VECTORS); 4410 jmp(TRUE_LABEL); 4411 } else { 4412 movl(chr, Address(ary1, limit, Address::times_1)); 4413 cmpl(chr, Address(ary2, limit, Address::times_1)); 4414 jccb(Assembler::notEqual, FALSE_LABEL); 4415 addptr(limit, 4); 4416 jcc(Assembler::notZero, COMPARE_VECTORS); 4417 } 4418 4419 // Compare trailing char (final 2 bytes), if any 4420 bind(COMPARE_CHAR); 4421 testl(result, 0x2); // tail char 4422 jccb(Assembler::zero, COMPARE_BYTE); 4423 load_unsigned_short(chr, Address(ary1, 0)); 4424 load_unsigned_short(limit, Address(ary2, 0)); 4425 cmpl(chr, limit); 4426 jccb(Assembler::notEqual, FALSE_LABEL); 4427 4428 if (is_array_equ && is_char) { 4429 bind(COMPARE_BYTE); 4430 } else { 4431 lea(ary1, Address(ary1, 2)); 4432 lea(ary2, Address(ary2, 2)); 4433 4434 bind(COMPARE_BYTE); 4435 testl(result, 0x1); // tail byte 4436 jccb(Assembler::zero, TRUE_LABEL); 4437 load_unsigned_byte(chr, Address(ary1, 0)); 4438 load_unsigned_byte(limit, Address(ary2, 0)); 4439 cmpl(chr, limit); 4440 jccb(Assembler::notEqual, FALSE_LABEL); 4441 } 4442 bind(TRUE_LABEL); 4443 movl(result, 1); // return true 4444 jmpb(DONE); 4445 4446 bind(FALSE_LABEL); 4447 xorl(result, result); // return false 4448 4449 // That's it 4450 bind(DONE); 4451 if (UseAVX >= 2) { 4452 // clean upper bits of YMM registers 4453 vpxor(vec1, vec1); 4454 vpxor(vec2, vec2); 4455 } 4456 } 4457 4458 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) { 4459 #define __ masm. 4460 Register dst = stub.data<0>(); 4461 XMMRegister src = stub.data<1>(); 4462 address target = stub.data<2>(); 4463 __ bind(stub.entry()); 4464 __ subptr(rsp, 8); 4465 __ movdbl(Address(rsp), src); 4466 __ call(RuntimeAddress(target)); 4467 // APX REX2 encoding for pop(dst) increases the stub size by 1 byte. 
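  // The fixup stub is expected to overwrite the spilled operand at [rsp] with
  // the corrected integer result, which the pop below moves into dst.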
4468 __ pop(dst); 4469 __ jmp(stub.continuation()); 4470 #undef __ 4471 } 4472 4473 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) { 4474 assert(dst_bt == T_INT || dst_bt == T_LONG, ""); 4475 assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, ""); 4476 4477 address slowpath_target; 4478 if (dst_bt == T_INT) { 4479 if (src_bt == T_FLOAT) { 4480 cvttss2sil(dst, src); 4481 cmpl(dst, 0x80000000); 4482 slowpath_target = StubRoutines::x86::f2i_fixup(); 4483 } else { 4484 cvttsd2sil(dst, src); 4485 cmpl(dst, 0x80000000); 4486 slowpath_target = StubRoutines::x86::d2i_fixup(); 4487 } 4488 } else { 4489 if (src_bt == T_FLOAT) { 4490 cvttss2siq(dst, src); 4491 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4492 slowpath_target = StubRoutines::x86::f2l_fixup(); 4493 } else { 4494 cvttsd2siq(dst, src); 4495 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4496 slowpath_target = StubRoutines::x86::d2l_fixup(); 4497 } 4498 } 4499 4500 // Using the APX extended general purpose registers increases the instruction encoding size by 1 byte. 4501 int max_size = 23 + (UseAPX ? 1 : 0); 4502 auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, max_size, convertF2I_slowpath); 4503 jcc(Assembler::equal, stub->entry()); 4504 bind(stub->continuation()); 4505 } 4506 4507 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4508 XMMRegister src1, int imm8, bool merge, int vlen_enc) { 4509 switch(ideal_opc) { 4510 case Op_LShiftVS: 4511 Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break; 4512 case Op_LShiftVI: 4513 Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break; 4514 case Op_LShiftVL: 4515 Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break; 4516 case Op_RShiftVS: 4517 Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break; 4518 case Op_RShiftVI: 4519 Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break; 4520 case Op_RShiftVL: 4521 Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break; 4522 case Op_URShiftVS: 4523 Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break; 4524 case Op_URShiftVI: 4525 Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break; 4526 case Op_URShiftVL: 4527 Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break; 4528 case Op_RotateRightV: 4529 evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4530 case Op_RotateLeftV: 4531 evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4532 default: 4533 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]); 4534 break; 4535 } 4536 } 4537 4538 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1, 4539 XMMRegister src2, bool is_unsigned, bool merge, int vlen_enc) { 4540 if (is_unsigned) { 4541 evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4542 } else { 4543 evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4544 } 4545 } 4546 4547 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4548 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) { 4549 switch (elem_bt) { 4550 case T_BYTE: 4551 if (ideal_opc == Op_SaturatingAddV) { 4552 evpaddsb(dst, mask, src1, src2, merge, vlen_enc); 4553 } else { 4554 
assert(ideal_opc == Op_SaturatingSubV, ""); 4555 evpsubsb(dst, mask, src1, src2, merge, vlen_enc); 4556 } 4557 break; 4558 case T_SHORT: 4559 if (ideal_opc == Op_SaturatingAddV) { 4560 evpaddsw(dst, mask, src1, src2, merge, vlen_enc); 4561 } else { 4562 assert(ideal_opc == Op_SaturatingSubV, ""); 4563 evpsubsw(dst, mask, src1, src2, merge, vlen_enc); 4564 } 4565 break; 4566 default: 4567 fatal("Unsupported type %s", type2name(elem_bt)); 4568 break; 4569 } 4570 } 4571 4572 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4573 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) { 4574 switch (elem_bt) { 4575 case T_BYTE: 4576 if (ideal_opc == Op_SaturatingAddV) { 4577 evpaddusb(dst, mask, src1, src2, merge, vlen_enc); 4578 } else { 4579 assert(ideal_opc == Op_SaturatingSubV, ""); 4580 evpsubusb(dst, mask, src1, src2, merge, vlen_enc); 4581 } 4582 break; 4583 case T_SHORT: 4584 if (ideal_opc == Op_SaturatingAddV) { 4585 evpaddusw(dst, mask, src1, src2, merge, vlen_enc); 4586 } else { 4587 assert(ideal_opc == Op_SaturatingSubV, ""); 4588 evpsubusw(dst, mask, src1, src2, merge, vlen_enc); 4589 } 4590 break; 4591 default: 4592 fatal("Unsupported type %s", type2name(elem_bt)); 4593 break; 4594 } 4595 } 4596 4597 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1, 4598 Address src2, bool is_unsigned, bool merge, int vlen_enc) { 4599 if (is_unsigned) { 4600 evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4601 } else { 4602 evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4603 } 4604 } 4605 4606 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4607 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4608 switch (elem_bt) { 4609 case T_BYTE: 4610 if (ideal_opc == Op_SaturatingAddV) { 4611 evpaddsb(dst, mask, src1, src2, merge, vlen_enc); 4612 } else { 4613 assert(ideal_opc == Op_SaturatingSubV, ""); 4614 evpsubsb(dst, mask, src1, src2, merge, vlen_enc); 4615 } 4616 break; 4617 case T_SHORT: 4618 if (ideal_opc == Op_SaturatingAddV) { 4619 evpaddsw(dst, mask, src1, src2, merge, vlen_enc); 4620 } else { 4621 assert(ideal_opc == Op_SaturatingSubV, ""); 4622 evpsubsw(dst, mask, src1, src2, merge, vlen_enc); 4623 } 4624 break; 4625 default: 4626 fatal("Unsupported type %s", type2name(elem_bt)); 4627 break; 4628 } 4629 } 4630 4631 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4632 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4633 switch (elem_bt) { 4634 case T_BYTE: 4635 if (ideal_opc == Op_SaturatingAddV) { 4636 evpaddusb(dst, mask, src1, src2, merge, vlen_enc); 4637 } else { 4638 assert(ideal_opc == Op_SaturatingSubV, ""); 4639 evpsubusb(dst, mask, src1, src2, merge, vlen_enc); 4640 } 4641 break; 4642 case T_SHORT: 4643 if (ideal_opc == Op_SaturatingAddV) { 4644 evpaddusw(dst, mask, src1, src2, merge, vlen_enc); 4645 } else { 4646 assert(ideal_opc == Op_SaturatingSubV, ""); 4647 evpsubusw(dst, mask, src1, src2, merge, vlen_enc); 4648 } 4649 break; 4650 default: 4651 fatal("Unsupported type %s", type2name(elem_bt)); 4652 break; 4653 } 4654 } 4655 4656 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4657 XMMRegister src1, XMMRegister src2, 
bool merge, int vlen_enc, 4658 bool is_varshift) { 4659 switch (ideal_opc) { 4660 case Op_AddVB: 4661 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4662 case Op_AddVS: 4663 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4664 case Op_AddVI: 4665 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4666 case Op_AddVL: 4667 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4668 case Op_AddVF: 4669 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4670 case Op_AddVD: 4671 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4672 case Op_SubVB: 4673 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4674 case Op_SubVS: 4675 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4676 case Op_SubVI: 4677 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4678 case Op_SubVL: 4679 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4680 case Op_SubVF: 4681 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4682 case Op_SubVD: 4683 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4684 case Op_MulVS: 4685 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4686 case Op_MulVI: 4687 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4688 case Op_MulVL: 4689 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4690 case Op_MulVF: 4691 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4692 case Op_MulVD: 4693 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4694 case Op_DivVF: 4695 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4696 case Op_DivVD: 4697 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4698 case Op_SqrtVF: 4699 evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break; 4700 case Op_SqrtVD: 4701 evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break; 4702 case Op_AbsVB: 4703 evpabsb(dst, mask, src2, merge, vlen_enc); break; 4704 case Op_AbsVS: 4705 evpabsw(dst, mask, src2, merge, vlen_enc); break; 4706 case Op_AbsVI: 4707 evpabsd(dst, mask, src2, merge, vlen_enc); break; 4708 case Op_AbsVL: 4709 evpabsq(dst, mask, src2, merge, vlen_enc); break; 4710 case Op_FmaVF: 4711 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4712 case Op_FmaVD: 4713 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4714 case Op_VectorRearrange: 4715 evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break; 4716 case Op_LShiftVS: 4717 evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4718 case Op_LShiftVI: 4719 evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4720 case Op_LShiftVL: 4721 evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4722 case Op_RShiftVS: 4723 evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4724 case Op_RShiftVI: 4725 evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4726 case Op_RShiftVL: 4727 evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4728 case Op_URShiftVS: 4729 evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4730 case Op_URShiftVI: 4731 evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4732 case Op_URShiftVL: 4733 evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4734 case Op_RotateLeftV: 4735 evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4736 case Op_RotateRightV: 4737 evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4738 case Op_MaxV: 4739 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4740 case Op_MinV: 4741 evpmins(eType, dst, mask, 
src1, src2, merge, vlen_enc); break; 4742 case Op_UMinV: 4743 evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4744 case Op_UMaxV: 4745 evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4746 case Op_XorV: 4747 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4748 case Op_OrV: 4749 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4750 case Op_AndV: 4751 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4752 default: 4753 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]); 4754 break; 4755 } 4756 } 4757 4758 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4759 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4760 switch (ideal_opc) { 4761 case Op_AddVB: 4762 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4763 case Op_AddVS: 4764 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4765 case Op_AddVI: 4766 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4767 case Op_AddVL: 4768 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4769 case Op_AddVF: 4770 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4771 case Op_AddVD: 4772 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4773 case Op_SubVB: 4774 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4775 case Op_SubVS: 4776 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4777 case Op_SubVI: 4778 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4779 case Op_SubVL: 4780 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4781 case Op_SubVF: 4782 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4783 case Op_SubVD: 4784 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4785 case Op_MulVS: 4786 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4787 case Op_MulVI: 4788 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4789 case Op_MulVL: 4790 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4791 case Op_MulVF: 4792 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4793 case Op_MulVD: 4794 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4795 case Op_DivVF: 4796 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4797 case Op_DivVD: 4798 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4799 case Op_FmaVF: 4800 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4801 case Op_FmaVD: 4802 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4803 case Op_MaxV: 4804 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4805 case Op_MinV: 4806 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4807 case Op_UMaxV: 4808 evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4809 case Op_UMinV: 4810 evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4811 case Op_XorV: 4812 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4813 case Op_OrV: 4814 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4815 case Op_AndV: 4816 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4817 default: 4818 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]); 4819 break; 4820 } 4821 } 4822 4823 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst, 4824 KRegister src1, KRegister src2) { 4825 BasicType etype = T_ILLEGAL; 4826 switch(mask_len) { 4827 case 2: 4828 case 4: 4829 case 8: etype = T_BYTE; break; 4830 case 16: etype = T_SHORT; break; 4831 case 32: etype = T_INT; break; 4832 case 64: etype = T_LONG; break; 
4833 default: fatal("Unsupported type"); break; 4834 } 4835 assert(etype != T_ILLEGAL, ""); 4836 switch(ideal_opc) { 4837 case Op_AndVMask: 4838 kand(etype, dst, src1, src2); break; 4839 case Op_OrVMask: 4840 kor(etype, dst, src1, src2); break; 4841 case Op_XorVMask: 4842 kxor(etype, dst, src1, src2); break; 4843 default: 4844 fatal("Unsupported masked operation"); break; 4845 } 4846 } 4847 4848 /* 4849 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 4850 * If src is NaN, the result is 0. 4851 * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE, 4852 * the result is equal to the value of Integer.MIN_VALUE. 4853 * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE, 4854 * the result is equal to the value of Integer.MAX_VALUE. 4855 */ 4856 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4857 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 4858 Register rscratch, AddressLiteral float_sign_flip, 4859 int vec_enc) { 4860 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4861 Label done; 4862 vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch); 4863 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 4864 vptest(xtmp2, xtmp2, vec_enc); 4865 jccb(Assembler::equal, done); 4866 4867 vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc); 4868 vpxor(xtmp1, xtmp1, xtmp4, vec_enc); 4869 4870 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 4871 vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc); 4872 vblendvps(dst, dst, xtmp4, xtmp3, vec_enc); 4873 4874 // Recompute the mask for remaining special value. 4875 vpxor(xtmp2, xtmp2, xtmp3, vec_enc); 4876 // Extract SRC values corresponding to TRUE mask lanes. 4877 vpand(xtmp4, xtmp2, src, vec_enc); 4878 // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special 4879 // values are set. 
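  // (xtmp1 was flipped earlier from float_sign_flip to 0x7FFFFFFF, i.e. Integer.MAX_VALUE in every
  //  lane, so the vblendvps below then substitutes MAX_VALUE into exactly those +ve special lanes
  //  while the remaining lanes keep their converted result.)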
4880 vpxor(xtmp3, xtmp2, xtmp4, vec_enc); 4881 4882 vblendvps(dst, dst, xtmp1, xtmp3, vec_enc); 4883 bind(done); 4884 } 4885 4886 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4887 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4888 Register rscratch, AddressLiteral float_sign_flip, 4889 int vec_enc) { 4890 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4891 Label done; 4892 evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 4893 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 4894 kortestwl(ktmp1, ktmp1); 4895 jccb(Assembler::equal, done); 4896 4897 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4898 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4899 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 4900 4901 kxorwl(ktmp1, ktmp1, ktmp2); 4902 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4903 vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4904 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 4905 bind(done); 4906 } 4907 4908 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4909 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4910 Register rscratch, AddressLiteral double_sign_flip, 4911 int vec_enc) { 4912 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 4913 4914 Label done; 4915 evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 4916 Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc); 4917 kortestwl(ktmp1, ktmp1); 4918 jccb(Assembler::equal, done); 4919 4920 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4921 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4922 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 4923 4924 kxorwl(ktmp1, ktmp1, ktmp2); 4925 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4926 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4927 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); 4928 bind(done); 4929 } 4930 4931 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4932 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4933 Register rscratch, AddressLiteral float_sign_flip, 4934 int vec_enc) { 4935 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4936 Label done; 4937 evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 4938 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 4939 kortestwl(ktmp1, ktmp1); 4940 jccb(Assembler::equal, done); 4941 4942 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4943 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4944 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 4945 4946 kxorwl(ktmp1, ktmp1, ktmp2); 4947 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4948 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4949 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 4950 bind(done); 4951 } 4952 4953 /* 4954 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 4955 * If src is NaN, the result is 0. 4956 * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE, 4957 * the result is equal to the value of Long.MIN_VALUE. 4958 * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE, 4959 * the result is equal to the value of Long.MAX_VALUE. 
 */
void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                      Register rscratch, AddressLiteral double_sign_flip,
                                                                      int vec_enc) {
  assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");

  Label done;
  evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
  evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);

  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);

  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}

void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
                                                             XMMRegister xtmp, int index, int vec_enc) {
  assert(vec_enc < Assembler::AVX_512bit, "");
  if (vec_enc == Assembler::AVX_256bit) {
    vextractf128_high(xtmp, src);
    vshufps(dst, src, xtmp, index, vec_enc);
  } else {
    vshufps(dst, src, zero, index, vec_enc);
  }
}

void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                                                    XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
                                                                    AddressLiteral float_sign_flip, int src_vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");

  Label done;
  // Compare the destination lanes with the float_sign_flip
  // value to get a mask for all special values.
  movdqu(xtmp1, float_sign_flip, rscratch);
  vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
  ptest(xtmp2, xtmp2);
  jccb(Assembler::equal, done);

  // Flip float_sign_flip to get the max integer value.
  vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
  pxor(xtmp1, xtmp4);

  // Set destination lanes corresponding to unordered source lanes to zero.
  vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
  vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);

  // Shuffle the mask vector and pack the lower double word from each quadword lane.
  vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
  vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);

  // Recompute the mask for the remaining special values.
  pxor(xtmp2, xtmp3);
  // Extract the mask corresponding to non-negative source lanes.
  vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);

  // Shuffle the mask vector and pack the lower double word from each quadword lane.
  vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
  pand(xtmp3, xtmp2);

  // Replace destination lanes holding the special value (0x80000000) with max int
  // if the corresponding source lane holds a +ve value.
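  // (0x80000000 is the "integer indefinite" value that vcvttpd2dq produces for NaN and
  //  out-of-range inputs, which is why it identifies every lane that needs fix-up here.)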
5031 vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit); 5032 bind(done); 5033 } 5034 5035 5036 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero, 5037 XMMRegister xtmp, Register rscratch, int vec_enc) { 5038 switch(to_elem_bt) { 5039 case T_SHORT: 5040 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing"); 5041 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch); 5042 vpackusdw(dst, dst, zero, vec_enc); 5043 if (vec_enc == Assembler::AVX_256bit) { 5044 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc); 5045 } 5046 break; 5047 case T_BYTE: 5048 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing"); 5049 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch); 5050 vpackusdw(dst, dst, zero, vec_enc); 5051 if (vec_enc == Assembler::AVX_256bit) { 5052 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc); 5053 } 5054 vpackuswb(dst, dst, zero, vec_enc); 5055 break; 5056 default: assert(false, "%s", type2name(to_elem_bt)); 5057 } 5058 } 5059 5060 /* 5061 * Algorithm for vector D2L and F2I conversions:- 5062 * a) Perform vector D2L/F2I cast. 5063 * b) Choose fast path if none of the result vector lane contains 0x80000000 value. 5064 * It signifies that source value could be any of the special floating point 5065 * values(NaN,-Inf,Inf,Max,-Min). 5066 * c) Set destination to zero if source is NaN value. 5067 * d) Replace 0x80000000 with MaxInt if source lane contains a +ve value. 5068 */ 5069 5070 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5071 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 5072 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 5073 int to_elem_sz = type2aelembytes(to_elem_bt); 5074 assert(to_elem_sz <= 4, ""); 5075 vcvttps2dq(dst, src, vec_enc); 5076 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc); 5077 if (to_elem_sz < 4) { 5078 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 5079 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc); 5080 } 5081 } 5082 5083 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5084 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip, 5085 Register rscratch, int vec_enc) { 5086 int to_elem_sz = type2aelembytes(to_elem_bt); 5087 assert(to_elem_sz <= 4, ""); 5088 vcvttps2dq(dst, src, vec_enc); 5089 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc); 5090 switch(to_elem_bt) { 5091 case T_INT: 5092 break; 5093 case T_SHORT: 5094 evpmovdw(dst, dst, vec_enc); 5095 break; 5096 case T_BYTE: 5097 evpmovdb(dst, dst, vec_enc); 5098 break; 5099 default: assert(false, "%s", type2name(to_elem_bt)); 5100 } 5101 } 5102 5103 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 5104 KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip, 5105 Register rscratch, int vec_enc) { 5106 evcvttps2qq(dst, src, vec_enc); 5107 vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, 
vec_enc); 5108 } 5109 5110 // Handling for downcasting from double to integer or sub-word types on AVX2. 5111 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5112 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, 5113 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 5114 int to_elem_sz = type2aelembytes(to_elem_bt); 5115 assert(to_elem_sz < 8, ""); 5116 vcvttpd2dq(dst, src, vec_enc); 5117 vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch, 5118 float_sign_flip, vec_enc); 5119 if (to_elem_sz < 4) { 5120 // xtmp4 holds all zero lanes. 5121 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit); 5122 } 5123 } 5124 5125 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, 5126 XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, 5127 KRegister ktmp2, AddressLiteral sign_flip, 5128 Register rscratch, int vec_enc) { 5129 if (VM_Version::supports_avx512dq()) { 5130 evcvttpd2qq(dst, src, vec_enc); 5131 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 5132 switch(to_elem_bt) { 5133 case T_LONG: 5134 break; 5135 case T_INT: 5136 evpmovsqd(dst, dst, vec_enc); 5137 break; 5138 case T_SHORT: 5139 evpmovsqd(dst, dst, vec_enc); 5140 evpmovdw(dst, dst, vec_enc); 5141 break; 5142 case T_BYTE: 5143 evpmovsqd(dst, dst, vec_enc); 5144 evpmovdb(dst, dst, vec_enc); 5145 break; 5146 default: assert(false, "%s", type2name(to_elem_bt)); 5147 } 5148 } else { 5149 assert(type2aelembytes(to_elem_bt) <= 4, ""); 5150 vcvttpd2dq(dst, src, vec_enc); 5151 vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 5152 switch(to_elem_bt) { 5153 case T_INT: 5154 break; 5155 case T_SHORT: 5156 evpmovdw(dst, dst, vec_enc); 5157 break; 5158 case T_BYTE: 5159 evpmovdb(dst, dst, vec_enc); 5160 break; 5161 default: assert(false, "%s", type2name(to_elem_bt)); 5162 } 5163 } 5164 } 5165 5166 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src, 5167 AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5168 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 5169 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5170 // and re-instantiate original MXCSR.RC mode after that. 5171 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5172 5173 mov64(tmp, julong_cast(0.5L)); 5174 evpbroadcastq(xtmp1, tmp, vec_enc); 5175 vaddpd(xtmp1, src , xtmp1, vec_enc); 5176 evcvtpd2qq(dst, xtmp1, vec_enc); 5177 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 5178 double_sign_flip, vec_enc);; 5179 5180 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5181 } 5182 5183 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src, 5184 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5185 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 5186 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5187 // and re-instantiate original MXCSR.RC mode after that. 
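  // For example, under round-toward-negative-infinity:
  //   round(2.3f)  -> floor(2.8f)  ==  2
  //   round(2.5f)  -> floor(3.0f)  ==  3
  //   round(-2.5f) -> floor(-2.0f) == -2
  // which matches Math.round's floor(x + 0.5) definition, including the tie cases.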
5188 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5189 5190 movl(tmp, jint_cast(0.5)); 5191 movq(xtmp1, tmp); 5192 vbroadcastss(xtmp1, xtmp1, vec_enc); 5193 vaddps(xtmp1, src , xtmp1, vec_enc); 5194 vcvtps2dq(dst, xtmp1, vec_enc); 5195 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 5196 float_sign_flip, vec_enc); 5197 5198 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5199 } 5200 5201 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src, 5202 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5203 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) { 5204 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5205 // and re-instantiate original MXCSR.RC mode after that. 5206 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5207 5208 movl(tmp, jint_cast(0.5)); 5209 movq(xtmp1, tmp); 5210 vbroadcastss(xtmp1, xtmp1, vec_enc); 5211 vaddps(xtmp1, src , xtmp1, vec_enc); 5212 vcvtps2dq(dst, xtmp1, vec_enc); 5213 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc); 5214 5215 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5216 } 5217 5218 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 5219 BasicType from_elem_bt, BasicType to_elem_bt) { 5220 switch (from_elem_bt) { 5221 case T_BYTE: 5222 switch (to_elem_bt) { 5223 case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break; 5224 case T_INT: vpmovzxbd(dst, src, vlen_enc); break; 5225 case T_LONG: vpmovzxbq(dst, src, vlen_enc); break; 5226 default: ShouldNotReachHere(); 5227 } 5228 break; 5229 case T_SHORT: 5230 switch (to_elem_bt) { 5231 case T_INT: vpmovzxwd(dst, src, vlen_enc); break; 5232 case T_LONG: vpmovzxwq(dst, src, vlen_enc); break; 5233 default: ShouldNotReachHere(); 5234 } 5235 break; 5236 case T_INT: 5237 assert(to_elem_bt == T_LONG, ""); 5238 vpmovzxdq(dst, src, vlen_enc); 5239 break; 5240 default: 5241 ShouldNotReachHere(); 5242 } 5243 } 5244 5245 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 5246 BasicType from_elem_bt, BasicType to_elem_bt) { 5247 switch (from_elem_bt) { 5248 case T_BYTE: 5249 switch (to_elem_bt) { 5250 case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break; 5251 case T_INT: vpmovsxbd(dst, src, vlen_enc); break; 5252 case T_LONG: vpmovsxbq(dst, src, vlen_enc); break; 5253 default: ShouldNotReachHere(); 5254 } 5255 break; 5256 case T_SHORT: 5257 switch (to_elem_bt) { 5258 case T_INT: vpmovsxwd(dst, src, vlen_enc); break; 5259 case T_LONG: vpmovsxwq(dst, src, vlen_enc); break; 5260 default: ShouldNotReachHere(); 5261 } 5262 break; 5263 case T_INT: 5264 assert(to_elem_bt == T_LONG, ""); 5265 vpmovsxdq(dst, src, vlen_enc); 5266 break; 5267 default: 5268 ShouldNotReachHere(); 5269 } 5270 } 5271 5272 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src, 5273 BasicType dst_bt, BasicType src_bt, int vlen) { 5274 int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen); 5275 assert(vlen_enc != AVX_512bit, ""); 5276 5277 int dst_bt_size = type2aelembytes(dst_bt); 5278 int src_bt_size = type2aelembytes(src_bt); 5279 if (dst_bt_size > src_bt_size) { 5280 switch (dst_bt_size / src_bt_size) { 5281 case 2: vpmovsxbw(dst, src, vlen_enc); break; 5282 case 4: vpmovsxbd(dst, src, vlen_enc); break; 5283 
case 8: vpmovsxbq(dst, src, vlen_enc); break; 5284 default: ShouldNotReachHere(); 5285 } 5286 } else { 5287 assert(dst_bt_size < src_bt_size, ""); 5288 switch (src_bt_size / dst_bt_size) { 5289 case 2: { 5290 if (vlen_enc == AVX_128bit) { 5291 vpacksswb(dst, src, src, vlen_enc); 5292 } else { 5293 vpacksswb(dst, src, src, vlen_enc); 5294 vpermq(dst, dst, 0x08, vlen_enc); 5295 } 5296 break; 5297 } 5298 case 4: { 5299 if (vlen_enc == AVX_128bit) { 5300 vpackssdw(dst, src, src, vlen_enc); 5301 vpacksswb(dst, dst, dst, vlen_enc); 5302 } else { 5303 vpackssdw(dst, src, src, vlen_enc); 5304 vpermq(dst, dst, 0x08, vlen_enc); 5305 vpacksswb(dst, dst, dst, AVX_128bit); 5306 } 5307 break; 5308 } 5309 case 8: { 5310 if (vlen_enc == AVX_128bit) { 5311 vpshufd(dst, src, 0x08, vlen_enc); 5312 vpackssdw(dst, dst, dst, vlen_enc); 5313 vpacksswb(dst, dst, dst, vlen_enc); 5314 } else { 5315 vpshufd(dst, src, 0x08, vlen_enc); 5316 vpermq(dst, dst, 0x08, vlen_enc); 5317 vpackssdw(dst, dst, dst, AVX_128bit); 5318 vpacksswb(dst, dst, dst, AVX_128bit); 5319 } 5320 break; 5321 } 5322 default: ShouldNotReachHere(); 5323 } 5324 } 5325 } 5326 5327 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3, 5328 bool merge, BasicType bt, int vlen_enc) { 5329 if (bt == T_INT) { 5330 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5331 } else { 5332 assert(bt == T_LONG, ""); 5333 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5334 } 5335 } 5336 5337 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3, 5338 bool merge, BasicType bt, int vlen_enc) { 5339 if (bt == T_INT) { 5340 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5341 } else { 5342 assert(bt == T_LONG, ""); 5343 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5344 } 5345 } 5346 5347 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1, 5348 Register rtmp2, XMMRegister xtmp, int mask_len, 5349 int vec_enc) { 5350 int index = 0; 5351 int vindex = 0; 5352 mov64(rtmp1, 0x0101010101010101L); 5353 pdepq(rtmp1, src, rtmp1); 5354 if (mask_len > 8) { 5355 movq(rtmp2, src); 5356 vpxor(xtmp, xtmp, xtmp, vec_enc); 5357 movq(xtmp, rtmp1); 5358 } 5359 movq(dst, rtmp1); 5360 5361 mask_len -= 8; 5362 while (mask_len > 0) { 5363 assert ((mask_len & 0x7) == 0, "mask must be multiple of 8"); 5364 index++; 5365 if ((index % 2) == 0) { 5366 pxor(xtmp, xtmp); 5367 } 5368 mov64(rtmp1, 0x0101010101010101L); 5369 shrq(rtmp2, 8); 5370 pdepq(rtmp1, rtmp2, rtmp1); 5371 pinsrq(xtmp, rtmp1, index % 2); 5372 vindex = index / 2; 5373 if (vindex) { 5374 // Write entire 16 byte vector when both 64 bit 5375 // lanes are update to save redundant instructions. 
5376 if (index % 2) { 5377 vinsertf128(dst, dst, xtmp, vindex); 5378 } 5379 } else { 5380 vmovdqu(dst, xtmp); 5381 } 5382 mask_len -= 8; 5383 } 5384 } 5385 5386 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) { 5387 switch(opc) { 5388 case Op_VectorMaskTrueCount: 5389 popcntq(dst, tmp); 5390 break; 5391 case Op_VectorMaskLastTrue: 5392 if (VM_Version::supports_lzcnt()) { 5393 lzcntq(tmp, tmp); 5394 movl(dst, 63); 5395 subl(dst, tmp); 5396 } else { 5397 movl(dst, -1); 5398 bsrq(tmp, tmp); 5399 cmov32(Assembler::notZero, dst, tmp); 5400 } 5401 break; 5402 case Op_VectorMaskFirstTrue: 5403 if (VM_Version::supports_bmi1()) { 5404 if (masklen < 32) { 5405 orl(tmp, 1 << masklen); 5406 tzcntl(dst, tmp); 5407 } else if (masklen == 32) { 5408 tzcntl(dst, tmp); 5409 } else { 5410 assert(masklen == 64, ""); 5411 tzcntq(dst, tmp); 5412 } 5413 } else { 5414 if (masklen < 32) { 5415 orl(tmp, 1 << masklen); 5416 bsfl(dst, tmp); 5417 } else { 5418 assert(masklen == 32 || masklen == 64, ""); 5419 movl(dst, masklen); 5420 if (masklen == 32) { 5421 bsfl(tmp, tmp); 5422 } else { 5423 bsfq(tmp, tmp); 5424 } 5425 cmov32(Assembler::notZero, dst, tmp); 5426 } 5427 } 5428 break; 5429 case Op_VectorMaskToLong: 5430 assert(dst == tmp, "Dst and tmp should be the same for toLong operations"); 5431 break; 5432 default: assert(false, "Unhandled mask operation"); 5433 } 5434 } 5435 5436 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp, 5437 int masklen, int masksize, int vec_enc) { 5438 assert(VM_Version::supports_popcnt(), ""); 5439 5440 if(VM_Version::supports_avx512bw()) { 5441 kmovql(tmp, mask); 5442 } else { 5443 assert(masklen <= 16, ""); 5444 kmovwl(tmp, mask); 5445 } 5446 5447 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5448 // operations needs to be clipped. 5449 if (masksize < 16 && opc != Op_VectorMaskFirstTrue) { 5450 andq(tmp, (1 << masklen) - 1); 5451 } 5452 5453 vector_mask_operation_helper(opc, dst, tmp, masklen); 5454 } 5455 5456 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp, 5457 Register tmp, int masklen, BasicType bt, int vec_enc) { 5458 assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) || 5459 (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), ""); 5460 assert(VM_Version::supports_popcnt(), ""); 5461 5462 bool need_clip = false; 5463 switch(bt) { 5464 case T_BOOLEAN: 5465 // While masks of other types contain 0, -1; boolean masks contain lane values of 0, 1 5466 vpxor(xtmp, xtmp, xtmp, vec_enc); 5467 vpsubb(xtmp, xtmp, mask, vec_enc); 5468 vpmovmskb(tmp, xtmp, vec_enc); 5469 need_clip = masklen < 16; 5470 break; 5471 case T_BYTE: 5472 vpmovmskb(tmp, mask, vec_enc); 5473 need_clip = masklen < 16; 5474 break; 5475 case T_SHORT: 5476 vpacksswb(xtmp, mask, mask, vec_enc); 5477 if (masklen >= 16) { 5478 vpermpd(xtmp, xtmp, 8, vec_enc); 5479 } 5480 vpmovmskb(tmp, xtmp, Assembler::AVX_128bit); 5481 need_clip = masklen < 16; 5482 break; 5483 case T_INT: 5484 case T_FLOAT: 5485 vmovmskps(tmp, mask, vec_enc); 5486 need_clip = masklen < 4; 5487 break; 5488 case T_LONG: 5489 case T_DOUBLE: 5490 vmovmskpd(tmp, mask, vec_enc); 5491 need_clip = masklen < 2; 5492 break; 5493 default: assert(false, "Unhandled type, %s", type2name(bt)); 5494 } 5495 5496 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5497 // operations needs to be clipped. 
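  // For instance, with masklen == 4 only bits 3:0 of tmp are meaningful, so the clip below
  // reduces to andq(tmp, 0xF) and stray upper bits cannot distort Op_VectorMaskTrueCount or
  // Op_VectorMaskLastTrue.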
5498 if (need_clip && opc != Op_VectorMaskFirstTrue) { 5499 // need_clip implies masklen < 32 5500 andq(tmp, (1 << masklen) - 1); 5501 } 5502 5503 vector_mask_operation_helper(opc, dst, tmp, masklen); 5504 } 5505 5506 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1, 5507 Register rtmp2, int mask_len) { 5508 kmov(rtmp1, src); 5509 andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len))); 5510 mov64(rtmp2, -1L); 5511 pextq(rtmp2, rtmp2, rtmp1); 5512 kmov(dst, rtmp2); 5513 } 5514 5515 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src, 5516 XMMRegister mask, Register rtmp, Register rscratch, 5517 XMMRegister permv, XMMRegister xtmp, BasicType bt, 5518 int vec_enc) { 5519 assert(type2aelembytes(bt) >= 4, ""); 5520 assert(opcode == Op_CompressV || opcode == Op_ExpandV, ""); 5521 address compress_perm_table = nullptr; 5522 address expand_perm_table = nullptr; 5523 if (type2aelembytes(bt) == 8) { 5524 compress_perm_table = StubRoutines::x86::compress_perm_table64(); 5525 expand_perm_table = StubRoutines::x86::expand_perm_table64(); 5526 vmovmskpd(rtmp, mask, vec_enc); 5527 } else { 5528 compress_perm_table = StubRoutines::x86::compress_perm_table32(); 5529 expand_perm_table = StubRoutines::x86::expand_perm_table32(); 5530 vmovmskps(rtmp, mask, vec_enc); 5531 } 5532 shlq(rtmp, 5); // for 32 byte permute row. 5533 if (opcode == Op_CompressV) { 5534 lea(rscratch, ExternalAddress(compress_perm_table)); 5535 } else { 5536 lea(rscratch, ExternalAddress(expand_perm_table)); 5537 } 5538 addptr(rtmp, rscratch); 5539 vmovdqu(permv, Address(rtmp)); 5540 vpermps(dst, permv, src, Assembler::AVX_256bit); 5541 vpxor(xtmp, xtmp, xtmp, vec_enc); 5542 // Blend the result with zero vector using permute mask, each column entry 5543 // in a permute table row contains either a valid permute index or a -1 (default) 5544 // value, this can potentially be used as a blending mask after 5545 // compressing/expanding the source vector lanes. 
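  // (A -1 entry has its sign bit set, and vblendvps selects by the sign bit of each permv
  //  lane, so exactly those positions receive the zero vector from xtmp.)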
5546 vblendvps(dst, dst, xtmp, permv, vec_enc, true, permv); 5547 } 5548 5549 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask, 5550 bool merge, BasicType bt, int vec_enc) { 5551 if (opcode == Op_CompressV) { 5552 switch(bt) { 5553 case T_BYTE: 5554 evpcompressb(dst, mask, src, merge, vec_enc); 5555 break; 5556 case T_CHAR: 5557 case T_SHORT: 5558 evpcompressw(dst, mask, src, merge, vec_enc); 5559 break; 5560 case T_INT: 5561 evpcompressd(dst, mask, src, merge, vec_enc); 5562 break; 5563 case T_FLOAT: 5564 evcompressps(dst, mask, src, merge, vec_enc); 5565 break; 5566 case T_LONG: 5567 evpcompressq(dst, mask, src, merge, vec_enc); 5568 break; 5569 case T_DOUBLE: 5570 evcompresspd(dst, mask, src, merge, vec_enc); 5571 break; 5572 default: 5573 fatal("Unsupported type %s", type2name(bt)); 5574 break; 5575 } 5576 } else { 5577 assert(opcode == Op_ExpandV, ""); 5578 switch(bt) { 5579 case T_BYTE: 5580 evpexpandb(dst, mask, src, merge, vec_enc); 5581 break; 5582 case T_CHAR: 5583 case T_SHORT: 5584 evpexpandw(dst, mask, src, merge, vec_enc); 5585 break; 5586 case T_INT: 5587 evpexpandd(dst, mask, src, merge, vec_enc); 5588 break; 5589 case T_FLOAT: 5590 evexpandps(dst, mask, src, merge, vec_enc); 5591 break; 5592 case T_LONG: 5593 evpexpandq(dst, mask, src, merge, vec_enc); 5594 break; 5595 case T_DOUBLE: 5596 evexpandpd(dst, mask, src, merge, vec_enc); 5597 break; 5598 default: 5599 fatal("Unsupported type %s", type2name(bt)); 5600 break; 5601 } 5602 } 5603 } 5604 5605 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5606 KRegister ktmp1, int vec_enc) { 5607 if (opcode == Op_SignumVD) { 5608 vsubpd(dst, zero, one, vec_enc); 5609 // if src < 0 ? -1 : 1 5610 evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5611 evblendmpd(dst, ktmp1, one, dst, true, vec_enc); 5612 // if src == NaN, -0.0 or 0.0 return src. 5613 evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5614 evblendmpd(dst, ktmp1, dst, src, true, vec_enc); 5615 } else { 5616 assert(opcode == Op_SignumVF, ""); 5617 vsubps(dst, zero, one, vec_enc); 5618 // if src < 0 ? -1 : 1 5619 evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5620 evblendmps(dst, ktmp1, one, dst, true, vec_enc); 5621 // if src == NaN, -0.0 or 0.0 return src. 5622 evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5623 evblendmps(dst, ktmp1, dst, src, true, vec_enc); 5624 } 5625 } 5626 5627 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5628 XMMRegister xtmp1, int vec_enc) { 5629 if (opcode == Op_SignumVD) { 5630 vsubpd(dst, zero, one, vec_enc); 5631 // if src < 0 ? -1 : 1 5632 vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1); 5633 // if src == NaN, -0.0 or 0.0 return src. 5634 vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc); 5635 vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1); 5636 } else { 5637 assert(opcode == Op_SignumVF, ""); 5638 vsubps(dst, zero, one, vec_enc); 5639 // if src < 0 ? -1 : 1 5640 vblendvps(dst, one, dst, src, vec_enc, true, xtmp1); 5641 // if src == NaN, -0.0 or 0.0 return src. 
    vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
    vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
  }
}

void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
  if (VM_Version::supports_avx512bw()) {
    if (mask_len > 32) {
      kmovql(dst, src);
    } else {
      kmovdl(dst, src);
      if (mask_len != 32) {
        kshiftrdl(dst, dst, 32 - mask_len);
      }
    }
  } else {
    assert(mask_len <= 16, "");
    kmovwl(dst, src);
    if (mask_len != 16) {
      kshiftrwl(dst, dst, 16 - mask_len);
    }
  }
}

void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
  int lane_size = type2aelembytes(bt);
  if ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
      (is_subword_type(bt) && VM_Version::supports_avx512vlbw())) {
    movptr(rtmp, imm32);
    switch(lane_size) {
      case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
      case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
      case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
      case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
      default: fatal("Unsupported lane size %d", lane_size);
               break;
    }
  } else {
    movptr(rtmp, imm32);
    movq(dst, rtmp);
    switch(lane_size) {
      case 1 : vpbroadcastb(dst, dst, vec_enc); break;
      case 2 : vpbroadcastw(dst, dst, vec_enc); break;
      case 4 : vpbroadcastd(dst, dst, vec_enc); break;
      case 8 : vpbroadcastq(dst, dst, vec_enc); break;
      default: fatal("Unsupported lane size %d", lane_size);
               break;
    }
  }
}

//
// The following is a lookup-table-based popcount computation algorithm:
// Index   Bit set count
// [ 0000 -> 0,
//   0001 -> 1,
//   0010 -> 1,
//   0011 -> 2,
//   0100 -> 1,
//   0101 -> 2,
//   0110 -> 2,
//   0111 -> 3,
//   1000 -> 1,
//   1001 -> 2,
//   1010 -> 2,
//   1011 -> 3,
//   1100 -> 2,
//   1101 -> 3,
//   1110 -> 3,
//   1111 -> 4 ]
// a. Count the number of 1s in the 4 LSB bits of each byte. These bits are used as
//    shuffle indices for lookup table access.
// b. Right shift each byte of the vector lane by 4 positions.
// c. Count the number of 1s in the 4 MSB bits of each byte. These bits are used as
//    shuffle indices for lookup table access.
// d. Add the bitset counts of the upper and lower 4 bits of each byte.
// e. Unpack double words to quad words and compute the sum of absolute differences of the
//    bitset counts of all the bytes of a quadword.
// f. Perform step e. for the upper 128-bit vector lane.
// g. Pack the bitset counts of the quadwords back to double words.
// h. Unpacking and packing operations are not needed for a 64-bit vector lane.
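//
// Illustrative example for steps a-d on a single byte, src = 0xB5 = 0b1011'0101:
//   lower nibble 0101 -> LUT entry 2
//   upper nibble 1011 -> LUT entry 3
//   popcount(0xB5)    =  2 + 3 = 5
// The vpshufb-based lookups below apply this to every byte of the vector in parallel.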
5722 5723 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5724 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5725 assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), ""); 5726 vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc); 5727 vpsrlw(dst, src, 4, vec_enc); 5728 vpand(dst, dst, xtmp1, vec_enc); 5729 vpand(xtmp1, src, xtmp1, vec_enc); 5730 vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg); 5731 vpshufb(xtmp1, xtmp2, xtmp1, vec_enc); 5732 vpshufb(dst, xtmp2, dst, vec_enc); 5733 vpaddb(dst, dst, xtmp1, vec_enc); 5734 } 5735 5736 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5737 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5738 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5739 // Following code is as per steps e,f,g and h of above algorithm. 5740 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5741 vpunpckhdq(dst, xtmp1, xtmp2, vec_enc); 5742 vpsadbw(dst, dst, xtmp2, vec_enc); 5743 vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc); 5744 vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc); 5745 vpackuswb(dst, xtmp1, dst, vec_enc); 5746 } 5747 5748 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5749 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5750 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5751 // Add the popcount of upper and lower bytes of word. 5752 vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc); 5753 vpsrlw(dst, xtmp1, 8, vec_enc); 5754 vpand(xtmp1, xtmp1, xtmp2, vec_enc); 5755 vpaddw(dst, dst, xtmp1, vec_enc); 5756 } 5757 5758 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5759 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5760 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5761 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5762 vpsadbw(dst, xtmp1, xtmp2, vec_enc); 5763 } 5764 5765 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5766 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5767 switch(bt) { 5768 case T_LONG: 5769 vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5770 break; 5771 case T_INT: 5772 vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5773 break; 5774 case T_CHAR: 5775 case T_SHORT: 5776 vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5777 break; 5778 case T_BYTE: 5779 case T_BOOLEAN: 5780 vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5781 break; 5782 default: 5783 fatal("Unsupported type %s", type2name(bt)); 5784 break; 5785 } 5786 } 5787 5788 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src, 5789 KRegister mask, bool merge, int vec_enc) { 5790 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 5791 switch(bt) { 5792 case T_LONG: 5793 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5794 evpopcntq(dst, mask, src, merge, vec_enc); 5795 break; 5796 case T_INT: 5797 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5798 evpopcntd(dst, mask, src, merge, vec_enc); 5799 break; 5800 case T_CHAR: 5801 case T_SHORT: 5802 assert(VM_Version::supports_avx512_bitalg(), ""); 5803 evpopcntw(dst, mask, src, merge, vec_enc); 5804 break; 5805 case T_BYTE: 5806 case T_BOOLEAN: 5807 assert(VM_Version::supports_avx512_bitalg(), ""); 5808 evpopcntb(dst, mask, 
src, merge, vec_enc); 5809 break; 5810 default: 5811 fatal("Unsupported type %s", type2name(bt)); 5812 break; 5813 } 5814 } 5815 5816 // Bit reversal algorithm first reverses the bits of each byte followed by 5817 // a byte level reversal for multi-byte primitive types (short/int/long). 5818 // Algorithm performs a lookup table access to get reverse bit sequence 5819 // corresponding to a 4 bit value. Thus a reverse bit sequence for a byte 5820 // is obtained by swapping the reverse bit sequences of upper and lower 5821 // nibble of a byte. 5822 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5823 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5824 if (VM_Version::supports_avx512vlbw()) { 5825 5826 // Get the reverse bit sequence of lower nibble of each byte. 5827 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg); 5828 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 5829 evpandq(dst, xtmp2, src, vec_enc); 5830 vpshufb(dst, xtmp1, dst, vec_enc); 5831 vpsllq(dst, dst, 4, vec_enc); 5832 5833 // Get the reverse bit sequence of upper nibble of each byte. 5834 vpandn(xtmp2, xtmp2, src, vec_enc); 5835 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 5836 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 5837 5838 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 5839 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 5840 evporq(xtmp2, dst, xtmp2, vec_enc); 5841 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 5842 5843 } else if(vec_enc == Assembler::AVX_512bit) { 5844 // Shift based bit reversal. 5845 assert(bt == T_LONG || bt == T_INT, ""); 5846 5847 // Swap lower and upper nibble of each byte. 5848 vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc); 5849 5850 // Swap two least and most significant bits of each nibble. 5851 vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc); 5852 5853 // Swap adjacent pair of bits. 5854 evmovdqul(xtmp1, k0, dst, true, vec_enc); 5855 vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc); 5856 5857 evmovdqul(xtmp1, k0, dst, true, vec_enc); 5858 vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc); 5859 } else { 5860 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp); 5861 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 5862 5863 // Get the reverse bit sequence of lower nibble of each byte. 5864 vpand(dst, xtmp2, src, vec_enc); 5865 vpshufb(dst, xtmp1, dst, vec_enc); 5866 vpsllq(dst, dst, 4, vec_enc); 5867 5868 // Get the reverse bit sequence of upper nibble of each byte. 5869 vpandn(xtmp2, xtmp2, src, vec_enc); 5870 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 5871 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 5872 5873 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 5874 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 5875 vpor(xtmp2, dst, xtmp2, vec_enc); 5876 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 5877 } 5878 } 5879 5880 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc, 5881 XMMRegister xtmp, Register rscratch) { 5882 assert(VM_Version::supports_gfni(), ""); 5883 assert(rscratch != noreg || always_reachable(mask), "missing"); 5884 5885 // Galois field instruction based bit reversal based on following algorithm. 
5886 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 5887 vpbroadcastq(xtmp, mask, vec_enc, rscratch); 5888 vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc); 5889 vector_reverse_byte(bt, dst, xtmp, vec_enc); 5890 } 5891 5892 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src, 5893 XMMRegister xtmp1, Register rtmp, int vec_enc) { 5894 vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc); 5895 evpandq(dst, xtmp1, src, vec_enc); 5896 vpsllq(dst, dst, nbits, vec_enc); 5897 vpandn(xtmp1, xtmp1, src, vec_enc); 5898 vpsrlq(xtmp1, xtmp1, nbits, vec_enc); 5899 evporq(dst, dst, xtmp1, vec_enc); 5900 } 5901 5902 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5903 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5904 // Shift based bit reversal. 5905 assert(VM_Version::supports_evex(), ""); 5906 switch(bt) { 5907 case T_LONG: 5908 // Swap upper and lower double word of each quad word. 5909 evprorq(xtmp1, k0, src, 32, true, vec_enc); 5910 evprord(xtmp1, k0, xtmp1, 16, true, vec_enc); 5911 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 5912 break; 5913 case T_INT: 5914 // Swap upper and lower word of each double word. 5915 evprord(xtmp1, k0, src, 16, true, vec_enc); 5916 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 5917 break; 5918 case T_CHAR: 5919 case T_SHORT: 5920 // Swap upper and lower byte of each word. 5921 vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc); 5922 break; 5923 case T_BYTE: 5924 evmovdquq(dst, k0, src, true, vec_enc); 5925 break; 5926 default: 5927 fatal("Unsupported type %s", type2name(bt)); 5928 break; 5929 } 5930 } 5931 5932 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) { 5933 if (bt == T_BYTE) { 5934 if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) { 5935 evmovdquq(dst, k0, src, true, vec_enc); 5936 } else { 5937 vmovdqu(dst, src); 5938 } 5939 return; 5940 } 5941 // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using 5942 // pre-computed shuffle indices. 
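  // For T_INT, for example, the permutation maps bytes (0,1,2,3) of every double word to
  // (3,2,1,0), mirroring what a scalar bswapl would do for each element.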
5943 switch(bt) { 5944 case T_LONG: 5945 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg); 5946 break; 5947 case T_INT: 5948 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg); 5949 break; 5950 case T_CHAR: 5951 case T_SHORT: 5952 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg); 5953 break; 5954 default: 5955 fatal("Unsupported type %s", type2name(bt)); 5956 break; 5957 } 5958 vpshufb(dst, src, dst, vec_enc); 5959 } 5960 5961 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src, 5962 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 5963 KRegister ktmp, Register rtmp, bool merge, int vec_enc) { 5964 assert(is_integral_type(bt), ""); 5965 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 5966 assert(VM_Version::supports_avx512cd(), ""); 5967 switch(bt) { 5968 case T_LONG: 5969 evplzcntq(dst, ktmp, src, merge, vec_enc); 5970 break; 5971 case T_INT: 5972 evplzcntd(dst, ktmp, src, merge, vec_enc); 5973 break; 5974 case T_SHORT: 5975 vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc); 5976 vpunpcklwd(xtmp2, xtmp1, src, vec_enc); 5977 evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc); 5978 vpunpckhwd(dst, xtmp1, src, vec_enc); 5979 evplzcntd(dst, ktmp, dst, merge, vec_enc); 5980 vpackusdw(dst, xtmp2, dst, vec_enc); 5981 break; 5982 case T_BYTE: 5983 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 5984 // accessing the lookup table. 5985 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 5986 // accessing the lookup table. 5987 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 5988 assert(VM_Version::supports_avx512bw(), ""); 5989 evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp); 5990 vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc); 5991 vpand(xtmp2, dst, src, vec_enc); 5992 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 5993 vpsrlw(xtmp3, src, 4, vec_enc); 5994 vpand(xtmp3, dst, xtmp3, vec_enc); 5995 vpshufb(dst, xtmp1, xtmp3, vec_enc); 5996 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 5997 evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc); 5998 evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc); 5999 break; 6000 default: 6001 fatal("Unsupported type %s", type2name(bt)); 6002 break; 6003 } 6004 } 6005 6006 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6007 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6008 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp); 6009 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 6010 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 6011 // accessing the lookup table. 6012 vpand(dst, xtmp2, src, vec_enc); 6013 vpshufb(dst, xtmp1, dst, vec_enc); 6014 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 6015 // accessing the lookup table. 6016 vpsrlw(xtmp3, src, 4, vec_enc); 6017 vpand(xtmp3, xtmp2, xtmp3, vec_enc); 6018 vpshufb(xtmp2, xtmp1, xtmp3, vec_enc); 6019 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 
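  // e.g. byte 0x1C (0b0001'1100): T2 = lut[0b0001] = 3 and the upper nibble is non-zero, so clz == 3;
  //      byte 0x0C (0b0000'1100): T2 = lut[0b0000] = 4, T1 = lut[0b1100] = 0, upper nibble is zero, so clz == T1 + T2 == 4.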
6020 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 6021 vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc); 6022 vpaddb(dst, dst, xtmp2, vec_enc); 6023 vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc); 6024 } 6025 6026 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6027 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6028 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6029 // Add zero counts of lower byte and upper byte of a word if 6030 // upper byte holds a zero value. 6031 vpsrlw(xtmp3, src, 8, vec_enc); 6032 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 6033 vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc); 6034 vpsllw(xtmp2, dst, 8, vec_enc); 6035 vpaddw(xtmp2, xtmp2, dst, vec_enc); 6036 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6037 vpsrlw(dst, dst, 8, vec_enc); 6038 } 6039 6040 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6041 XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) { 6042 // Since IEEE 754 floating point format represents mantissa in 1.0 format 6043 // hence biased exponent can be used to compute leading zero count as per 6044 // following formula:- 6045 // LZCNT = 31 - (biased_exp - 127) 6046 // Special handling has been introduced for Zero, Max_Int and -ve source values. 6047 6048 // Broadcast 0xFF 6049 vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc); 6050 vpsrld(xtmp1, xtmp1, 24, vec_enc); 6051 6052 // Remove the bit to the right of the highest set bit ensuring that the conversion to float cannot round up to a higher 6053 // power of 2, which has a higher exponent than the input. This transformation is valid as only the highest set bit 6054 // contributes to the leading number of zeros. 6055 vpsrld(xtmp2, src, 1, vec_enc); 6056 vpandn(xtmp3, xtmp2, src, vec_enc); 6057 6058 // Extract biased exponent. 6059 vcvtdq2ps(dst, xtmp3, vec_enc); 6060 vpsrld(dst, dst, 23, vec_enc); 6061 vpand(dst, dst, xtmp1, vec_enc); 6062 6063 // Broadcast 127. 6064 vpsrld(xtmp1, xtmp1, 1, vec_enc); 6065 // Exponent = biased_exp - 127 6066 vpsubd(dst, dst, xtmp1, vec_enc); 6067 6068 // Exponent_plus_one = Exponent + 1 6069 vpsrld(xtmp3, xtmp1, 6, vec_enc); 6070 vpaddd(dst, dst, xtmp3, vec_enc); 6071 6072 // Replace -ve exponent with zero, exponent is -ve when src 6073 // lane contains a zero value. 6074 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 6075 vblendvps(dst, dst, xtmp2, dst, vec_enc); 6076 6077 // Rematerialize broadcast 32. 6078 vpslld(xtmp1, xtmp3, 5, vec_enc); 6079 // Exponent is 32 if corresponding source lane contains max_int value. 6080 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 6081 // LZCNT = 32 - exponent_plus_one 6082 vpsubd(dst, xtmp1, dst, vec_enc); 6083 6084 // Replace LZCNT with a value 1 if corresponding source lane 6085 // contains max_int value. 6086 vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc); 6087 6088 // Replace biased_exp with 0 if source lane value is less than zero. 6089 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 6090 vblendvps(dst, dst, xtmp2, src, vec_enc); 6091 } 6092 6093 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6094 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6095 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6096 // Add zero counts of lower word and upper word of a double word if 6097 // upper word holds a zero value. 
6098 vpsrld(xtmp3, src, 16, vec_enc); 6099 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 6100 vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc); 6101 vpslld(xtmp2, dst, 16, vec_enc); 6102 vpaddd(xtmp2, xtmp2, dst, vec_enc); 6103 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6104 vpsrld(dst, dst, 16, vec_enc); 6105 // Add zero counts of lower doubleword and upper doubleword of a 6106 // quadword if upper doubleword holds a zero value. 6107 vpsrlq(xtmp3, src, 32, vec_enc); 6108 vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc); 6109 vpsllq(xtmp2, dst, 32, vec_enc); 6110 vpaddq(xtmp2, xtmp2, dst, vec_enc); 6111 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6112 vpsrlq(dst, dst, 32, vec_enc); 6113 } 6114 6115 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, 6116 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 6117 Register rtmp, int vec_enc) { 6118 assert(is_integral_type(bt), "unexpected type"); 6119 assert(vec_enc < Assembler::AVX_512bit, ""); 6120 switch(bt) { 6121 case T_LONG: 6122 vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6123 break; 6124 case T_INT: 6125 vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc); 6126 break; 6127 case T_SHORT: 6128 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6129 break; 6130 case T_BYTE: 6131 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6132 break; 6133 default: 6134 fatal("Unsupported type %s", type2name(bt)); 6135 break; 6136 } 6137 } 6138 6139 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) { 6140 switch(bt) { 6141 case T_BYTE: 6142 vpsubb(dst, src1, src2, vec_enc); 6143 break; 6144 case T_SHORT: 6145 vpsubw(dst, src1, src2, vec_enc); 6146 break; 6147 case T_INT: 6148 vpsubd(dst, src1, src2, vec_enc); 6149 break; 6150 case T_LONG: 6151 vpsubq(dst, src1, src2, vec_enc); 6152 break; 6153 default: 6154 fatal("Unsupported type %s", type2name(bt)); 6155 break; 6156 } 6157 } 6158 6159 // Trailing zero count computation is based on leading zero count operation as per 6160 // following equation. All AVX3 targets support AVX512CD feature which offers 6161 // direct vector instruction to compute leading zero count. 
6162 // CTZ = PRIM_TYPE_WIDHT - CLZ((x - 1) & ~x) 6163 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src, 6164 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 6165 XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) { 6166 assert(is_integral_type(bt), ""); 6167 // xtmp = -1 6168 vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc); 6169 // xtmp = xtmp + src 6170 vpadd(bt, xtmp4, xtmp4, src, vec_enc); 6171 // xtmp = xtmp & ~src 6172 vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc); 6173 vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc); 6174 vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc); 6175 vpsub(bt, dst, xtmp4, dst, vec_enc); 6176 } 6177 6178 // Trailing zero count computation for AVX2 targets is based on popcount operation as per following equation 6179 // CTZ = PRIM_TYPE_WIDHT - POPC(x | -x) 6180 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6181 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6182 assert(is_integral_type(bt), ""); 6183 // xtmp = 0 6184 vpxor(xtmp3 , xtmp3, xtmp3, vec_enc); 6185 // xtmp = 0 - src 6186 vpsub(bt, xtmp3, xtmp3, src, vec_enc); 6187 // xtmp = xtmp | src 6188 vpor(xtmp3, xtmp3, src, vec_enc); 6189 vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc); 6190 vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc); 6191 vpsub(bt, dst, xtmp1, dst, vec_enc); 6192 } 6193 6194 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) { 6195 Label done; 6196 Label neg_divisor_fastpath; 6197 cmpl(divisor, 0); 6198 jccb(Assembler::less, neg_divisor_fastpath); 6199 xorl(rdx, rdx); 6200 divl(divisor); 6201 jmpb(done); 6202 bind(neg_divisor_fastpath); 6203 // Fastpath for divisor < 0: 6204 // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1) 6205 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned() 6206 movl(rdx, rax); 6207 subl(rdx, divisor); 6208 if (VM_Version::supports_bmi1()) { 6209 andnl(rax, rdx, rax); 6210 } else { 6211 notl(rdx); 6212 andl(rax, rdx); 6213 } 6214 shrl(rax, 31); 6215 bind(done); 6216 } 6217 6218 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) { 6219 Label done; 6220 Label neg_divisor_fastpath; 6221 cmpl(divisor, 0); 6222 jccb(Assembler::less, neg_divisor_fastpath); 6223 xorl(rdx, rdx); 6224 divl(divisor); 6225 jmpb(done); 6226 bind(neg_divisor_fastpath); 6227 // Fastpath when divisor < 0: 6228 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor) 6229 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned() 6230 movl(rdx, rax); 6231 subl(rax, divisor); 6232 if (VM_Version::supports_bmi1()) { 6233 andnl(rax, rax, rdx); 6234 } else { 6235 notl(rax); 6236 andl(rax, rdx); 6237 } 6238 sarl(rax, 31); 6239 andl(rax, divisor); 6240 subl(rdx, rax); 6241 bind(done); 6242 } 6243 6244 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) { 6245 Label done; 6246 Label neg_divisor_fastpath; 6247 6248 cmpl(divisor, 0); 6249 jccb(Assembler::less, neg_divisor_fastpath); 6250 xorl(rdx, rdx); 6251 divl(divisor); 6252 jmpb(done); 6253 bind(neg_divisor_fastpath); 6254 // Fastpath for divisor < 0: 6255 // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1) 6256 // 
remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor) 6257 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in 6258 // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned() 6259 movl(rdx, rax); 6260 subl(rax, divisor); 6261 if (VM_Version::supports_bmi1()) { 6262 andnl(rax, rax, rdx); 6263 } else { 6264 notl(rax); 6265 andl(rax, rdx); 6266 } 6267 movl(tmp, rax); 6268 shrl(rax, 31); // quotient 6269 sarl(tmp, 31); 6270 andl(tmp, divisor); 6271 subl(rdx, tmp); // remainder 6272 bind(done); 6273 } 6274 6275 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1, 6276 XMMRegister xtmp2, Register rtmp) { 6277 if(VM_Version::supports_gfni()) { 6278 // Galois field instruction based bit reversal based on following algorithm. 6279 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 6280 mov64(rtmp, 0x8040201008040201L); 6281 movq(xtmp1, src); 6282 movq(xtmp2, rtmp); 6283 gf2p8affineqb(xtmp1, xtmp2, 0); 6284 movq(dst, xtmp1); 6285 } else { 6286 // Swap even and odd numbered bits. 6287 movl(rtmp, src); 6288 andl(rtmp, 0x55555555); 6289 shll(rtmp, 1); 6290 movl(dst, src); 6291 andl(dst, 0xAAAAAAAA); 6292 shrl(dst, 1); 6293 orl(dst, rtmp); 6294 6295 // Swap LSB and MSB 2 bits of each nibble. 6296 movl(rtmp, dst); 6297 andl(rtmp, 0x33333333); 6298 shll(rtmp, 2); 6299 andl(dst, 0xCCCCCCCC); 6300 shrl(dst, 2); 6301 orl(dst, rtmp); 6302 6303 // Swap LSB and MSB 4 bits of each byte. 6304 movl(rtmp, dst); 6305 andl(rtmp, 0x0F0F0F0F); 6306 shll(rtmp, 4); 6307 andl(dst, 0xF0F0F0F0); 6308 shrl(dst, 4); 6309 orl(dst, rtmp); 6310 } 6311 bswapl(dst); 6312 } 6313 6314 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1, 6315 XMMRegister xtmp2, Register rtmp1, Register rtmp2) { 6316 if(VM_Version::supports_gfni()) { 6317 // Galois field instruction based bit reversal based on following algorithm. 6318 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 6319 mov64(rtmp1, 0x8040201008040201L); 6320 movq(xtmp1, src); 6321 movq(xtmp2, rtmp1); 6322 gf2p8affineqb(xtmp1, xtmp2, 0); 6323 movq(dst, xtmp1); 6324 } else { 6325 // Swap even and odd numbered bits. 6326 movq(rtmp1, src); 6327 mov64(rtmp2, 0x5555555555555555L); 6328 andq(rtmp1, rtmp2); 6329 shlq(rtmp1, 1); 6330 movq(dst, src); 6331 notq(rtmp2); 6332 andq(dst, rtmp2); 6333 shrq(dst, 1); 6334 orq(dst, rtmp1); 6335 6336 // Swap LSB and MSB 2 bits of each nibble. 6337 movq(rtmp1, dst); 6338 mov64(rtmp2, 0x3333333333333333L); 6339 andq(rtmp1, rtmp2); 6340 shlq(rtmp1, 2); 6341 notq(rtmp2); 6342 andq(dst, rtmp2); 6343 shrq(dst, 2); 6344 orq(dst, rtmp1); 6345 6346 // Swap LSB and MSB 4 bits of each byte. 
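// (Worked example, for illustration only and not emitted code: applying the three masked
// swaps above/below to a single hypothetical byte b = 0b10110010 gives
//    b1 = ((b  & 0x55) << 1) | ((b  & 0xAA) >> 1)   // 0b01110001
//    b2 = ((b1 & 0x33) << 2) | ((b1 & 0xCC) >> 2)   // 0b11010100
//    b3 = ((b2 & 0x0F) << 4) | ((b2 & 0xF0) >> 4)   // 0b01001101 == bit-reverse of b
// The bswapq below then reverses the byte order, completing the 64-bit bit reversal.)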
6347 movq(rtmp1, dst); 6348 mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL); 6349 andq(rtmp1, rtmp2); 6350 shlq(rtmp1, 4); 6351 notq(rtmp2); 6352 andq(dst, rtmp2); 6353 shrq(dst, 4); 6354 orq(dst, rtmp1); 6355 } 6356 bswapq(dst); 6357 } 6358 6359 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) { 6360 Label done; 6361 Label neg_divisor_fastpath; 6362 cmpq(divisor, 0); 6363 jccb(Assembler::less, neg_divisor_fastpath); 6364 xorl(rdx, rdx); 6365 divq(divisor); 6366 jmpb(done); 6367 bind(neg_divisor_fastpath); 6368 // Fastpath for divisor < 0: 6369 // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1) 6370 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned() 6371 movq(rdx, rax); 6372 subq(rdx, divisor); 6373 if (VM_Version::supports_bmi1()) { 6374 andnq(rax, rdx, rax); 6375 } else { 6376 notq(rdx); 6377 andq(rax, rdx); 6378 } 6379 shrq(rax, 63); 6380 bind(done); 6381 } 6382 6383 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) { 6384 Label done; 6385 Label neg_divisor_fastpath; 6386 cmpq(divisor, 0); 6387 jccb(Assembler::less, neg_divisor_fastpath); 6388 xorq(rdx, rdx); 6389 divq(divisor); 6390 jmp(done); 6391 bind(neg_divisor_fastpath); 6392 // Fastpath when divisor < 0: 6393 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor) 6394 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned() 6395 movq(rdx, rax); 6396 subq(rax, divisor); 6397 if (VM_Version::supports_bmi1()) { 6398 andnq(rax, rax, rdx); 6399 } else { 6400 notq(rax); 6401 andq(rax, rdx); 6402 } 6403 sarq(rax, 63); 6404 andq(rax, divisor); 6405 subq(rdx, rax); 6406 bind(done); 6407 } 6408 6409 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) { 6410 Label done; 6411 Label neg_divisor_fastpath; 6412 cmpq(divisor, 0); 6413 jccb(Assembler::less, neg_divisor_fastpath); 6414 xorq(rdx, rdx); 6415 divq(divisor); 6416 jmp(done); 6417 bind(neg_divisor_fastpath); 6418 // Fastpath for divisor < 0: 6419 // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1) 6420 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor) 6421 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in 6422 // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned() 6423 movq(rdx, rax); 6424 subq(rax, divisor); 6425 if (VM_Version::supports_bmi1()) { 6426 andnq(rax, rax, rdx); 6427 } else { 6428 notq(rax); 6429 andq(rax, rdx); 6430 } 6431 movq(tmp, rax); 6432 shrq(rax, 63); // quotient 6433 sarq(tmp, 63); 6434 andq(tmp, divisor); 6435 subq(rdx, tmp); // remainder 6436 bind(done); 6437 } 6438 6439 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1, 6440 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp, 6441 int vlen_enc) { 6442 assert(VM_Version::supports_avx512bw(), ""); 6443 // Byte shuffles are inlane operations and indices are determined using 6444 // lower 4 bit of each shuffle lane, thus all shuffle indices are 6445 // normalized to index range 0-15. This makes sure that all the multiples 6446 // of an index value are placed at same relative position in 128 bit 6447 // lane i.e. elements corresponding to shuffle indices 16, 32 and 64 6448 // will be 16th element in their respective 128 bit lanes. 
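// Worked example (for illustration only, not emitted code): with a 512-bit vector, shuffle
// index 37 selects byte 37 & 0xF == 5 of the third 128-bit lane (since 32 <= 37 < 48), and
// the multiples of 16 (16, 32, 48) all reduce to in-lane byte 0 of the second, third and
// fourth lane respectively. vpshufb itself only honours the low 4 bits of each index, so the
// code below supplies the lane selection explicitly: it broadcasts each 128-bit source lane
// in turn and merges the shuffle result only into destination bytes whose shuffle index
// falls in that lane's index range.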
6449 movl(rtmp, 16); 6450 evpbroadcastb(xtmp1, rtmp, vlen_enc); 6451 6452 // Compute a mask for the shuffle vector by comparing indices with the expression INDEX < 16. 6453 // Broadcast the first 128 bit lane across the entire vector, shuffle the vector lanes using 6454 // the original shuffle indices and move the shuffled lanes corresponding to a true 6455 // mask to the destination vector. 6456 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc); 6457 evshufi64x2(xtmp2, src, src, 0x0, vlen_enc); 6458 evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc); 6459 6460 // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32 6461 // and broadcasting second 128 bit lane. 6462 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc); 6463 vpsllq(xtmp2, xtmp1, 0x1, vlen_enc); 6464 evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc); 6465 evshufi64x2(xtmp3, src, src, 0x55, vlen_enc); 6466 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc); 6467 6468 // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48 6469 // and broadcasting third 128 bit lane. 6470 evpcmpb(ktmp, k0, shuffle, xtmp2, Assembler::nlt, true, vlen_enc); 6471 vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc); 6472 evpcmpb(ktmp, ktmp, shuffle, xtmp1, Assembler::lt, true, vlen_enc); 6473 evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc); 6474 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc); 6475 6476 // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64 6477 // and broadcasting fourth 128 bit lane. 6478 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc); 6479 vpsllq(xtmp2, xtmp2, 0x1, vlen_enc); 6480 evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc); 6481 evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc); 6482 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc); 6483 } 6484 6485 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst, 6486 XMMRegister shuffle, XMMRegister src, int vlen_enc) { 6487 if (vlen_enc == AVX_128bit) { 6488 vpermilps(dst, src, shuffle, vlen_enc); 6489 } else if (bt == T_INT) { 6490 vpermd(dst, shuffle, src, vlen_enc); 6491 } else { 6492 assert(bt == T_FLOAT, ""); 6493 vpermps(dst, shuffle, src, vlen_enc); 6494 } 6495 } 6496 6497 void C2_MacroAssembler::efp16sh(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 6498 switch(opcode) { 6499 case Op_AddHF: vaddsh(dst, src1, src2); break; 6500 case Op_SubHF: vsubsh(dst, src1, src2); break; 6501 case Op_MulHF: vmulsh(dst, src1, src2); break; 6502 case Op_DivHF: vdivsh(dst, src1, src2); break; 6503 default: assert(false, "%s", NodeClassNames[opcode]); break; 6504 } 6505 } 6506 6507 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { 6508 switch(elem_bt) { 6509 case T_BYTE: 6510 if (ideal_opc == Op_SaturatingAddV) { 6511 vpaddsb(dst, src1, src2, vlen_enc); 6512 } else { 6513 assert(ideal_opc == Op_SaturatingSubV, ""); 6514 vpsubsb(dst, src1, src2, vlen_enc); 6515 } 6516 break; 6517 case T_SHORT: 6518 if (ideal_opc == Op_SaturatingAddV) { 6519 vpaddsw(dst, src1, src2, vlen_enc); 6520 } else { 6521 assert(ideal_opc == Op_SaturatingSubV, ""); 6522 vpsubsw(dst, src1, src2, vlen_enc); 6523 } 6524 break; 6525 default: 6526 fatal("Unsupported type %s", type2name(elem_bt)); 6527 break; 6528 } 6529 } 6530 6531 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
XMMRegister src2, int vlen_enc) { 6532 switch(elem_bt) { 6533 case T_BYTE: 6534 if (ideal_opc == Op_SaturatingAddV) { 6535 vpaddusb(dst, src1, src2, vlen_enc); 6536 } else { 6537 assert(ideal_opc == Op_SaturatingSubV, ""); 6538 vpsubusb(dst, src1, src2, vlen_enc); 6539 } 6540 break; 6541 case T_SHORT: 6542 if (ideal_opc == Op_SaturatingAddV) { 6543 vpaddusw(dst, src1, src2, vlen_enc); 6544 } else { 6545 assert(ideal_opc == Op_SaturatingSubV, ""); 6546 vpsubusw(dst, src1, src2, vlen_enc); 6547 } 6548 break; 6549 default: 6550 fatal("Unsupported type %s", type2name(elem_bt)); 6551 break; 6552 } 6553 } 6554 6555 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, 6556 XMMRegister src2, KRegister ktmp, int vlen_enc) { 6557 // For unsigned subtraction, overflow happens when the magnitude of the second input is greater than that of the first input. 6558 // overflow_mask = Inp1 <u Inp2 6559 evpcmpu(elem_bt, ktmp, src2, src1, Assembler::lt, vlen_enc); 6560 // Res = overflow_mask ? Zero : INP1 - INP2 (non-commutative and non-associative) 6561 evmasked_op(elem_bt == T_INT ? Op_SubVI : Op_SubVL, elem_bt, ktmp, dst, src1, src2, false, vlen_enc, false); 6562 } 6563 6564 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, 6565 XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) { 6566 // Emulate unsigned comparison using signed comparison 6567 // Mask = Inp1 <u Inp2 => Inp1 + MIN_VALUE < Inp2 + MIN_VALUE 6568 vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc, true); 6569 vpadd(elem_bt, xtmp2, src1, xtmp1, vlen_enc); 6570 vpadd(elem_bt, xtmp1, src2, xtmp1, vlen_enc); 6571 6572 vpcmpgt(elem_bt, xtmp2, xtmp1, xtmp2, vlen_enc); 6573 6574 // Res = INP1 - INP2 (non-commutative and non-associative) 6575 vpsub(elem_bt, dst, src1, src2, vlen_enc); 6576 // Res = Mask ? Zero : Res 6577 vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); 6578 vpblendvb(dst, dst, xtmp1, xtmp2, vlen_enc); 6579 } 6580 6581 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, 6582 XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp, int vlen_enc) { 6583 // Unsigned value ranges comprise only +ve numbers, thus there exists only an upper bound saturation. 6584 // overflow_mask = (SRC1 + SRC2) <u (SRC1 | SRC2) 6585 // Res = Signed Add INP1, INP2 6586 vpadd(elem_bt, dst, src1, src2, vlen_enc); 6587 // T1 = SRC1 | SRC2 6588 vpor(xtmp1, src1, src2, vlen_enc); 6589 // Max_Unsigned = -1 6590 vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc); 6591 // Unsigned compare: Mask = Res <u T1 6592 evpcmpu(elem_bt, ktmp, dst, xtmp1, Assembler::lt, vlen_enc); 6593 // res = Mask ? Max_Unsigned : Res 6594 evpblend(elem_bt, dst, ktmp, dst, xtmp2, true, vlen_enc); 6595 } 6596 6597 // 6598 // Section 2-13 of Hacker's Delight lists the following overflow detection check for the saturating 6599 // unsigned addition operation. 6600 // overflow_mask = ((a & b) | ((a | b) & ~(a + b))) >>> 31 == 1 6601 // 6602 // We empirically determined its semantic equivalence to the following reduced expression 6603 // overflow_mask = (a + b) <u (a | b) 6604 // 6605 // and also verified it through the Alive2 solver.
6606 // (https://alive2.llvm.org/ce/z/XDQ7dY) 6607 // 6608 6609 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, 6610 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, int vlen_enc) { 6611 // Res = Signed Add INP1, INP2 6612 vpadd(elem_bt, dst, src1, src2, vlen_enc); 6613 // Compute T1 = INP1 | INP2 6614 vpor(xtmp3, src1, src2, vlen_enc); 6615 // T1 = Minimum signed value. 6616 vpgenmin_value(elem_bt, xtmp2, xtmp1, vlen_enc, true); 6617 // Convert T1 to signed value, T1 = T1 + MIN_VALUE 6618 vpadd(elem_bt, xtmp3, xtmp3, xtmp2, vlen_enc); 6619 // Convert Res to signed value, Res<s> = Res + MIN_VALUE 6620 vpadd(elem_bt, xtmp2, xtmp2, dst, vlen_enc); 6621 // Compute overflow detection mask = Res<1> <s T1 6622 if (elem_bt == T_INT) { 6623 vpcmpgtd(xtmp3, xtmp3, xtmp2, vlen_enc); 6624 } else { 6625 assert(elem_bt == T_LONG, ""); 6626 vpcmpgtq(xtmp3, xtmp3, xtmp2, vlen_enc); 6627 } 6628 vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc); 6629 } 6630 6631 void C2_MacroAssembler::evpmovq2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 6632 int vlen_enc, bool xtmp2_hold_M1) { 6633 if (VM_Version::supports_avx512dq()) { 6634 evpmovq2m(ktmp, src, vlen_enc); 6635 } else { 6636 assert(VM_Version::supports_evex(), ""); 6637 if (!xtmp2_hold_M1) { 6638 vpternlogq(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc); 6639 } 6640 evpsraq(xtmp1, src, 63, vlen_enc); 6641 evpcmpeqq(ktmp, k0, xtmp1, xtmp2, vlen_enc); 6642 } 6643 } 6644 6645 void C2_MacroAssembler::evpmovd2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 6646 int vlen_enc, bool xtmp2_hold_M1) { 6647 if (VM_Version::supports_avx512dq()) { 6648 evpmovd2m(ktmp, src, vlen_enc); 6649 } else { 6650 assert(VM_Version::supports_evex(), ""); 6651 if (!xtmp2_hold_M1) { 6652 vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc); 6653 } 6654 vpsrad(xtmp1, src, 31, vlen_enc); 6655 Assembler::evpcmpeqd(ktmp, k0, xtmp1, xtmp2, vlen_enc); 6656 } 6657 } 6658 6659 6660 void C2_MacroAssembler::vpsign_extend_dq(BasicType elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) { 6661 if (elem_bt == T_LONG) { 6662 if (VM_Version::supports_evex()) { 6663 evpsraq(dst, src, 63, vlen_enc); 6664 } else { 6665 vpsrad(dst, src, 31, vlen_enc); 6666 vpshufd(dst, dst, 0xF5, vlen_enc); 6667 } 6668 } else { 6669 assert(elem_bt == T_INT, ""); 6670 vpsrad(dst, src, 31, vlen_enc); 6671 } 6672 } 6673 6674 void C2_MacroAssembler::vpgenmax_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) { 6675 if (compute_allones) { 6676 if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) { 6677 vpternlogd(allones, 0xff, allones, allones, vlen_enc); 6678 } else { 6679 vpcmpeqq(allones, allones, allones, vlen_enc); 6680 } 6681 } 6682 if (elem_bt == T_LONG) { 6683 vpsrlq(dst, allones, 1, vlen_enc); 6684 } else { 6685 assert(elem_bt == T_INT, ""); 6686 vpsrld(dst, allones, 1, vlen_enc); 6687 } 6688 } 6689 6690 void C2_MacroAssembler::vpgenmin_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) { 6691 if (compute_allones) { 6692 if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) { 6693 vpternlogd(allones, 0xff, allones, allones, vlen_enc); 6694 } else { 6695 vpcmpeqq(allones, allones, allones, vlen_enc); 6696 } 6697 } 6698 if (elem_bt == T_LONG) { 6699 vpsllq(dst, allones, 63, vlen_enc); 6700 } else { 6701 assert(elem_bt == T_INT, ""); 6702 
vpslld(dst, allones, 31, vlen_enc); 6703 } 6704 } 6705 6706 void C2_MacroAssembler::evpcmpu(BasicType elem_bt, KRegister kmask, XMMRegister src1, XMMRegister src2, 6707 Assembler::ComparisonPredicate cond, int vlen_enc) { 6708 switch(elem_bt) { 6709 case T_LONG: evpcmpuq(kmask, src1, src2, cond, vlen_enc); break; 6710 case T_INT: evpcmpud(kmask, src1, src2, cond, vlen_enc); break; 6711 case T_SHORT: evpcmpuw(kmask, src1, src2, cond, vlen_enc); break; 6712 case T_BYTE: evpcmpub(kmask, src1, src2, cond, vlen_enc); break; 6713 default: fatal("Unsupported type %s", type2name(elem_bt)); break; 6714 } 6715 } 6716 6717 void C2_MacroAssembler::vpcmpgt(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { 6718 switch(elem_bt) { 6719 case T_LONG: vpcmpgtq(dst, src1, src2, vlen_enc); break; 6720 case T_INT: vpcmpgtd(dst, src1, src2, vlen_enc); break; 6721 case T_SHORT: vpcmpgtw(dst, src1, src2, vlen_enc); break; 6722 case T_BYTE: vpcmpgtb(dst, src1, src2, vlen_enc); break; 6723 default: fatal("Unsupported type %s", type2name(elem_bt)); break; 6724 } 6725 } 6726 6727 void C2_MacroAssembler::evpmov_vec_to_mask(BasicType elem_bt, KRegister ktmp, XMMRegister src, XMMRegister xtmp1, 6728 XMMRegister xtmp2, int vlen_enc, bool xtmp2_hold_M1) { 6729 if (elem_bt == T_LONG) { 6730 evpmovq2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1); 6731 } else { 6732 assert(elem_bt == T_INT, ""); 6733 evpmovd2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1); 6734 } 6735 } 6736 6737 void C2_MacroAssembler::vector_addsub_dq_saturating_evex(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, 6738 XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, 6739 KRegister ktmp1, KRegister ktmp2, int vlen_enc) { 6740 assert(elem_bt == T_INT || elem_bt == T_LONG, ""); 6741 // Addition/Subtraction happens over two's complement representation of numbers and is agnostic to signedness. 6742 // Overflow detection based on Hacker's Delight section 2-13. 6743 if (ideal_opc == Op_SaturatingAddV) { 6744 // res = src1 + src2 6745 vpadd(elem_bt, dst, src1, src2, vlen_enc); 6746 // Overflow occurs if result polarity does not comply with equivalent polarity inputs. 6747 // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1 6748 vpxor(xtmp1, dst, src1, vlen_enc); 6749 vpxor(xtmp2, dst, src2, vlen_enc); 6750 vpand(xtmp2, xtmp1, xtmp2, vlen_enc); 6751 } else { 6752 assert(ideal_opc == Op_SaturatingSubV, ""); 6753 // res = src1 - src2 6754 vpsub(elem_bt, dst, src1, src2, vlen_enc); 6755 // Overflow occurs when both inputs have opposite polarity and 6756 // result polarity does not comply with first input polarity. 6757 // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1; 6758 vpxor(xtmp1, src1, src2, vlen_enc); 6759 vpxor(xtmp2, dst, src1, vlen_enc); 6760 vpand(xtmp2, xtmp1, xtmp2, vlen_enc); 6761 } 6762 6763 // Compute overflow detection mask. 6764 evpmov_vec_to_mask(elem_bt, ktmp1, xtmp2, xtmp2, xtmp1, vlen_enc); 6765 // Note: xtmp1 holds -1 in all its lanes after the above call. 6766 6767 // Compute mask based on first input polarity. 6768 evpmov_vec_to_mask(elem_bt, ktmp2, src1, xtmp2, xtmp1, vlen_enc, true); 6769 6770 vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc, true); 6771 vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc); 6772 6773 // Compose a vector of saturating (MAX/MIN) values, where lanes corresponding to 6774 // set bits in first input polarity mask hold the min value.
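// (Worked T_INT example, for illustration only and not emitted code: a saturating add of
// src1 = 0x7fffffff and src2 = 1 gives res = 0x80000000; (res ^ src1) & (res ^ src2) =
// 0xffffffff & 0x80000001 = 0x80000001, whose sign bit is set, so the overflow mask bit for
// that lane is set. Since src1 is non-negative, the lane is saturated to MAX_VALUE =
// 0x7fffffff; had src1 been negative, the same path would select MIN_VALUE = 0x80000000.)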
6775 evpblend(elem_bt, xtmp2, ktmp2, xtmp2, xtmp1, true, vlen_enc); 6776 // Blend destination lanes with saturated values using overflow detection mask. 6777 evpblend(elem_bt, dst, ktmp1, dst, xtmp2, true, vlen_enc); 6778 } 6779 6780 6781 void C2_MacroAssembler::vector_addsub_dq_saturating_avx(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, 6782 XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, 6783 XMMRegister xtmp3, XMMRegister xtmp4, int vlen_enc) { 6784 assert(elem_bt == T_INT || elem_bt == T_LONG, ""); 6785 // Addition/Subtraction happens over two's complement representation of numbers and is agnostic to signedness. 6786 // Overflow detection based on Hacker's Delight section 2-13. 6787 if (ideal_opc == Op_SaturatingAddV) { 6788 // res = src1 + src2 6789 vpadd(elem_bt, dst, src1, src2, vlen_enc); 6790 // Overflow occurs if result polarity does not comply with equivalent polarity inputs. 6791 // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1 6792 vpxor(xtmp1, dst, src1, vlen_enc); 6793 vpxor(xtmp2, dst, src2, vlen_enc); 6794 vpand(xtmp2, xtmp1, xtmp2, vlen_enc); 6795 } else { 6796 assert(ideal_opc == Op_SaturatingSubV, ""); 6797 // res = src1 - src2 6798 vpsub(elem_bt, dst, src1, src2, vlen_enc); 6799 // Overflow occurs when both inputs have opposite polarity and 6800 // result polarity does not comply with first input polarity. 6801 // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1; 6802 vpxor(xtmp1, src1, src2, vlen_enc); 6803 vpxor(xtmp2, dst, src1, vlen_enc); 6804 vpand(xtmp2, xtmp1, xtmp2, vlen_enc); 6805 } 6806 6807 // Sign-extend to compute overflow detection mask. 6808 vpsign_extend_dq(elem_bt, xtmp3, xtmp2, vlen_enc); 6809 6810 vpcmpeqd(xtmp1, xtmp1, xtmp1, vlen_enc); 6811 vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc); 6812 vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc); 6813 6814 // Compose saturating min/max vector using first input polarity mask. 6815 vpsign_extend_dq(elem_bt, xtmp4, src1, vlen_enc); 6816 vpblendvb(xtmp1, xtmp2, xtmp1, xtmp4, vlen_enc); 6817 6818 // Blend result with saturating vector using overflow detection mask.
6819 vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc); 6820 } 6821 6822 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) { 6823 switch(elem_bt) { 6824 case T_BYTE: 6825 if (ideal_opc == Op_SaturatingAddV) { 6826 vpaddsb(dst, src1, src2, vlen_enc); 6827 } else { 6828 assert(ideal_opc == Op_SaturatingSubV, ""); 6829 vpsubsb(dst, src1, src2, vlen_enc); 6830 } 6831 break; 6832 case T_SHORT: 6833 if (ideal_opc == Op_SaturatingAddV) { 6834 vpaddsw(dst, src1, src2, vlen_enc); 6835 } else { 6836 assert(ideal_opc == Op_SaturatingSubV, ""); 6837 vpsubsw(dst, src1, src2, vlen_enc); 6838 } 6839 break; 6840 default: 6841 fatal("Unsupported type %s", type2name(elem_bt)); 6842 break; 6843 } 6844 } 6845 6846 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) { 6847 switch(elem_bt) { 6848 case T_BYTE: 6849 if (ideal_opc == Op_SaturatingAddV) { 6850 vpaddusb(dst, src1, src2, vlen_enc); 6851 } else { 6852 assert(ideal_opc == Op_SaturatingSubV, ""); 6853 vpsubusb(dst, src1, src2, vlen_enc); 6854 } 6855 break; 6856 case T_SHORT: 6857 if (ideal_opc == Op_SaturatingAddV) { 6858 vpaddusw(dst, src1, src2, vlen_enc); 6859 } else { 6860 assert(ideal_opc == Op_SaturatingSubV, ""); 6861 vpsubusw(dst, src1, src2, vlen_enc); 6862 } 6863 break; 6864 default: 6865 fatal("Unsupported type %s", type2name(elem_bt)); 6866 break; 6867 } 6868 } 6869 6870 void C2_MacroAssembler::select_from_two_vectors_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, 6871 XMMRegister src2, int vlen_enc) { 6872 switch(elem_bt) { 6873 case T_BYTE: 6874 evpermi2b(dst, src1, src2, vlen_enc); 6875 break; 6876 case T_SHORT: 6877 evpermi2w(dst, src1, src2, vlen_enc); 6878 break; 6879 case T_INT: 6880 evpermi2d(dst, src1, src2, vlen_enc); 6881 break; 6882 case T_LONG: 6883 evpermi2q(dst, src1, src2, vlen_enc); 6884 break; 6885 case T_FLOAT: 6886 evpermi2ps(dst, src1, src2, vlen_enc); 6887 break; 6888 case T_DOUBLE: 6889 evpermi2pd(dst, src1, src2, vlen_enc); 6890 break; 6891 default: 6892 fatal("Unsupported type %s", type2name(elem_bt)); 6893 break; 6894 } 6895 } 6896 6897 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, bool is_unsigned, int vlen_enc) { 6898 if (is_unsigned) { 6899 vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc); 6900 } else { 6901 vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc); 6902 } 6903 } 6904 6905 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, bool is_unsigned, int vlen_enc) { 6906 if (is_unsigned) { 6907 vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc); 6908 } else { 6909 vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc); 6910 } 6911 } 6912 6913 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { 6914 switch(opcode) { 6915 case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break; 6916 case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break; 6917 case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break; 6918 case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break; 6919 default: assert(false, "%s", NodeClassNames[opcode]); break; 6920 } 6921 } 6922 6923 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, 
XMMRegister src1, Address src2, int vlen_enc) { 6924 switch(opcode) { 6925 case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break; 6926 case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break; 6927 case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break; 6928 case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break; 6929 default: assert(false, "%s", NodeClassNames[opcode]); break; 6930 } 6931 } 6932 6933 void C2_MacroAssembler::scalar_max_min_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, 6934 KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2) { 6935 vector_max_min_fp16(opcode, dst, src1, src2, ktmp, xtmp1, xtmp2, Assembler::AVX_128bit); 6936 } 6937 6938 void C2_MacroAssembler::vector_max_min_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, 6939 KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) { 6940 if (opcode == Op_MaxVHF || opcode == Op_MaxHF) { 6941 // Move sign bits of src2 to mask register. 6942 evpmovw2m(ktmp, src2, vlen_enc); 6943 // xtmp1 = src2 < 0 ? src2 : src1 6944 evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc); 6945 // xtmp2 = src2 < 0 ? src1 : src2 6946 evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc); 6947 // The idea behind the above swapping is to make the second source operand a +ve value. 6948 // As per the instruction semantics, if the values being compared are both 0.0s (of either sign), the value in 6949 // the second source operand is returned. If only one value is a NaN (SNaN or QNaN) for this instruction, 6950 // the second source operand, either a NaN or a valid floating-point value, is returned. 6951 // dst = max(xtmp1, xtmp2) 6952 evmaxph(dst, xtmp1, xtmp2, vlen_enc); 6953 // isNaN = is_unordered_quiet(xtmp1) 6954 evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc); 6955 // The final result is the same as the first source if it is a NaN value; 6956 // in case the second operand holds a NaN value then, as per the above semantics, 6957 // the result is the same as the second operand. 6958 Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc); 6959 } else { 6960 assert(opcode == Op_MinVHF || opcode == Op_MinHF, ""); 6961 // Move sign bits of src1 to mask register. 6962 evpmovw2m(ktmp, src1, vlen_enc); 6963 // xtmp1 = src1 < 0 ? src2 : src1 6964 evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc); 6965 // xtmp2 = src1 < 0 ? src1 : src2 6966 evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc); 6967 // The idea behind the above swapping is to make the second source operand a -ve value. 6968 // As per the instruction semantics, if the values being compared are both 0.0s (of either sign), the value in 6969 // the second source operand is returned. 6970 // If only one value is a NaN (SNaN or QNaN) for this instruction, the second source operand, either a NaN 6971 // or a valid floating-point value, is written to the result. 6972 // dst = min(xtmp1, xtmp2) 6973 evminph(dst, xtmp1, xtmp2, vlen_enc); 6974 // isNaN = is_unordered_quiet(xtmp1) 6975 evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc); 6976 // The final result is the same as the first source if it is a NaN value; 6977 // in case the second operand holds a NaN value then, as per the above semantics, 6978 // the result is the same as the second operand. 6979 Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc); 6980 } 6981 }
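// For reference, the per-lane semantics the blend/swap sequence above implements for Max is
// sketched below as scalar C++ (illustration only, not emitted code; float stands in for the
// half-precision type, and the helper name max_with_java_semantics is hypothetical). Either
// input being NaN produces NaN, and +0.0 is treated as greater than -0.0:
//
//   static float max_with_java_semantics(float a, float b) {
//     if (a != a) return a;              // NaN in the first input -> NaN result
//     if (b != b) return b;              // NaN in the second input -> NaN result
//     if (a == 0.0f && b == 0.0f) {
//       return std::signbit(a) ? b : a;  // prefer +0.0 over -0.0
//     }
//     return a > b ? a : b;
//   }
//
// The Min variant mirrors this with the comparison reversed and -0.0 preferred over +0.0.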