1 /* 2 * Copyright (c) 2020, 2026, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 
 *
 */

#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "oops/methodData.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/output.hpp"
#include "opto/opcodes.hpp"
#include "opto/subnode.hpp"
#include "runtime/globals.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/checkedCast.hpp"
#include "utilities/globalDefinitions.hpp"
#include "utilities/powerOfTwo.hpp"
#include "utilities/sizes.hpp"

// In PRODUCT builds, block comments are not emitted into the disassembly.
#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

// C2 compiled method's prolog code.
//
// Emits the verified-entry prolog:
//  - an optional stack-bang overflow check (when stack_bang_size > 0),
//  - saving of the caller's rbp and allocation of the fixed frame,
//  - an optional "magic cookie" slot used by VerifyStackAtCalls,
//  - and, unless this is a stub, the nmethod entry barrier.
//
// framesize/stack_bang_size arrive including the return-address word that
// the caller's call instruction already pushed; both are adjusted below.
// fp_mode_24b is unused here (kept for interface compatibility with callers).
void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {
  assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");

  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  // Remove word for return addr
  framesize -= wordSize;
  stack_bang_size -= wordSize;

  // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers must bang for them. But be careful, because
  // some VM calls (such as call site linkage) can use several kilobytes of
  // stack. But the stack safety zone should account for that.
  // See bugs 4446381, 4468289, 4497237.
  if (stack_bang_size > 0) {
    generate_stack_overflow_check(stack_bang_size);

    // We always push rbp, so that on return to interpreter rbp, will be
    // restored correctly and we can correct the stack.
    push(rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      mov(rbp, rsp);
    }
    // Remove word for ebp
    framesize -= wordSize;

    // Create frame
    if (framesize) {
      subptr(rsp, framesize);
    }
  } else {
    // No stack bang required: allocate the whole frame in one step and
    // store rbp into its slot instead of pushing it.
    subptr(rsp, framesize);

    // Save RBP register now.
    framesize -= wordSize;
    movptr(Address(rsp, framesize), rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      movptr(rbp, rsp);
      if (framesize > 0) {
        addptr(rbp, framesize);
      }
    }
  }

  if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
    framesize -= wordSize;
    movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
  }

#ifdef ASSERT
  if (VerifyStackAtCalls) {
    // Verify that rsp is (StackAlignmentInBytes - wordSize) past an
    // alignment boundary, i.e. aligned once the return address is counted.
    Label L;
    push(rax);
    mov(rax, rsp);
    andptr(rax, StackAlignmentInBytes-1);
    cmpptr(rax, StackAlignmentInBytes-wordSize);
    pop(rax);
    jcc(Assembler::equal, L);
    STOP("Stack is not properly aligned!");
    bind(L);
  }
#endif

  if (!is_stub) {
    BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
    // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
    Label dummy_slow_path;
    Label dummy_continuation;
    Label* slow_path = &dummy_slow_path;
    Label* continuation = &dummy_continuation;
    if (!Compile::current()->output()->in_scratch_emit_size()) {
      // Use real labels from actual stub when not emitting code for the purpose of measuring its size
      C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
      Compile::current()->output()->add_stub(stub);
      slow_path = &stub->entry();
      continuation = &stub->continuation();
    }
    bs->nmethod_entry_barrier(this, slow_path, continuation);
  }
}

// Map a vector length in bytes to the AVX encoding used when emitting
// VEX/EVEX-prefixed instructions. Lengths below 16 bytes still use the
// 128-bit encoding (the extra lanes are simply ignored by callers).
inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
  switch (vlen_in_bytes) {
    case  4: // fall-through
    case  8: // fall-through
    case 16: return Assembler::AVX_128bit;
    case 32: return Assembler::AVX_256bit;
    case 64: return Assembler::AVX_512bit;

    default: {
      ShouldNotReachHere();
      return Assembler::AVX_NoVec;
    }
  }
}

// fast_lock and fast_unlock used by C2

// Because the transitions from emitted code to the runtime
// monitorenter/exit helper stubs are so slow it's critical that
// we inline both the stack-locking fast path and the inflated fast path.
//
// See also: cmpFastLock and cmpFastUnlock.
//
// What follows is a specialized inline transliteration of the code
// in enter() and exit(). If we're concerned about I$ bloat another
// option would be to emit TrySlowEnter and TrySlowExit methods
// at startup-time.  These methods would accept arguments as
// (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
// indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
// In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP
// Since the helper routines would be called from multiple synchronization
// sites.
//
// An even better approach would be write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods.  That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native could would be
// to park() or unpark() threads.  We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) explicit barriers or fence operations.
//
// TODO:
//
// *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
//    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
//    Given TLAB allocation, Self is usually manifested in a register, so passing it into
//    the lock operators would typically be faster than reifying Self.
//
// *  Ideally I'd define the primitives as:
//       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
//       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
//    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with a rather awkward and brittle register assignments below.
//    Furthermore the register assignments are overconstrained, possibly resulting in
//    sub-optimal code near the synchronization site.
//
// *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
//    Alternately, use a better sp-proximity test.
//
// *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
//    Either one is sufficient to uniquely identify a thread.
//    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
//
// *  Intrinsify notify() and notifyAll() for the common cases where the
//    object is locked by the calling thread but the waitlist is empty.
//    avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
//
// *  use jccb and jmpb instead of jcc and jmp to improve code density.
//    But beware of excessive branch density on AMD Opterons.
//
// *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
//    or failure of the fast path.  If the fast path fails then we pass
//    control to the slow path, typically in C.  In fast_lock and
//    fast_unlock we often branch to DONE_LABEL, just to find that C2
//    will emit a conditional branch immediately after the node.
//    So we have branches to branches and lots of ICC.ZF games.
//    Instead, it might be better to have C2 pass a "FailureLabel"
//    into fast_lock and fast_unlock.  In the case of success, control
//    will drop through the node.  ICC.ZF is undefined at exit.
//    In the case of failure, the node will branch directly to the
//    FailureLabel


// Emit the inline monitorenter fast path. On exit, ZF == 1 means the lock
// was acquired; ZF == 0 means the caller must take the slow path.
//
// obj: object to lock
// box: on-stack box address -- KILLED
// rax: tmp -- KILLED
// t  : tmp -- KILLED
void C2_MacroAssembler::fast_lock(Register obj, Register box, Register rax_reg,
                                  Register t, Register thread) {
  assert(rax_reg == rax, "Used for CAS");
  assert_different_registers(obj, box, rax_reg, t, thread);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. ZF value is irrelevant.
  Label locked;
  // Finish fast lock unsuccessfully. MUST jump with ZF == 0
  Label slow_path;

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds or we need to take the slow-path.
    movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0);
  }

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    // Divert synchronization on value-based classes to the slow path,
    // where the diagnostic is reported.
    load_klass(rax_reg, obj, t);
    testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
    jcc(Assembler::notZero, slow_path);
  }

  const Register mark = t;

  { // Fast Lock

    Label push;

    const Register top = UseObjectMonitorTable ? rax_reg : box;

    // Load the mark.
    movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Prefetch top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    // Check for monitor (0b10).
    testptr(mark, markWord::monitor_value);
    jcc(Assembler::notZero, inflated);

    // Check if lock-stack is full.
    cmpl(top, LockStack::end_offset() - 1);
    jcc(Assembler::greater, slow_path);

    // Check if recursive: obj already on top of this thread's lock-stack.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    jccb(Assembler::equal, push);

    // Try to lock. Transition lock bits 0b01 => 0b00
    movptr(rax_reg, mark);
    orptr(rax_reg, markWord::unlocked_value);
    andptr(mark, ~(int32_t)markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    jcc(Assembler::notEqual, slow_path);

    if (UseObjectMonitorTable) {
      // Need to reload top, clobbered by CAS.
      movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
    }
    bind(push);
    // After successful lock, push object on lock-stack.
    movptr(Address(thread, top), obj);
    addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
    jmpb(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    const Register monitor = t;

    if (!UseObjectMonitorTable) {
      assert(mark == monitor, "should be the same here");
    } else {
      // Uses ObjectMonitorTable. Look for the monitor in the om_cache.
      // Fetch ObjectMonitor* from the cache or take the slow-path.
      Label monitor_found;

      // Load cache address
      lea(t, Address(thread, JavaThread::om_cache_oops_offset()));

      // The first few cache probes are unrolled; the rest use a loop.
      const int num_unrolled = 2;
      for (int i = 0; i < num_unrolled; i++) {
        cmpptr(obj, Address(t));
        jccb(Assembler::equal, monitor_found);
        increment(t, in_bytes(OMCache::oop_to_oop_difference()));
      }

      Label loop;

      // Search for obj in cache.
      bind(loop);

      // Check for match.
      cmpptr(obj, Address(t));
      jccb(Assembler::equal, monitor_found);

      // Search until null encountered, guaranteed _null_sentinel at end.
      cmpptr(Address(t), 1);
      jcc(Assembler::below, slow_path); // 0 check, but with ZF=0 when *t == 0
      increment(t, in_bytes(OMCache::oop_to_oop_difference()));
      jmpb(loop);

      // Cache hit.
      bind(monitor_found);
      movptr(monitor, Address(t, OMCache::oop_to_monitor_difference()));
    }
    // With the OM table the monitor pointer is untagged; otherwise it still
    // carries the markWord monitor tag, which the offsets below subtract out.
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag);
    const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag);

    Label monitor_locked;
    // Lock the monitor.

    if (UseObjectMonitorTable) {
      // Cache the monitor for unlock before trashing box. On failure to acquire
      // the lock, the slow path will reset the entry accordingly (see CacheSetter).
      movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor);
    }

    // Try to CAS owner (no owner => current thread's _monitor_owner_id).
    xorptr(rax_reg, rax_reg);
    movptr(box, Address(thread, JavaThread::monitor_owner_id_offset()));
    lock(); cmpxchgptr(box, owner_address);
    jccb(Assembler::equal, monitor_locked);

    // Check if recursive: CAS failed, but the existing owner is this thread.
    cmpptr(box, rax_reg);
    jccb(Assembler::notEqual, slow_path);

    // Recursive.
    increment(recursions_address);

    bind(monitor_locked);
  }

  bind(locked);
  // Set ZF = 1
  xorl(rax_reg, rax_reg);

#ifdef ASSERT
  // Check that locked label is reached with ZF set.
  Label zf_correct;
  Label zf_bad_zero;
  jcc(Assembler::zero, zf_correct);
  jmp(zf_bad_zero);
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with ZF not set.
  jcc(Assembler::notZero, zf_correct);
  stop("Fast Lock ZF != 0");
  bind(zf_bad_zero);
  stop("Fast Lock ZF != 1");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}

// obj: object to lock
// rax: tmp -- KILLED
// t  : tmp - cannot be obj nor rax -- KILLED
//
// Some commentary on balanced locking:
//
// fast_lock and fast_unlock are emitted only for provably balanced lock sites.
// Methods that don't have provably balanced locking are forced to run in the
// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
// The interpreter provides two properties:
// I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired in the current activation (frame).  Recall that the
//      interpreter maintains an on-stack list of locks currently held by
//      a frame.
// I2:  If a method attempts to unlock an object that is not held by the
//      frame the interpreter throws IMSX.
//
// Lets say A(), which has provably balanced locking, acquires O and then calls B().
// B() doesn't have provably balanced locking so it runs in the interpreter.
// Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
// is still locked by A().
//
// The only other source of unbalanced locking would be JNI.  The "Java Native Interface
// Specification" states that an object locked by JNI's MonitorEnter should not be
// unlocked by "normal" java-level locking and vice-versa.  The specification doesn't
// specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably given that the spec legislates the JNI case as undefined our implementation
// could reasonably *avoid* checking owner in fast_unlock().
// In the interest of performance we elide m->Owner==Self check in unlock.
// A perfectly viable alternative is to elide the owner check except when
// Xcheck:jni is enabled.

// Emit the inline monitorexit fast path. On exit, ZF == 1 means the lock
// was released; ZF == 0 means the caller must take the slow path.
void C2_MacroAssembler::fast_unlock(Register obj, Register reg_rax, Register t, Register thread) {
  assert(reg_rax == rax, "Used for CAS");
  assert_different_registers(obj, reg_rax, t);

  // Handle inflated monitor.
  Label inflated, inflated_check_lock_stack;
  // Finish fast unlock successfully. MUST jump with ZF == 1
  Label unlocked, slow_path;

  const Register mark = t;
  const Register monitor = t;
  const Register top = UseObjectMonitorTable ? t : reg_rax;
  const Register box = reg_rax;

  Label dummy;
  C2FastUnlockStub* stub = nullptr;

  if (!Compile::current()->output()->in_scratch_emit_size()) {
    // Only create the out-of-line stub when emitting real code, not when
    // measuring code size.
    stub = new (Compile::current()->comp_arena()) C2FastUnlockStub(obj, mark, reg_rax, thread);
    Compile::current()->output()->add_stub(stub);
  }

  Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();

  { // Fast Unlock

    // Load top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    if (!UseObjectMonitorTable) {
      // Prefetch mark.
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }

    // Check if obj is top of lock-stack.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    // Top of lock stack was not obj. Must be monitor.
    jcc(Assembler::notEqual, inflated_check_lock_stack);

    // Pop lock-stack.
    DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
    subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);

    // Check if recursive: obj occurs again one slot below.
    cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
    jcc(Assembler::equal, unlocked);

    // We elide the monitor check, let the CAS fail instead.

    if (UseObjectMonitorTable) {
      // Load mark.
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }

    // Try to unlock. Transition lock bits 0b00 => 0b01
    movptr(reg_rax, mark);
    andptr(reg_rax, ~(int32_t)markWord::lock_mask);
    orptr(mark, markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    jcc(Assembler::notEqual, push_and_slow_path);
    jmp(unlocked);
  }


  { // Handle inflated monitor.
    bind(inflated_check_lock_stack);
#ifdef ASSERT
    // Debug-only: verify obj does not appear anywhere on the lock-stack
    // and that the mark really designates a monitor.
    Label check_done;
    subl(top, oopSize);
    cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
    jcc(Assembler::below, check_done);
    cmpptr(obj, Address(thread, top));
    jccb(Assembler::notEqual, inflated_check_lock_stack);
    stop("Fast Unlock lock on stack");
    bind(check_done);
    if (UseObjectMonitorTable) {
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }
    testptr(mark, markWord::monitor_value);
    jccb(Assembler::notZero, inflated);
    stop("Fast Unlock not monitor");
#endif

    bind(inflated);

    if (!UseObjectMonitorTable) {
      assert(mark == monitor, "should be the same here");
    } else {
      // Uses ObjectMonitorTable. Look for the monitor in our BasicLock on the stack.
      movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
      // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*)
      cmpptr(monitor, alignof(ObjectMonitor*));
      jcc(Assembler::below, slow_path);
    }
    // With the OM table the monitor pointer is untagged; otherwise it still
    // carries the markWord monitor tag, which the offsets below subtract out.
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag};
    const Address succ_address{monitor, ObjectMonitor::succ_offset() - monitor_tag};
    const Address entry_list_address{monitor, ObjectMonitor::entry_list_offset() - monitor_tag};
    const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag};

    Label recursive;

    // Check if recursive.
    cmpptr(recursions_address, 0);
    jccb(Assembler::notZero, recursive);

    // Set owner to null.
    // Release to satisfy the JMM
    movptr(owner_address, NULL_WORD);
    // We need a full fence after clearing owner to avoid stranding.
    // StoreLoad achieves this.
    membar(StoreLoad);

    // Check if the entry_list is empty.
    cmpptr(entry_list_address, NULL_WORD);
    jccb(Assembler::zero, unlocked);    // If so we are done.

    // Check if there is a successor.
    cmpptr(succ_address, NULL_WORD);
    jccb(Assembler::notZero, unlocked); // If so we are done.

    // Save the monitor pointer in the current thread, so we can try to
    // reacquire the lock in SharedRuntime::monitor_exit_helper().
    if (!UseObjectMonitorTable) {
      andptr(monitor, ~(int32_t)markWord::monitor_value);
    }
    movptr(Address(thread, JavaThread::unlocked_inflated_monitor_offset()), monitor);

    orl(t, 1); // Fast Unlock ZF = 0
    jmpb(slow_path);

    // Recursive unlock.
    bind(recursive);
    decrement(recursions_address);
  }

  bind(unlocked);
  xorl(t, t); // Fast Unlock ZF = 1

#ifdef ASSERT
  // Check that unlocked label is reached with ZF set.
  Label zf_correct;
  Label zf_bad_zero;
  jcc(Assembler::zero, zf_correct);
  jmp(zf_bad_zero);
#endif

  bind(slow_path);
  if (stub != nullptr) {
    bind(stub->slow_path_continuation());
  }
#ifdef ASSERT
  // Check that stub->continuation() label is reached with ZF not set.
  jcc(Assembler::notZero, zf_correct);
  stop("Fast Unlock ZF != 0");
  bind(zf_bad_zero);
  stop("Fast Unlock ZF != 1");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}

// Runtime target for the CastII verification trap below; never returns.
static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) {
  fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi);
}

// Compute the caller-visible frame pointer from rsp and the compiled frame
// size (frame size minus the saved rbp and return-address words).
static void reconstruct_frame_pointer_helper(MacroAssembler* masm, Register dst) {
  const int framesize = Compile::current()->output()->frame_size_in_bytes();
  masm->movptr(dst, rsp);
  if (framesize > 2 * wordSize) {
    masm->addptr(dst, framesize - 2 * wordSize);
  }
}

// Ensure rbp holds a valid frame pointer before calling into the runtime.
// With PreserveFramePointer rbp is already valid (verified in debug builds);
// otherwise it is recomputed from rsp.
void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) {
  if (PreserveFramePointer) {
    // frame pointer is valid
#ifdef ASSERT
    // Verify frame pointer value in rbp.
    reconstruct_frame_pointer_helper(this, rtmp);
    Label L_success;
    cmpq(rbp, rtmp);
    jccb(Assembler::equal, L_success);
    STOP("frame pointer mismatch");
    bind(L_success);
#endif // ASSERT
  } else {
    reconstruct_frame_pointer_helper(this, rbp);
  }
}

// Emit a debug check that val lies within the CastII type range [lo, hi];
// calls abort_verify_int_in_range (fatal) on violation.
void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register val) {
  jint lo = t->_lo;
  jint hi = t->_hi;
  assert(lo < hi, "type should not be empty or constant, idx: %u, lo: %d, hi: %d", idx, lo, hi);
  if (t == TypeInt::INT) {
    // Full int range: nothing to check.
    return;
  }

  BLOCK_COMMENT("CastII {");
  Label fail;
  Label succeed;

  if (lo != min_jint) {
    cmpl(val, lo);
    jccb(Assembler::less, fail);
  }
  if (hi != max_jint) {
    cmpl(val, hi);
    jccb(Assembler::greater, fail);
  }
  jmpb(succeed);

  bind(fail);
  movl(c_rarg0, idx);
  movl(c_rarg1, val);
  movl(c_rarg2, lo);
  movl(c_rarg3, hi);
  reconstruct_frame_pointer(rscratch1);
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_int_in_range)));
  hlt();
  bind(succeed);
  BLOCK_COMMENT("} // CastII");
}

// Runtime target for the CastLL verification trap below; never returns.
static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) {
  fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi);
}

// Emit a debug check that val lies within the CastLL type range [lo, hi];
// calls abort_verify_long_in_range (fatal) on violation. tmp is used to
// materialize 64-bit bounds that do not fit in a sign-extended imm32.
void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register val, Register tmp) {
  jlong lo = t->_lo;
  jlong hi = t->_hi;
  assert(lo < hi, "type should not be empty or constant, idx: %u, lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, lo, hi);
  if (t == TypeLong::LONG) {
    // Full long range: nothing to check.
    return;
  }

  BLOCK_COMMENT("CastLL {");
  Label fail;
  Label succeed;

  // Compare val against a 64-bit bound, using an immediate when it fits.
  auto cmp_val = [&](jlong bound) {
    if (is_simm32(bound)) {
      cmpq(val, checked_cast<int>(bound));
    } else {
      mov64(tmp, bound);
      cmpq(val, tmp);
    }
  };

  if (lo != min_jlong) {
    cmp_val(lo);
    jccb(Assembler::less, fail);
  }
  if (hi != max_jlong) {
    cmp_val(hi);
    jccb(Assembler::greater, fail);
  }
  jmpb(succeed);

  bind(fail);
  movl(c_rarg0, idx);
  movq(c_rarg1, val);
  mov64(c_rarg2, lo);
  mov64(c_rarg3, hi);
  reconstruct_frame_pointer(rscratch1);
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_long_in_range)));
  hlt();
  bind(succeed);
  BLOCK_COMMENT("} // CastLL");
}

//-------------------------------------------------------------------------------------------
// Generic instructions support for use in .ad files C2 code generation

// Double abs/negate via sign-bit mask (abs: clear sign bits) or sign-bit
// flip (negate: toggle sign bits).
void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVD) {
    andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegD");
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
  }
}

// AVX variant of vabsnegd for a given vector length encoding.
void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
  if (opcode == Op_AbsVD) {
    vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegD");
    vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
  }
}

// Float abs/negate via sign-bit mask or sign-bit flip (see vabsnegd).
void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVF) {
    andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegF");
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
  }
}

// AVX variant of vabsnegf for a given vector length encoding.
void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
  if (opcode == Op_AbsVF) {
    vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegF");
    vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
  }
}

// Signed element-wise min/max (SSE). For T_LONG there is no pmin/pmax
// instruction, so a compare + blend sequence is used; blendvpd implicitly
// reads its mask from xmm0, hence the tmp == xmm0 requirement.
void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
  assert(tmp == xnoreg || elem_bt == T_LONG, "unused");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      pminsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pminsw(dst, src);
    } else if (elem_bt == T_INT) {
      pminsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, dst);
      pcmpgtq(xmm0, src);
      blendvpd(dst, src);  // xmm0 as mask
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      pmaxsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pmaxsw(dst, src);
    } else if (elem_bt == T_INT) {
      pmaxsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, src);
      pcmpgtq(xmm0, dst);
      blendvpd(dst, src);  // xmm0 as mask
    }
  }
}

// Unsigned element-wise min/max with a memory operand (AVX/AVX-512;
// T_LONG requires the EVEX-encoded instructions).
void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
                                  XMMRegister src1, Address src2, int vlen_enc) {
  assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
  if (opcode == Op_UMinV) {
    switch(elem_bt) {
      case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
      case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
      case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
      case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
      default: fatal("Unsupported type %s", type2name(elem_bt)); break;
    }
  } else {
    assert(opcode == Op_UMaxV, "required");
    switch(elem_bt) {
      case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
      case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
      case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
      case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
      default: fatal("Unsupported type %s", type2name(elem_bt)); break;
    }
  }
}

// Unsigned 64-bit min/max. Without AVX512VL, falls back to biasing both
// operands by 2^63 so a signed compare gives the unsigned ordering.
void C2_MacroAssembler::vpuminmaxq(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  // For optimality, leverage a full vector width of 512 bits
  // for operations over smaller vector sizes on AVX512 targets.
  if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
    if (opcode == Op_UMaxV) {
      evpmaxuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
    } else {
      assert(opcode == Op_UMinV, "required");
      evpminuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
    }
  } else {
    // T1 = -1
    vpcmpeqq(xtmp1, xtmp1, xtmp1, vlen_enc);
    // T1 = -1 << 63
    vpsllq(xtmp1, xtmp1, 63, vlen_enc);
    // Convert SRC2 to signed value i.e. T2 = T1 + SRC2
    vpaddq(xtmp2, xtmp1, src2, vlen_enc);
    // Convert SRC1 to signed value i.e. T1 = T1 + SRC1
    vpaddq(xtmp1, xtmp1, src1, vlen_enc);
    // Mask = T2 > T1
    vpcmpgtq(xtmp1, xtmp2, xtmp1, vlen_enc);
    if (opcode == Op_UMaxV) {
      // Res = Mask ? Src2 : Src1
      vpblendvb(dst, src1, src2, xtmp1, vlen_enc);
    } else {
      // Res = Mask ? Src1 : Src2
      vpblendvb(dst, src2, src1, xtmp1, vlen_enc);
    }
  }
}

// Unsigned element-wise min/max, register-register variant (see the
// Address-operand overload above).
void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
                                  XMMRegister src1, XMMRegister src2, int vlen_enc) {
  assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
  if (opcode == Op_UMinV) {
    switch(elem_bt) {
      case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
      case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
      case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
      case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
      default: fatal("Unsupported type %s", type2name(elem_bt)); break;
    }
  } else {
    assert(opcode == Op_UMaxV, "required");
    switch(elem_bt) {
      case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
      case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
      case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
      case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
      default: fatal("Unsupported type %s", type2name(elem_bt)); break;
    }
  }
}

// Signed element-wise min/max (AVX). For T_LONG, uses vpmin/maxsq when
// AVX-512 is available at this vector length, else compare + blend.
void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
                                 XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                 int vlen_enc) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      vpminsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpminsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpminsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpminsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src1, src2, dst, vlen_enc);
      }
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      vpmaxsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpmaxsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpmaxsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpmaxsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src2, src1, dst, vlen_enc);
      }
    }
  }
}

// Float/Double min max

void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
                                   XMMRegister dst, XMMRegister a, XMMRegister b,
                                   XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                   int vlen_enc) {
  assert(UseAVX > 0, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(a, tmp, atmp, btmp);
  assert_different_registers(b, tmp, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);

  /* Note on 'non-obvious' assembly sequence:
   *
   * While there are vminps/vmaxps instructions, there are two important differences between hardware
   * and Java on how they handle floats:
   *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
   *  b. NaN is not necesarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
   *
   * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
   *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
   *     (only useful when signs differ, noop otherwise)
   *  b.
NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q]) 918 919 * Following pseudo code describes the algorithm for max[FD] (Min algorithm is on similar lines): 920 * btmp = (b < +0.0) ? a : b 921 * atmp = (b < +0.0) ? b : a 922 * Tmp = Max_Float(atmp , btmp) 923 * Res = (atmp == NaN) ? atmp : Tmp 924 */ 925 926 void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister); 927 void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int); 928 void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int); 929 XMMRegister mask; 930 931 if (!is_double_word && is_min) { 932 mask = a; 933 vblend = &MacroAssembler::vblendvps; 934 vmaxmin = &MacroAssembler::vminps; 935 vcmp = &MacroAssembler::vcmpps; 936 } else if (!is_double_word && !is_min) { 937 mask = b; 938 vblend = &MacroAssembler::vblendvps; 939 vmaxmin = &MacroAssembler::vmaxps; 940 vcmp = &MacroAssembler::vcmpps; 941 } else if (is_double_word && is_min) { 942 mask = a; 943 vblend = &MacroAssembler::vblendvpd; 944 vmaxmin = &MacroAssembler::vminpd; 945 vcmp = &MacroAssembler::vcmppd; 946 } else { 947 assert(is_double_word && !is_min, "sanity"); 948 mask = b; 949 vblend = &MacroAssembler::vblendvpd; 950 vmaxmin = &MacroAssembler::vmaxpd; 951 vcmp = &MacroAssembler::vcmppd; 952 } 953 954 // Make sure EnableX86ECoreOpts isn't disabled on register overlaps 955 XMMRegister maxmin, scratch; 956 if (dst == btmp) { 957 maxmin = btmp; 958 scratch = tmp; 959 } else { 960 maxmin = tmp; 961 scratch = btmp; 962 } 963 964 bool precompute_mask = EnableX86ECoreOpts && UseAVX>1; 965 if (precompute_mask && !is_double_word) { 966 vpsrad(tmp, mask, 32, vlen_enc); 967 mask = tmp; 968 } else if (precompute_mask && is_double_word) { 969 vpxor(tmp, tmp, tmp, vlen_enc); 970 vpcmpgtq(tmp, tmp, mask, vlen_enc); 971 mask = tmp; 972 } 973 974 (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp); 975 (this->*vblend)(btmp, b, a, 
mask, vlen_enc, !precompute_mask, tmp); 976 (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc); 977 (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 978 (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch); 979 } 980 981 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt, 982 XMMRegister dst, XMMRegister a, XMMRegister b, 983 KRegister ktmp, XMMRegister atmp, XMMRegister btmp, 984 int vlen_enc) { 985 assert(UseAVX > 2, "required"); 986 assert(opcode == Op_MinV || opcode == Op_MinReductionV || 987 opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity"); 988 assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity"); 989 assert_different_registers(dst, a, atmp, btmp); 990 assert_different_registers(dst, b, atmp, btmp); 991 992 bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV); 993 bool is_double_word = is_double_word_type(elem_bt); 994 bool merge = true; 995 996 if (!is_double_word && is_min) { 997 evpmovd2m(ktmp, a, vlen_enc); 998 evblendmps(atmp, ktmp, a, b, merge, vlen_enc); 999 evblendmps(btmp, ktmp, b, a, merge, vlen_enc); 1000 vminps(dst, atmp, btmp, vlen_enc); 1001 evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1002 evmovdqul(dst, ktmp, atmp, merge, vlen_enc); 1003 } else if (!is_double_word && !is_min) { 1004 evpmovd2m(ktmp, b, vlen_enc); 1005 evblendmps(atmp, ktmp, a, b, merge, vlen_enc); 1006 evblendmps(btmp, ktmp, b, a, merge, vlen_enc); 1007 vmaxps(dst, atmp, btmp, vlen_enc); 1008 evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1009 evmovdqul(dst, ktmp, atmp, merge, vlen_enc); 1010 } else if (is_double_word && is_min) { 1011 evpmovq2m(ktmp, a, vlen_enc); 1012 evblendmpd(atmp, ktmp, a, b, merge, vlen_enc); 1013 evblendmpd(btmp, ktmp, b, a, merge, vlen_enc); 1014 vminpd(dst, atmp, btmp, vlen_enc); 1015 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1016 evmovdquq(dst, ktmp, atmp, merge, vlen_enc); 1017 } else { 1018 assert(is_double_word && !is_min, 
"sanity"); 1019 evpmovq2m(ktmp, b, vlen_enc); 1020 evblendmpd(atmp, ktmp, a, b, merge, vlen_enc); 1021 evblendmpd(btmp, ktmp, b, a, merge, vlen_enc); 1022 vmaxpd(dst, atmp, btmp, vlen_enc); 1023 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1024 evmovdquq(dst, ktmp, atmp, merge, vlen_enc); 1025 } 1026 } 1027 1028 void C2_MacroAssembler::vminmax_fp(int opc, BasicType elem_bt, XMMRegister dst, KRegister mask, 1029 XMMRegister src1, XMMRegister src2, int vlen_enc) { 1030 assert(opc == Op_MinV || opc == Op_MinReductionV || 1031 opc == Op_MaxV || opc == Op_MaxReductionV, "sanity"); 1032 1033 int imm8 = (opc == Op_MinV || opc == Op_MinReductionV) ? AVX10_2_MINMAX_MIN_COMPARE_SIGN 1034 : AVX10_2_MINMAX_MAX_COMPARE_SIGN; 1035 if (elem_bt == T_FLOAT) { 1036 evminmaxps(dst, mask, src1, src2, true, imm8, vlen_enc); 1037 } else { 1038 assert(elem_bt == T_DOUBLE, ""); 1039 evminmaxpd(dst, mask, src1, src2, true, imm8, vlen_enc); 1040 } 1041 } 1042 1043 // Float/Double signum 1044 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) { 1045 assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity"); 1046 1047 Label DONE_LABEL; 1048 1049 // Handle special cases +0.0/-0.0 and NaN, if argument is +0.0/-0.0 or NaN, return argument 1050 // If AVX10.2 (or newer) floating point comparison instructions used, SF=1 for equal and unordered cases 1051 // If other floating point comparison instructions used, ZF=1 for equal and unordered cases 1052 if (opcode == Op_SignumF) { 1053 if (VM_Version::supports_avx10_2()) { 1054 vucomxss(dst, zero); 1055 jcc(Assembler::negative, DONE_LABEL); 1056 } else { 1057 ucomiss(dst, zero); 1058 jcc(Assembler::equal, DONE_LABEL); 1059 } 1060 movflt(dst, one); 1061 jcc(Assembler::above, DONE_LABEL); 1062 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg); 1063 } else if (opcode == Op_SignumD) { 1064 if (VM_Version::supports_avx10_2()) { 1065 vucomxsd(dst, zero); 1066 
jcc(Assembler::negative, DONE_LABEL); 1067 } else { 1068 ucomisd(dst, zero); 1069 jcc(Assembler::equal, DONE_LABEL); 1070 } 1071 movdbl(dst, one); 1072 jcc(Assembler::above, DONE_LABEL); 1073 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg); 1074 } 1075 1076 bind(DONE_LABEL); 1077 } 1078 1079 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) { 1080 if (sign) { 1081 pmovsxbw(dst, src); 1082 } else { 1083 pmovzxbw(dst, src); 1084 } 1085 } 1086 1087 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1088 if (sign) { 1089 vpmovsxbw(dst, src, vector_len); 1090 } else { 1091 vpmovzxbw(dst, src, vector_len); 1092 } 1093 } 1094 1095 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1096 if (sign) { 1097 vpmovsxbd(dst, src, vector_len); 1098 } else { 1099 vpmovzxbd(dst, src, vector_len); 1100 } 1101 } 1102 1103 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1104 if (sign) { 1105 vpmovsxwd(dst, src, vector_len); 1106 } else { 1107 vpmovzxwd(dst, src, vector_len); 1108 } 1109 } 1110 1111 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src, 1112 int shift, int vector_len) { 1113 if (opcode == Op_RotateLeftV) { 1114 if (etype == T_INT) { 1115 evprold(dst, src, shift, vector_len); 1116 } else { 1117 assert(etype == T_LONG, "expected type T_LONG"); 1118 evprolq(dst, src, shift, vector_len); 1119 } 1120 } else { 1121 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV"); 1122 if (etype == T_INT) { 1123 evprord(dst, src, shift, vector_len); 1124 } else { 1125 assert(etype == T_LONG, "expected type T_LONG"); 1126 evprorq(dst, src, shift, vector_len); 1127 } 1128 } 1129 } 1130 1131 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src, 1132 XMMRegister shift, int vector_len) { 
1133 if (opcode == Op_RotateLeftV) { 1134 if (etype == T_INT) { 1135 evprolvd(dst, src, shift, vector_len); 1136 } else { 1137 assert(etype == T_LONG, "expected type T_LONG"); 1138 evprolvq(dst, src, shift, vector_len); 1139 } 1140 } else { 1141 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV"); 1142 if (etype == T_INT) { 1143 evprorvd(dst, src, shift, vector_len); 1144 } else { 1145 assert(etype == T_LONG, "expected type T_LONG"); 1146 evprorvq(dst, src, shift, vector_len); 1147 } 1148 } 1149 } 1150 1151 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) { 1152 if (opcode == Op_RShiftVI) { 1153 psrad(dst, shift); 1154 } else if (opcode == Op_LShiftVI) { 1155 pslld(dst, shift); 1156 } else { 1157 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI"); 1158 psrld(dst, shift); 1159 } 1160 } 1161 1162 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) { 1163 switch (opcode) { 1164 case Op_RShiftVI: psrad(dst, shift); break; 1165 case Op_LShiftVI: pslld(dst, shift); break; 1166 case Op_URShiftVI: psrld(dst, shift); break; 1167 1168 default: assert(false, "%s", NodeClassNames[opcode]); 1169 } 1170 } 1171 1172 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 1173 if (opcode == Op_RShiftVI) { 1174 vpsrad(dst, nds, shift, vector_len); 1175 } else if (opcode == Op_LShiftVI) { 1176 vpslld(dst, nds, shift, vector_len); 1177 } else { 1178 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI"); 1179 vpsrld(dst, nds, shift, vector_len); 1180 } 1181 } 1182 1183 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1184 switch (opcode) { 1185 case Op_RShiftVI: vpsrad(dst, src, shift, vlen_enc); break; 1186 case Op_LShiftVI: vpslld(dst, src, shift, vlen_enc); break; 1187 case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break; 1188 1189 default: assert(false, "%s", 
NodeClassNames[opcode]); 1190 } 1191 } 1192 1193 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) { 1194 switch (opcode) { 1195 case Op_RShiftVB: // fall-through 1196 case Op_RShiftVS: psraw(dst, shift); break; 1197 1198 case Op_LShiftVB: // fall-through 1199 case Op_LShiftVS: psllw(dst, shift); break; 1200 1201 case Op_URShiftVS: // fall-through 1202 case Op_URShiftVB: psrlw(dst, shift); break; 1203 1204 default: assert(false, "%s", NodeClassNames[opcode]); 1205 } 1206 } 1207 1208 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1209 switch (opcode) { 1210 case Op_RShiftVB: // fall-through 1211 case Op_RShiftVS: vpsraw(dst, src, shift, vlen_enc); break; 1212 1213 case Op_LShiftVB: // fall-through 1214 case Op_LShiftVS: vpsllw(dst, src, shift, vlen_enc); break; 1215 1216 case Op_URShiftVS: // fall-through 1217 case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break; 1218 1219 default: assert(false, "%s", NodeClassNames[opcode]); 1220 } 1221 } 1222 1223 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) { 1224 switch (opcode) { 1225 case Op_RShiftVL: psrlq(dst, shift); break; // using srl to implement sra on pre-avs512 systems 1226 case Op_LShiftVL: psllq(dst, shift); break; 1227 case Op_URShiftVL: psrlq(dst, shift); break; 1228 1229 default: assert(false, "%s", NodeClassNames[opcode]); 1230 } 1231 } 1232 1233 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) { 1234 if (opcode == Op_RShiftVL) { 1235 psrlq(dst, shift); // using srl to implement sra on pre-avs512 systems 1236 } else if (opcode == Op_LShiftVL) { 1237 psllq(dst, shift); 1238 } else { 1239 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL"); 1240 psrlq(dst, shift); 1241 } 1242 } 1243 1244 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1245 switch (opcode) { 1246 case 
Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break; 1247 case Op_LShiftVL: vpsllq(dst, src, shift, vlen_enc); break; 1248 case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break; 1249 1250 default: assert(false, "%s", NodeClassNames[opcode]); 1251 } 1252 } 1253 1254 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 1255 if (opcode == Op_RShiftVL) { 1256 evpsraq(dst, nds, shift, vector_len); 1257 } else if (opcode == Op_LShiftVL) { 1258 vpsllq(dst, nds, shift, vector_len); 1259 } else { 1260 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL"); 1261 vpsrlq(dst, nds, shift, vector_len); 1262 } 1263 } 1264 1265 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1266 switch (opcode) { 1267 case Op_RShiftVB: // fall-through 1268 case Op_RShiftVS: // fall-through 1269 case Op_RShiftVI: vpsravd(dst, src, shift, vlen_enc); break; 1270 1271 case Op_LShiftVB: // fall-through 1272 case Op_LShiftVS: // fall-through 1273 case Op_LShiftVI: vpsllvd(dst, src, shift, vlen_enc); break; 1274 1275 case Op_URShiftVB: // fall-through 1276 case Op_URShiftVS: // fall-through 1277 case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break; 1278 1279 default: assert(false, "%s", NodeClassNames[opcode]); 1280 } 1281 } 1282 1283 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1284 switch (opcode) { 1285 case Op_RShiftVB: // fall-through 1286 case Op_RShiftVS: evpsravw(dst, src, shift, vlen_enc); break; 1287 1288 case Op_LShiftVB: // fall-through 1289 case Op_LShiftVS: evpsllvw(dst, src, shift, vlen_enc); break; 1290 1291 case Op_URShiftVB: // fall-through 1292 case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break; 1293 1294 default: assert(false, "%s", NodeClassNames[opcode]); 1295 } 1296 } 1297 1298 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister 
src, XMMRegister shift, int vlen_enc, XMMRegister tmp) { 1299 assert(UseAVX >= 2, "required"); 1300 switch (opcode) { 1301 case Op_RShiftVL: { 1302 if (UseAVX > 2) { 1303 assert(tmp == xnoreg, "not used"); 1304 if (!VM_Version::supports_avx512vl()) { 1305 vlen_enc = Assembler::AVX_512bit; 1306 } 1307 evpsravq(dst, src, shift, vlen_enc); 1308 } else { 1309 vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask())); 1310 vpsrlvq(dst, src, shift, vlen_enc); 1311 vpsrlvq(tmp, tmp, shift, vlen_enc); 1312 vpxor(dst, dst, tmp, vlen_enc); 1313 vpsubq(dst, dst, tmp, vlen_enc); 1314 } 1315 break; 1316 } 1317 case Op_LShiftVL: { 1318 assert(tmp == xnoreg, "not used"); 1319 vpsllvq(dst, src, shift, vlen_enc); 1320 break; 1321 } 1322 case Op_URShiftVL: { 1323 assert(tmp == xnoreg, "not used"); 1324 vpsrlvq(dst, src, shift, vlen_enc); 1325 break; 1326 } 1327 default: assert(false, "%s", NodeClassNames[opcode]); 1328 } 1329 } 1330 1331 // Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst 1332 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) { 1333 assert(opcode == Op_LShiftVB || 1334 opcode == Op_RShiftVB || 1335 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]); 1336 bool sign = (opcode != Op_URShiftVB); 1337 assert(vector_len == 0, "required"); 1338 vextendbd(sign, dst, src, 1); 1339 vpmovzxbd(vtmp, shift, 1); 1340 varshiftd(opcode, dst, dst, vtmp, 1); 1341 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg); 1342 vextracti128_high(vtmp, dst); 1343 vpackusdw(dst, dst, vtmp, 0); 1344 } 1345 1346 // Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst 1347 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) { 1348 assert(opcode == Op_LShiftVB || 1349 opcode == Op_RShiftVB || 1350 opcode == 
Op_URShiftVB, "%s", NodeClassNames[opcode]); 1351 bool sign = (opcode != Op_URShiftVB); 1352 int ext_vector_len = vector_len + 1; 1353 vextendbw(sign, dst, src, ext_vector_len); 1354 vpmovzxbw(vtmp, shift, ext_vector_len); 1355 varshiftw(opcode, dst, dst, vtmp, ext_vector_len); 1356 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg); 1357 if (vector_len == 0) { 1358 vextracti128_high(vtmp, dst); 1359 vpackuswb(dst, dst, vtmp, vector_len); 1360 } else { 1361 vextracti64x4_high(vtmp, dst); 1362 vpackuswb(dst, dst, vtmp, vector_len); 1363 vpermq(dst, dst, 0xD8, vector_len); 1364 } 1365 } 1366 1367 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) { 1368 switch(typ) { 1369 case T_BYTE: 1370 pinsrb(dst, val, idx); 1371 break; 1372 case T_SHORT: 1373 pinsrw(dst, val, idx); 1374 break; 1375 case T_INT: 1376 pinsrd(dst, val, idx); 1377 break; 1378 case T_LONG: 1379 pinsrq(dst, val, idx); 1380 break; 1381 default: 1382 assert(false,"Should not reach here."); 1383 break; 1384 } 1385 } 1386 1387 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) { 1388 switch(typ) { 1389 case T_BYTE: 1390 vpinsrb(dst, src, val, idx); 1391 break; 1392 case T_SHORT: 1393 vpinsrw(dst, src, val, idx); 1394 break; 1395 case T_INT: 1396 vpinsrd(dst, src, val, idx); 1397 break; 1398 case T_LONG: 1399 vpinsrq(dst, src, val, idx); 1400 break; 1401 default: 1402 assert(false,"Should not reach here."); 1403 break; 1404 } 1405 } 1406 1407 void C2_MacroAssembler::vgather8b_masked(BasicType elem_bt, XMMRegister dst, 1408 Register base, Register idx_base, 1409 Register mask, Register mask_idx, 1410 Register rtmp, int vlen_enc) { 1411 vpxor(dst, dst, dst, vlen_enc); 1412 if (elem_bt == T_SHORT) { 1413 for (int i = 0; i < 4; i++) { 1414 // dst[i] = mask[i] ? 
src[idx_base[i]] : 0 1415 Label skip_load; 1416 btq(mask, mask_idx); 1417 jccb(Assembler::carryClear, skip_load); 1418 movl(rtmp, Address(idx_base, i * 4)); 1419 pinsrw(dst, Address(base, rtmp, Address::times_2), i); 1420 bind(skip_load); 1421 incq(mask_idx); 1422 } 1423 } else { 1424 assert(elem_bt == T_BYTE, ""); 1425 for (int i = 0; i < 8; i++) { 1426 // dst[i] = mask[i] ? src[idx_base[i]] : 0 1427 Label skip_load; 1428 btq(mask, mask_idx); 1429 jccb(Assembler::carryClear, skip_load); 1430 movl(rtmp, Address(idx_base, i * 4)); 1431 pinsrb(dst, Address(base, rtmp), i); 1432 bind(skip_load); 1433 incq(mask_idx); 1434 } 1435 } 1436 } 1437 1438 void C2_MacroAssembler::vgather8b(BasicType elem_bt, XMMRegister dst, 1439 Register base, Register idx_base, 1440 Register rtmp, int vlen_enc) { 1441 vpxor(dst, dst, dst, vlen_enc); 1442 if (elem_bt == T_SHORT) { 1443 for (int i = 0; i < 4; i++) { 1444 // dst[i] = src[idx_base[i]] 1445 movl(rtmp, Address(idx_base, i * 4)); 1446 pinsrw(dst, Address(base, rtmp, Address::times_2), i); 1447 } 1448 } else { 1449 assert(elem_bt == T_BYTE, ""); 1450 for (int i = 0; i < 8; i++) { 1451 // dst[i] = src[idx_base[i]] 1452 movl(rtmp, Address(idx_base, i * 4)); 1453 pinsrb(dst, Address(base, rtmp), i); 1454 } 1455 } 1456 } 1457 1458 /* 1459 * Gather using hybrid algorithm, first partially unroll scalar loop 1460 * to accumulate values from gather indices into a quad-word(64bit) slice. 1461 * A slice may hold 8 bytes or 4 short values. This is followed by a vector 1462 * permutation to place the slice into appropriate vector lane 1463 * locations in destination vector. 
Following pseudo code describes the 1464 * algorithm in detail: 1465 * 1466 * DST_VEC = ZERO_VEC 1467 * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..} 1468 * TWO_VEC = {2, 2, 2, 2, 2, 2, 2, 2, 2..} 1469 * FOREACH_ITER: 1470 * TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES 1471 * TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX 1472 * DST_VEC = DST_VEC OR TEMP_PERM_VEC 1473 * PERM_INDEX = PERM_INDEX - TWO_VEC 1474 * 1475 * With each iteration, doubleword permute indices (0,1) corresponding 1476 * to gathered quadword gets right shifted by two lane positions. 1477 * 1478 */ 1479 void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst, 1480 Register base, Register idx_base, 1481 Register mask, XMMRegister xtmp1, 1482 XMMRegister xtmp2, XMMRegister temp_dst, 1483 Register rtmp, Register mask_idx, 1484 Register length, int vector_len, int vlen_enc) { 1485 Label GATHER8_LOOP; 1486 assert(is_subword_type(elem_ty), ""); 1487 movl(length, vector_len); 1488 vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...} 1489 vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...} 1490 vallones(xtmp2, vlen_enc); 1491 vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc); 1492 vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...} 1493 load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...} 1494 1495 bind(GATHER8_LOOP); 1496 // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES 1497 if (mask == noreg) { 1498 vgather8b(elem_ty, temp_dst, base, idx_base, rtmp, vlen_enc); 1499 } else { 1500 vgather8b_masked(elem_ty, temp_dst, base, idx_base, mask, mask_idx, rtmp, vlen_enc); 1501 } 1502 // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1) 1503 vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? 
vlen_enc : Assembler::AVX_256bit); 1504 // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2) 1505 vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc); 1506 // DST_VEC = DST_VEC OR TEMP_PERM_VEC 1507 vpor(dst, dst, temp_dst, vlen_enc); 1508 addptr(idx_base, 32 >> (type2aelembytes(elem_ty) - 1)); 1509 subl(length, 8 >> (type2aelembytes(elem_ty) - 1)); 1510 jcc(Assembler::notEqual, GATHER8_LOOP); 1511 } 1512 1513 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) { 1514 switch(typ) { 1515 case T_INT: 1516 vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len); 1517 break; 1518 case T_FLOAT: 1519 vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len); 1520 break; 1521 case T_LONG: 1522 vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len); 1523 break; 1524 case T_DOUBLE: 1525 vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len); 1526 break; 1527 default: 1528 assert(false,"Should not reach here."); 1529 break; 1530 } 1531 } 1532 1533 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) { 1534 switch(typ) { 1535 case T_INT: 1536 evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len); 1537 break; 1538 case T_FLOAT: 1539 evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len); 1540 break; 1541 case T_LONG: 1542 evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len); 1543 break; 1544 case T_DOUBLE: 1545 evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len); 1546 break; 1547 default: 1548 assert(false,"Should not reach here."); 1549 break; 1550 } 1551 } 1552 1553 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) { 1554 switch(typ) { 1555 case T_INT: 1556 evpscatterdd(Address(base, idx, Address::times_4), mask, 
src, vector_len); 1557 break; 1558 case T_FLOAT: 1559 evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len); 1560 break; 1561 case T_LONG: 1562 evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len); 1563 break; 1564 case T_DOUBLE: 1565 evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len); 1566 break; 1567 default: 1568 assert(false,"Should not reach here."); 1569 break; 1570 } 1571 } 1572 1573 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) { 1574 if (vlen_in_bytes <= 16) { 1575 pxor (dst, dst); 1576 psubb(dst, src); 1577 switch (elem_bt) { 1578 case T_BYTE: /* nothing to do */ break; 1579 case T_SHORT: pmovsxbw(dst, dst); break; 1580 case T_INT: pmovsxbd(dst, dst); break; 1581 case T_FLOAT: pmovsxbd(dst, dst); break; 1582 case T_LONG: pmovsxbq(dst, dst); break; 1583 case T_DOUBLE: pmovsxbq(dst, dst); break; 1584 1585 default: assert(false, "%s", type2name(elem_bt)); 1586 } 1587 } else { 1588 assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, ""); 1589 int vlen_enc = vector_length_encoding(vlen_in_bytes); 1590 1591 vpxor (dst, dst, dst, vlen_enc); 1592 vpsubb(dst, dst, src, is_legacy ? 
AVX_256bit : vlen_enc); 1593 1594 switch (elem_bt) { 1595 case T_BYTE: /* nothing to do */ break; 1596 case T_SHORT: vpmovsxbw(dst, dst, vlen_enc); break; 1597 case T_INT: vpmovsxbd(dst, dst, vlen_enc); break; 1598 case T_FLOAT: vpmovsxbd(dst, dst, vlen_enc); break; 1599 case T_LONG: vpmovsxbq(dst, dst, vlen_enc); break; 1600 case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break; 1601 1602 default: assert(false, "%s", type2name(elem_bt)); 1603 } 1604 } 1605 } 1606 1607 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) { 1608 if (novlbwdq) { 1609 vpmovsxbd(xtmp, src, vlen_enc); 1610 evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()), 1611 Assembler::eq, true, vlen_enc, noreg); 1612 } else { 1613 vpxor(xtmp, xtmp, xtmp, vlen_enc); 1614 vpsubb(xtmp, xtmp, src, vlen_enc); 1615 evpmovb2m(dst, xtmp, vlen_enc); 1616 } 1617 } 1618 1619 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, Address src, int vlen_in_bytes) { 1620 if (is_integral_type(bt)) { 1621 switch (vlen_in_bytes) { 1622 case 4: movdl(dst, src); break; 1623 case 8: movq(dst, src); break; 1624 case 16: movdqu(dst, src); break; 1625 case 32: vmovdqu(dst, src); break; 1626 case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break; 1627 default: ShouldNotReachHere(); 1628 } 1629 } else { 1630 switch (vlen_in_bytes) { 1631 case 4: movflt(dst, src); break; 1632 case 8: movdbl(dst, src); break; 1633 case 16: movups(dst, src); break; 1634 case 32: vmovups(dst, src, Assembler::AVX_256bit); break; 1635 case 64: vmovups(dst, src, Assembler::AVX_512bit); break; 1636 default: ShouldNotReachHere(); 1637 } 1638 } 1639 } 1640 1641 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) { 1642 assert(rscratch != noreg || always_reachable(src), "missing"); 1643 1644 if (reachable(src)) { 1645 load_vector(bt, dst, as_Address(src), 
vlen_in_bytes); 1646 } else { 1647 lea(rscratch, src); 1648 load_vector(bt, dst, Address(rscratch, 0), vlen_in_bytes); 1649 } 1650 } 1651 1652 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) { 1653 int vlen_enc = vector_length_encoding(vlen); 1654 if (VM_Version::supports_avx()) { 1655 if (bt == T_LONG) { 1656 if (VM_Version::supports_avx2()) { 1657 vpbroadcastq(dst, src, vlen_enc); 1658 } else { 1659 vmovddup(dst, src, vlen_enc); 1660 } 1661 } else if (bt == T_DOUBLE) { 1662 if (vlen_enc != Assembler::AVX_128bit) { 1663 vbroadcastsd(dst, src, vlen_enc, noreg); 1664 } else { 1665 vmovddup(dst, src, vlen_enc); 1666 } 1667 } else { 1668 if (VM_Version::supports_avx2() && is_integral_type(bt)) { 1669 vpbroadcastd(dst, src, vlen_enc); 1670 } else { 1671 vbroadcastss(dst, src, vlen_enc); 1672 } 1673 } 1674 } else if (VM_Version::supports_sse3()) { 1675 movddup(dst, src); 1676 } else { 1677 load_vector(bt, dst, src, vlen); 1678 } 1679 } 1680 1681 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) { 1682 // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64. 1683 int offset = exact_log2(type2aelembytes(bt)) << 6; 1684 if (is_floating_point_type(bt)) { 1685 offset += 128; 1686 } 1687 ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset); 1688 load_vector(T_BYTE, dst, addr, vlen_in_bytes); 1689 } 1690 1691 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles. 
1692 1693 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 1694 int vector_len = Assembler::AVX_128bit; 1695 1696 switch (opcode) { 1697 case Op_AndReductionV: pand(dst, src); break; 1698 case Op_OrReductionV: por (dst, src); break; 1699 case Op_XorReductionV: pxor(dst, src); break; 1700 case Op_MinReductionV: 1701 switch (typ) { 1702 case T_BYTE: pminsb(dst, src); break; 1703 case T_SHORT: pminsw(dst, src); break; 1704 case T_INT: pminsd(dst, src); break; 1705 case T_LONG: assert(UseAVX > 2, "required"); 1706 vpminsq(dst, dst, src, Assembler::AVX_128bit); break; 1707 default: assert(false, "wrong type"); 1708 } 1709 break; 1710 case Op_MaxReductionV: 1711 switch (typ) { 1712 case T_BYTE: pmaxsb(dst, src); break; 1713 case T_SHORT: pmaxsw(dst, src); break; 1714 case T_INT: pmaxsd(dst, src); break; 1715 case T_LONG: assert(UseAVX > 2, "required"); 1716 vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break; 1717 default: assert(false, "wrong type"); 1718 } 1719 break; 1720 case Op_AddReductionVF: addss(dst, src); break; 1721 case Op_AddReductionVD: addsd(dst, src); break; 1722 case Op_AddReductionVI: 1723 switch (typ) { 1724 case T_BYTE: paddb(dst, src); break; 1725 case T_SHORT: paddw(dst, src); break; 1726 case T_INT: paddd(dst, src); break; 1727 default: assert(false, "wrong type"); 1728 } 1729 break; 1730 case Op_AddReductionVL: paddq(dst, src); break; 1731 case Op_MulReductionVF: mulss(dst, src); break; 1732 case Op_MulReductionVD: mulsd(dst, src); break; 1733 case Op_MulReductionVI: 1734 switch (typ) { 1735 case T_SHORT: pmullw(dst, src); break; 1736 case T_INT: pmulld(dst, src); break; 1737 default: assert(false, "wrong type"); 1738 } 1739 break; 1740 case Op_MulReductionVL: assert(UseAVX > 2, "required"); 1741 evpmullq(dst, dst, src, vector_len); break; 1742 default: assert(false, "wrong opcode"); 1743 } 1744 } 1745 1746 void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, 
XMMRegister dst, XMMRegister src) { 1747 switch (opcode) { 1748 case Op_AddReductionVF: addps(dst, src); break; 1749 case Op_AddReductionVD: addpd(dst, src); break; 1750 case Op_MulReductionVF: mulps(dst, src); break; 1751 case Op_MulReductionVD: mulpd(dst, src); break; 1752 default: assert(false, "%s", NodeClassNames[opcode]); 1753 } 1754 } 1755 1756 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 1757 int vector_len = Assembler::AVX_256bit; 1758 1759 switch (opcode) { 1760 case Op_AndReductionV: vpand(dst, src1, src2, vector_len); break; 1761 case Op_OrReductionV: vpor (dst, src1, src2, vector_len); break; 1762 case Op_XorReductionV: vpxor(dst, src1, src2, vector_len); break; 1763 case Op_MinReductionV: 1764 switch (typ) { 1765 case T_BYTE: vpminsb(dst, src1, src2, vector_len); break; 1766 case T_SHORT: vpminsw(dst, src1, src2, vector_len); break; 1767 case T_INT: vpminsd(dst, src1, src2, vector_len); break; 1768 case T_LONG: assert(UseAVX > 2, "required"); 1769 vpminsq(dst, src1, src2, vector_len); break; 1770 default: assert(false, "wrong type"); 1771 } 1772 break; 1773 case Op_MaxReductionV: 1774 switch (typ) { 1775 case T_BYTE: vpmaxsb(dst, src1, src2, vector_len); break; 1776 case T_SHORT: vpmaxsw(dst, src1, src2, vector_len); break; 1777 case T_INT: vpmaxsd(dst, src1, src2, vector_len); break; 1778 case T_LONG: assert(UseAVX > 2, "required"); 1779 vpmaxsq(dst, src1, src2, vector_len); break; 1780 default: assert(false, "wrong type"); 1781 } 1782 break; 1783 case Op_AddReductionVI: 1784 switch (typ) { 1785 case T_BYTE: vpaddb(dst, src1, src2, vector_len); break; 1786 case T_SHORT: vpaddw(dst, src1, src2, vector_len); break; 1787 case T_INT: vpaddd(dst, src1, src2, vector_len); break; 1788 default: assert(false, "wrong type"); 1789 } 1790 break; 1791 case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break; 1792 case Op_MulReductionVI: 1793 switch (typ) { 1794 case 
T_SHORT: vpmullw(dst, src1, src2, vector_len); break; 1795 case T_INT: vpmulld(dst, src1, src2, vector_len); break; 1796 default: assert(false, "wrong type"); 1797 } 1798 break; 1799 case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break; 1800 default: assert(false, "wrong opcode"); 1801 } 1802 } 1803 1804 void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 1805 int vector_len = Assembler::AVX_256bit; 1806 1807 switch (opcode) { 1808 case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break; 1809 case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break; 1810 case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break; 1811 case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break; 1812 default: assert(false, "%s", NodeClassNames[opcode]); 1813 } 1814 } 1815 1816 void C2_MacroAssembler::reduce_fp(int opcode, int vlen, 1817 XMMRegister dst, XMMRegister src, 1818 XMMRegister vtmp1, XMMRegister vtmp2) { 1819 switch (opcode) { 1820 case Op_AddReductionVF: 1821 case Op_MulReductionVF: 1822 reduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 1823 break; 1824 1825 case Op_AddReductionVD: 1826 case Op_MulReductionVD: 1827 reduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 1828 break; 1829 1830 default: assert(false, "wrong opcode"); 1831 } 1832 } 1833 1834 void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen, 1835 XMMRegister dst, XMMRegister src, 1836 XMMRegister vtmp1, XMMRegister vtmp2) { 1837 switch (opcode) { 1838 case Op_AddReductionVF: 1839 case Op_MulReductionVF: 1840 unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 1841 break; 1842 1843 case Op_AddReductionVD: 1844 case Op_MulReductionVD: 1845 unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 1846 break; 1847 1848 default: assert(false, "%s", NodeClassNames[opcode]); 1849 } 1850 } 1851 1852 void C2_MacroAssembler::reduceB(int opcode, int vlen, 1853 Register dst, 
// Byte multiply-reduction entry point: dispatch on vector element count.
void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
                                   Register dst, Register src1, XMMRegister src2,
                                   XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case 8:  mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;

    default: assert(false, "wrong vector length");
  }
}

// Short reduction entry point: dispatch on vector element count.
void C2_MacroAssembler::reduceS(int opcode, int vlen,
                                Register dst, Register src1, XMMRegister src2,
                                XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case 4:  reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 8:  reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;

    default: assert(false, "wrong vector length");
  }
}

// Int reduction entry point: dispatch on vector element count.
void C2_MacroAssembler::reduceI(int opcode, int vlen,
                                Register dst, Register src1, XMMRegister src2,
                                XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case 2:  reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 4:  reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 8:  reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;

    default: assert(false, "wrong vector length");
  }
}
// Long reduction entry point: dispatch on vector element count.
void C2_MacroAssembler::reduceL(int opcode, int vlen,
                                Register dst, Register src1, XMMRegister src2,
                                XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;

    default: assert(false, "wrong vector length");
  }
}

// Ordered float reduction: dispatch on element count. The 2- and 4-element
// variants need only one temp, so vtmp2 must be xnoreg there.
void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case 2:
      assert(vtmp2 == xnoreg, "");
      reduce2F(opcode, dst, src, vtmp1);
      break;
    case 4:
      assert(vtmp2 == xnoreg, "");
      reduce4F(opcode, dst, src, vtmp1);
      break;
    case 8:
      reduce8F(opcode, dst, src, vtmp1, vtmp2);
      break;
    case 16:
      reduce16F(opcode, dst, src, vtmp1, vtmp2);
      break;
    default: assert(false, "wrong vector length");
  }
}

// Ordered double reduction: dispatch on element count; the 2-element variant
// needs only one temp.
void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case 2:
      assert(vtmp2 == xnoreg, "");
      reduce2D(opcode, dst, src, vtmp1);
      break;
    case 4:
      reduce4D(opcode, dst, src, vtmp1, vtmp2);
      break;
    case 8:
      reduce8D(opcode, dst, src, vtmp1, vtmp2);
      break;
    default: assert(false, "wrong vector length");
  }
}

// Unordered float reduction: dispatch on element count; smaller variants use
// fewer temps (unused temps must be xnoreg).
void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case 2:
      assert(vtmp1 == xnoreg, "");
      assert(vtmp2 == xnoreg, "");
      unorderedReduce2F(opcode, dst, src);
      break;
    case 4:
      assert(vtmp2 == xnoreg, "");
      unorderedReduce4F(opcode, dst, src, vtmp1);
      break;
    case 8:
      unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2);
      break;
    case 16:
      unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2);
      break;
    default: assert(false, "wrong vector length");
  }
}
// Unordered double reduction: dispatch on element count; smaller variants use
// fewer temps (unused temps must be xnoreg).
void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case 2:
      assert(vtmp1 == xnoreg, "");
      assert(vtmp2 == xnoreg, "");
      unorderedReduce2D(opcode, dst, src);
      break;
    case 4:
      assert(vtmp2 == xnoreg, "");
      unorderedReduce4D(opcode, dst, src, vtmp1);
      break;
    case 8:
      unorderedReduce8D(opcode, dst, src, vtmp1, vtmp2);
      break;
    default: assert(false, "wrong vector length");
  }
}

// Reduce two ints (in src2) together with the scalar seed in src1;
// result goes to the GP register dst.
void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    // Horizontal add folds elements 0 and 1 into element 0.
    phaddd(vtmp1, vtmp1);
  } else {
    // Bring element 1 down to slot 0, then combine with the original.
    pshufd(vtmp1, src2, 0x1);
    reduce_operation_128(T_INT, opcode, vtmp1, src2);
  }
  // Fold in the scalar seed and move the result to the GP register.
  movdl(vtmp2, src1);
  reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
  movdl(dst, vtmp1);
}

// Reduce four ints: halve to two elements, then finish with reduce2I.
void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    phaddd(vtmp1, src2);
    reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    // 0xE moves the high 64 bits (elements 2,3) into the low half.
    pshufd(vtmp2, src2, 0xE);
    reduce_operation_128(T_INT, opcode, vtmp2, src2);
    reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
  }
}

// Reduce eight ints (256-bit): fold the upper 128-bit lane into the lower,
// then finish with the 128-bit paths.
void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
    vextracti128_high(vtmp2, vtmp1);
    vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
    reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    vextracti128_high(vtmp1, src2);
    reduce_operation_128(T_INT, opcode, vtmp1, src2);
    reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  }
}
// Reduce sixteen ints (512-bit): fold the upper 256 bits into the lower,
// then finish with reduce8I.
void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti64x4_high(vtmp2, src2);
  reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
  reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}

// Reduce eight bytes together with the scalar seed in src1.
// Successively halves the live byte count (4+4, 2+2, 1+1), widens the
// surviving byte, folds in the seed, and sign-extends the result into dst.
void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  pshufd(vtmp2, src2, 0x1);
  reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
  movdqu(vtmp1, vtmp2);
  psrldq(vtmp1, 2);
  reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
  movdqu(vtmp2, vtmp1);
  psrldq(vtmp2, 1);
  reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
  // Widen the reduced byte to int and combine with the seed as an int op.
  movdl(vtmp2, src1);
  pmovsxbd(vtmp1, vtmp1);
  reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
  pextrb(dst, vtmp1, 0x0);
  movsbl(dst, dst);
}

// Reduce sixteen bytes: fold high 8 bytes into low 8, then reduce8B.
void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  pshufd(vtmp1, src2, 0xE);
  reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
  reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}

// Reduce thirty-two bytes (256-bit): fold the upper lane, then reduce16B.
void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti128_high(vtmp2, src2);
  reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
  reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}

// Reduce sixty-four bytes (512-bit): fold the upper 256 bits, then reduce32B.
void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti64x4_high(vtmp1, src2);
  reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
  reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}
vextracti64x4_high(vtmp1, src2); 2069 reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2); 2070 reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2071 } 2072 2073 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2074 pmovsxbw(vtmp2, src2); 2075 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2076 } 2077 2078 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2079 if (UseAVX > 1) { 2080 int vector_len = Assembler::AVX_256bit; 2081 vpmovsxbw(vtmp1, src2, vector_len); 2082 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2083 } else { 2084 pmovsxbw(vtmp2, src2); 2085 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2086 pshufd(vtmp2, src2, 0x1); 2087 pmovsxbw(vtmp2, src2); 2088 reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2089 } 2090 } 2091 2092 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2093 if (UseAVX > 2 && VM_Version::supports_avx512bw()) { 2094 int vector_len = Assembler::AVX_512bit; 2095 vpmovsxbw(vtmp1, src2, vector_len); 2096 reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2097 } else { 2098 assert(UseAVX >= 2,"Should not reach here."); 2099 mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); 2100 vextracti128_high(vtmp2, src2); 2101 mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2102 } 2103 } 2104 2105 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2106 mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); 2107 vextracti64x4_high(vtmp2, src2); 2108 mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2109 } 2110 2111 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2112 if (opcode == Op_AddReductionVI) { 
2113 if (vtmp1 != src2) { 2114 movdqu(vtmp1, src2); 2115 } 2116 phaddw(vtmp1, vtmp1); 2117 phaddw(vtmp1, vtmp1); 2118 } else { 2119 pshufd(vtmp2, src2, 0x1); 2120 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2121 movdqu(vtmp1, vtmp2); 2122 psrldq(vtmp1, 2); 2123 reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2); 2124 } 2125 movdl(vtmp2, src1); 2126 pmovsxwd(vtmp1, vtmp1); 2127 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2128 pextrw(dst, vtmp1, 0x0); 2129 movswl(dst, dst); 2130 } 2131 2132 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2133 if (opcode == Op_AddReductionVI) { 2134 if (vtmp1 != src2) { 2135 movdqu(vtmp1, src2); 2136 } 2137 phaddw(vtmp1, src2); 2138 } else { 2139 pshufd(vtmp1, src2, 0xE); 2140 reduce_operation_128(T_SHORT, opcode, vtmp1, src2); 2141 } 2142 reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2143 } 2144 2145 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2146 if (opcode == Op_AddReductionVI) { 2147 int vector_len = Assembler::AVX_256bit; 2148 vphaddw(vtmp2, src2, src2, vector_len); 2149 vpermq(vtmp2, vtmp2, 0xD8, vector_len); 2150 } else { 2151 vextracti128_high(vtmp2, src2); 2152 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2153 } 2154 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2155 } 2156 2157 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2158 int vector_len = Assembler::AVX_256bit; 2159 vextracti64x4_high(vtmp1, src2); 2160 reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2); 2161 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2162 } 2163 2164 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2165 pshufd(vtmp2, src2, 0xE); 2166 reduce_operation_128(T_LONG, 
// Reduce four longs (256-bit): fold the upper lane, then reduce2L.
void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti128_high(vtmp1, src2);
  reduce_operation_128(T_LONG, opcode, vtmp1, src2);
  reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}

// Reduce eight longs (512-bit): fold the upper 256 bits, then reduce4L.
void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti64x4_high(vtmp2, src2);
  reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
  reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}

// Materialize an opmask register with the low 'len' bits set:
// temp = -1; temp = bzhi(temp, len) zeroes bits [len..63]; move to k-reg.
void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
  mov64(temp, -1L);
  bzhiq(temp, temp, len);
  kmovql(dst, temp);
}

// Ordered reduction of two floats: dst = (dst op src[0]) op src[1].
void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  reduce_operation_128(T_FLOAT, opcode, dst, src);
  pshufd(vtmp, src, 0x1);
  reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
}

// Ordered reduction of four floats: continue element-by-element after
// reduce2F, preserving strict left-to-right evaluation order.
void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  reduce2F(opcode, dst, src, vtmp);
  pshufd(vtmp, src, 0x2);
  reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
  pshufd(vtmp, src, 0x3);
  reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
}

// Ordered reduction of eight floats: low lane first, then the extracted
// high lane, keeping element order.
void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce4F(opcode, dst, src, vtmp2);
  vextractf128_high(vtmp2, src);
  reduce4F(opcode, dst, vtmp2, vtmp1);
}

// Ordered reduction of sixteen floats: low 256 bits first, then the
// extracted high 256 bits.
void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce8F(opcode, dst, src, vtmp1, vtmp2);
  vextracti64x4_high(vtmp1, src);
  reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
}
// Unordered reduction of two floats: dst = src[1] op src[0] (scalar combine).
void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) {
  pshufd(dst, src, 0x1);
  reduce_operation_128(T_FLOAT, opcode, dst, src);
}

// Unordered reduction of four floats: pairwise packed combine of the two
// halves, then finish with the 2-element step.
void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  pshufd(vtmp, src, 0xE);
  unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src);
  unorderedReduce2F(opcode, dst, vtmp);
}

// Unordered reduction of eight floats: fold the upper 128-bit lane into the
// lower with a packed op, then reduce the surviving four elements.
void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextractf128_high(vtmp1, src);
  unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src);
  unorderedReduce4F(opcode, dst, vtmp1, vtmp2);
}

// Unordered reduction of sixteen floats: fold the upper 256 bits, then
// reduce the surviving eight elements.
void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextractf64x4_high(vtmp2, src);
  unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src);
  unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2);
}

// Ordered reduction of two doubles: dst = (dst op src[0]) op src[1].
void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  reduce_operation_128(T_DOUBLE, opcode, dst, src);
  pshufd(vtmp, src, 0xE);
  reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
}

// Ordered reduction of four doubles: low lane first, then the high lane.
void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce2D(opcode, dst, src, vtmp2);
  vextractf128_high(vtmp2, src);
  reduce2D(opcode, dst, vtmp2, vtmp1);
}

// Ordered reduction of eight doubles: low 256 bits, then high 256 bits.
void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce4D(opcode, dst, src, vtmp1, vtmp2);
  vextracti64x4_high(vtmp1, src);
  reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
}

// Unordered reduction of two doubles: dst = src[1] op src[0].
void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) {
  pshufd(dst, src, 0xE);
  reduce_operation_128(T_DOUBLE, opcode, dst, src);
}
// Unordered reduction of four doubles: fold the upper 128-bit lane into the
// lower with a packed op, then the 2-element step.
void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  vextractf128_high(vtmp, src);
  unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src);
  unorderedReduce2D(opcode, dst, vtmp);
}

// Unordered reduction of eight doubles: fold the upper 256 bits, then the
// 4-element step.
void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextractf64x4_high(vtmp2, src);
  unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src);
  unorderedReduce4D(opcode, dst, vtmp2, vtmp1);
}

// Masked vector load: thin forwarder to the shared MacroAssembler helper.
void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
  MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
}

// Masked vector store: thin forwarder to the shared MacroAssembler helper.
void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
  MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
}

// Masked register-to-register vector move: thin forwarder.
void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len) {
  MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
}

// AVX masked vector load: choose the 32-bit or 64-bit element maskmov form
// based on the element type.
void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
                                 int vec_enc) {
  switch(elem_bt) {
    case T_INT:
    case T_FLOAT:
      vmaskmovps(dst, src, mask, vec_enc);
      break;
    case T_LONG:
    case T_DOUBLE:
      vmaskmovpd(dst, src, mask, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(elem_bt));
      break;
  }
}

// AVX masked vector store: choose the 32-bit or 64-bit element maskmov form
// based on the element type.
void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
                                 int vec_enc) {
  switch(elem_bt) {
    case T_INT:
    case T_FLOAT:
      vmaskmovps(dst, src, mask, vec_enc);
      break;
    case T_LONG:
    case T_DOUBLE:
      vmaskmovpd(dst, src, mask, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(elem_bt));
      break;
  }
}
// Min/max reduction over 'vlen' floats using a log2(vlen)-step halving
// scheme. Each iteration brings the upper half of the working vector down
// (lane extract or in-lane permute) and combines it with vminmax_fp, which
// implements Java min/max semantics (NaN/-0.0 handling) unless AVX10.2
// provides a direct instruction. If is_dst_valid, dst holds a prior partial
// result that is folded in at the end.
void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
                                          XMMRegister dst, XMMRegister src,
                                          XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                          XMMRegister xmm_0, XMMRegister xmm_1) {
  // Permute immediates for the last two in-lane steps (swap pairs / dwords).
  const int permconst[] = {1, 14};
  XMMRegister wsrc = src;
  XMMRegister wdst = xmm_0;
  XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;

  int vlen_enc = Assembler::AVX_128bit;
  if (vlen == 16) {
    vlen_enc = Assembler::AVX_256bit;
  }

  for (int i = log2(vlen) - 1; i >=0; i--) {
    // On the final step, write straight into dst when no merge is needed.
    if (i == 0 && !is_dst_valid) {
      wdst = dst;
    }
    if (i == 3) {
      vextracti64x4_high(wtmp, wsrc);
    } else if (i == 2) {
      vextracti128_high(wtmp, wsrc);
    } else { // i = [0,1]
      vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
    }

    if (VM_Version::supports_avx10_2()) {
      vminmax_fp(opcode, T_FLOAT, wdst, k0, wtmp, wsrc, vlen_enc);
    } else {
      vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
    }
    wsrc = wdst;
    // After the first extract, everything fits in 128 bits.
    vlen_enc = Assembler::AVX_128bit;
  }
  // Merge with the caller-provided partial result.
  if (is_dst_valid) {
    if (VM_Version::supports_avx10_2()) {
      vminmax_fp(opcode, T_FLOAT, dst, k0, wdst, dst, Assembler::AVX_128bit);
    } else {
      vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
    }
  }
}

// Min/max reduction over 'vlen' doubles; same halving scheme as the float
// variant, with double-width extracts/permutes.
void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                           XMMRegister xmm_0, XMMRegister xmm_1) {
  XMMRegister wsrc = src;
  XMMRegister wdst = xmm_0;
  XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
  int vlen_enc = Assembler::AVX_128bit;
  if (vlen == 8) {
    vlen_enc = Assembler::AVX_256bit;
  }
  for (int i = log2(vlen) - 1; i >=0; i--) {
    // On the final step, write straight into dst when no merge is needed.
    if (i == 0 && !is_dst_valid) {
      wdst = dst;
    }
    if (i == 1) {
      vextracti128_high(wtmp, wsrc);
    } else if (i == 2) {
      vextracti64x4_high(wtmp, wsrc);
    } else {
      assert(i == 0, "%d", i);
      vpermilpd(wtmp, wsrc, 1, vlen_enc);
    }

    if (VM_Version::supports_avx10_2()) {
      vminmax_fp(opcode, T_DOUBLE, wdst, k0, wtmp, wsrc, vlen_enc);
    } else {
      vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
    }

    wsrc = wdst;
    vlen_enc = Assembler::AVX_128bit;
  }

  // Merge with the caller-provided partial result.
  if (is_dst_valid) {
    if (VM_Version::supports_avx10_2()) {
      vminmax_fp(opcode, T_DOUBLE, dst, k0, wdst, dst, Assembler::AVX_128bit);
    } else {
      vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
    }
  }
}
xmm_0: xmm_1; 2369 int vlen_enc = Assembler::AVX_128bit; 2370 if (vlen == 8) { 2371 vlen_enc = Assembler::AVX_256bit; 2372 } 2373 for (int i = log2(vlen) - 1; i >=0; i--) { 2374 if (i == 0 && !is_dst_valid) { 2375 wdst = dst; 2376 } 2377 if (i == 1) { 2378 vextracti128_high(wtmp, wsrc); 2379 } else if (i == 2) { 2380 vextracti64x4_high(wtmp, wsrc); 2381 } else { 2382 assert(i == 0, "%d", i); 2383 vpermilpd(wtmp, wsrc, 1, vlen_enc); 2384 } 2385 2386 if (VM_Version::supports_avx10_2()) { 2387 vminmax_fp(opcode, T_DOUBLE, wdst, k0, wtmp, wsrc, vlen_enc); 2388 } else { 2389 vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2390 } 2391 2392 wsrc = wdst; 2393 vlen_enc = Assembler::AVX_128bit; 2394 } 2395 2396 if (is_dst_valid) { 2397 if (VM_Version::supports_avx10_2()) { 2398 vminmax_fp(opcode, T_DOUBLE, dst, k0, wdst, dst, Assembler::AVX_128bit); 2399 } else { 2400 vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2401 } 2402 } 2403 } 2404 2405 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) { 2406 switch (bt) { 2407 case T_BYTE: pextrb(dst, src, idx); break; 2408 case T_SHORT: pextrw(dst, src, idx); break; 2409 case T_INT: pextrd(dst, src, idx); break; 2410 case T_LONG: pextrq(dst, src, idx); break; 2411 2412 default: 2413 assert(false,"Should not reach here."); 2414 break; 2415 } 2416 } 2417 2418 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) { 2419 int esize = type2aelembytes(typ); 2420 int elem_per_lane = 16/esize; 2421 int lane = elemindex / elem_per_lane; 2422 int eindex = elemindex % elem_per_lane; 2423 2424 if (lane >= 2) { 2425 assert(UseAVX > 2, "required"); 2426 vextractf32x4(dst, src, lane & 3); 2427 return dst; 2428 } else if (lane > 0) { 2429 assert(UseAVX > 0, "required"); 2430 vextractf128(dst, src, lane); 2431 return dst; 2432 } else { 2433 return src; 2434 } 2435 } 2436 2437 void 
// Extract the integral element at 'elemindex' (within its 128-bit lane) of
// src into GP register dst, sign-extending sub-int types. Element 0 takes
// the cheap movd/movq path.
void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
  int esize = type2aelembytes(typ);
  int elem_per_lane = 16/esize;
  int eindex = elemindex % elem_per_lane;
  assert(is_integral_type(typ),"required");

  if (eindex == 0) {
    if (typ == T_LONG) {
      movq(dst, src);
    } else {
      movdl(dst, src);
      movsxl(typ, dst);
    }
  } else {
    extract(typ, dst, src, eindex);
    movsxl(typ, dst);
  }
}

// Extract the FP element at 'elemindex' (within its 128-bit lane) of src
// into the low part of XMM dst, then zero the upper bits for floats.
// vtmp is only needed on the SSE-only float path (for the 32-bit mask).
void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
  int esize = type2aelembytes(typ);
  int elem_per_lane = 16/esize;
  int eindex = elemindex % elem_per_lane;
  assert((typ == T_FLOAT || typ == T_DOUBLE),"required");

  if (eindex == 0) {
    movq(dst, src);
  } else {
    if (typ == T_FLOAT) {
      if (UseAVX == 0) {
        movdqu(dst, src);
        shufps(dst, dst, eindex);
      } else {
        vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
      }
    } else {
      if (UseAVX == 0) {
        movdqu(dst, src);
        psrldq(dst, eindex*esize);
      } else {
        vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
      }
      // movq clears bits 64..127 of dst.
      movq(dst, dst);
    }
  }
  // Zero upper bits
  if (typ == T_FLOAT) {
    if (UseAVX == 0) {
      assert(vtmp != xnoreg, "required.");
      movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
      pand(dst, vtmp);
    } else {
      vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
    }
  }
}

// AVX-512 masked integer compare: pick the evpcmp{b,w,d,q} form matching
// the element width; FP types compare their bit patterns at the same width.
void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
  switch(typ) {
    case T_BYTE:
    case T_BOOLEAN:
      evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
      break;
    case T_SHORT:
    case T_CHAR:
      evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
      break;
    case T_INT:
    case T_FLOAT:
      evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
      break;
    case T_LONG:
    case T_DOUBLE:
      evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}
case T_BYTE: 2505 case T_BOOLEAN: 2506 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2507 break; 2508 case T_SHORT: 2509 case T_CHAR: 2510 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2511 break; 2512 case T_INT: 2513 case T_FLOAT: 2514 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2515 break; 2516 case T_LONG: 2517 case T_DOUBLE: 2518 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2519 break; 2520 default: 2521 assert(false,"Should not reach here."); 2522 break; 2523 } 2524 } 2525 2526 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) { 2527 assert(rscratch != noreg || always_reachable(src2), "missing"); 2528 2529 switch(typ) { 2530 case T_BOOLEAN: 2531 case T_BYTE: 2532 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2533 break; 2534 case T_CHAR: 2535 case T_SHORT: 2536 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2537 break; 2538 case T_INT: 2539 case T_FLOAT: 2540 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2541 break; 2542 case T_LONG: 2543 case T_DOUBLE: 2544 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2545 break; 2546 default: 2547 assert(false,"Should not reach here."); 2548 break; 2549 } 2550 } 2551 2552 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) { 2553 switch(typ) { 2554 case T_BYTE: 2555 evpblendmb(dst, kmask, src1, src2, merge, vector_len); 2556 break; 2557 case T_SHORT: 2558 evpblendmw(dst, kmask, src1, src2, merge, vector_len); 2559 break; 2560 case T_INT: 2561 case T_FLOAT: 2562 evpblendmd(dst, kmask, src1, src2, merge, vector_len); 
// Emit a vector test (sets flags for a subsequent branch) of src1 against
// src2 for the given vector width in bytes. Sub-16-byte vectors duplicate
// their low part so the full 128-bit test only sees valid data; src2 is
// assumed to already be safe to test at full width.
void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) {
  assert(vlen_in_bytes <= 32, "");
  int esize = type2aelembytes(bt);
  if (vlen_in_bytes == 32) {
    assert(vtmp == xnoreg, "required.");
    if (esize >= 4) {
      vtestps(src1, src2, AVX_256bit);
    } else {
      vptest(src1, src2, AVX_256bit);
    }
    return;
  }
  if (vlen_in_bytes < 16) {
    // Duplicate the lower part to fill the whole register,
    // Don't need to do so for src2
    assert(vtmp != xnoreg, "required");
    int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04;
    pshufd(vtmp, src1, shuffle_imm);
  } else {
    assert(vtmp == xnoreg, "required");
    vtmp = src1;
  }
  if (esize >= 4 && VM_Version::supports_avx()) {
    vtestps(vtmp, src2, AVX_128bit);
  } else {
    ptest(vtmp, src2);
  }
}

// Element-wise vector add, dispatched on element type. Byte/short forms at
// 512-bit width require AVX512BW; without it the operands must be width- and
// register-constrained, which the ASSERT block checks.
void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
#ifdef ASSERT
  bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
  bool is_bw_supported = VM_Version::supports_avx512bw();
  if (is_bw && !is_bw_supported) {
    assert(vlen_enc != Assembler::AVX_512bit, "required");
    assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
           "XMM register should be 0-15");
  }
#endif // ASSERT
  switch (elem_bt) {
    case T_BYTE:   vpaddb(dst, src1, src2, vlen_enc); return;
    case T_SHORT:  vpaddw(dst, src1, src2, vlen_enc); return;
    case T_INT:    vpaddd(dst, src1, src2, vlen_enc); return;
    case T_FLOAT:  vaddps(dst, src1, src2, vlen_enc); return;
    case T_LONG:   vpaddq(dst, src1, src2, vlen_enc); return;
    case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
    default: fatal("Unsupported type %s", type2name(elem_bt)); return;
  }
}
// Broadcast a GP-register scalar into every element of a vector register.
// With AVX-512 (and the needed BW/VL extensions) use the GPR-source
// evpbroadcast forms directly; otherwise go through an XMM move plus the
// AVX2 register-source broadcast, which is limited to XMM0-15 and <=256 bits.
void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
  assert(UseAVX >= 2, "required");
  bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
  bool is_vl = vlen_enc != Assembler::AVX_512bit;
  if ((UseAVX > 2) &&
      (!is_bw || VM_Version::supports_avx512bw()) &&
      (!is_vl || VM_Version::supports_avx512vl())) {
    switch (elem_bt) {
      case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
      case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
      case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
      case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
      default: fatal("Unsupported type %s", type2name(elem_bt)); return;
    }
  } else {
    assert(vlen_enc != Assembler::AVX_512bit, "required");
    assert((dst->encoding() < 16),"XMM register should be 0-15");
    switch (elem_bt) {
      case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
      case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
      case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
      case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
      case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
      case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
      default: fatal("Unsupported type %s", type2name(elem_bt)); return;
    }
  }
}

// Sign-extending conversion from a vector of bytes to a vector of the given
// wider element type. For T_FLOAT/T_DOUBLE the bytes are widened to ints
// first and then converted; the double path widens at half the target
// width since each byte becomes a 64-bit element.
void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
  switch (to_elem_bt) {
    case T_SHORT:
      vpmovsxbw(dst, src, vlen_enc);
      break;
    case T_INT:
      vpmovsxbd(dst, src, vlen_enc);
      break;
    case T_FLOAT:
      vpmovsxbd(dst, src, vlen_enc);
      vcvtdq2ps(dst, dst, vlen_enc);
      break;
    case T_LONG:
      vpmovsxbq(dst, src, vlen_enc);
      break;
    case T_DOUBLE: {
      int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
      vpmovsxbd(dst, src, mid_vlen_enc);
      vcvtdq2pd(dst, dst, vlen_enc);
      break;
    }
    default:
      fatal("Unsupported type %s", type2name(to_elem_bt));
      break;
  }
}
Address::times_1 : scale1; 2708 2709 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, 2710 RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR, 2711 MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE; 2712 2713 // Note, inline_string_indexOf() generates checks: 2714 // if (substr.count > string.count) return -1; 2715 // if (substr.count == 0) return 0; 2716 assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars"); 2717 2718 // Load substring. 2719 if (ae == StrIntrinsicNode::UL) { 2720 pmovzxbw(vec, Address(str2, 0)); 2721 } else { 2722 movdqu(vec, Address(str2, 0)); 2723 } 2724 movl(cnt2, int_cnt2); 2725 movptr(result, str1); // string addr 2726 2727 if (int_cnt2 > stride) { 2728 jmpb(SCAN_TO_SUBSTR); 2729 2730 // Reload substr for rescan, this code 2731 // is executed only for large substrings (> 8 chars) 2732 bind(RELOAD_SUBSTR); 2733 if (ae == StrIntrinsicNode::UL) { 2734 pmovzxbw(vec, Address(str2, 0)); 2735 } else { 2736 movdqu(vec, Address(str2, 0)); 2737 } 2738 negptr(cnt2); // Jumped here with negative cnt2, convert to positive 2739 2740 bind(RELOAD_STR); 2741 // We came here after the beginning of the substring was 2742 // matched but the rest of it was not so we need to search 2743 // again. Start from the next element after the previous match. 2744 2745 // cnt2 is number of substring reminding elements and 2746 // cnt1 is number of string reminding elements when cmp failed. 
2747 // Restored cnt1 = cnt1 - cnt2 + int_cnt2 2748 subl(cnt1, cnt2); 2749 addl(cnt1, int_cnt2); 2750 movl(cnt2, int_cnt2); // Now restore cnt2 2751 2752 decrementl(cnt1); // Shift to next element 2753 cmpl(cnt1, cnt2); 2754 jcc(Assembler::negative, RET_NOT_FOUND); // Left less then substring 2755 2756 addptr(result, (1<<scale1)); 2757 2758 } // (int_cnt2 > 8) 2759 2760 // Scan string for start of substr in 16-byte vectors 2761 bind(SCAN_TO_SUBSTR); 2762 pcmpestri(vec, Address(result, 0), mode); 2763 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 2764 subl(cnt1, stride); 2765 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 2766 cmpl(cnt1, cnt2); 2767 jccb(Assembler::negative, RET_NOT_FOUND); // Left less then substring 2768 addptr(result, 16); 2769 jmpb(SCAN_TO_SUBSTR); 2770 2771 // Found a potential substr 2772 bind(FOUND_CANDIDATE); 2773 // Matched whole vector if first element matched (tmp(rcx) == 0). 2774 if (int_cnt2 == stride) { 2775 jccb(Assembler::overflow, RET_FOUND); // OF == 1 2776 } else { // int_cnt2 > 8 2777 jccb(Assembler::overflow, FOUND_SUBSTR); 2778 } 2779 // After pcmpestri tmp(rcx) contains matched element index 2780 // Compute start addr of substr 2781 lea(result, Address(result, tmp, scale1)); 2782 2783 // Make sure string is still long enough 2784 subl(cnt1, tmp); 2785 cmpl(cnt1, cnt2); 2786 if (int_cnt2 == stride) { 2787 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 2788 } else { // int_cnt2 > 8 2789 jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD); 2790 } 2791 // Left less then substring. 2792 2793 bind(RET_NOT_FOUND); 2794 movl(result, -1); 2795 jmp(EXIT); 2796 2797 if (int_cnt2 > stride) { 2798 // This code is optimized for the case when whole substring 2799 // is matched if its head is matched. 
2800 bind(MATCH_SUBSTR_HEAD); 2801 pcmpestri(vec, Address(result, 0), mode); 2802 // Reload only string if does not match 2803 jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0 2804 2805 Label CONT_SCAN_SUBSTR; 2806 // Compare the rest of substring (> 8 chars). 2807 bind(FOUND_SUBSTR); 2808 // First 8 chars are already matched. 2809 negptr(cnt2); 2810 addptr(cnt2, stride); 2811 2812 bind(SCAN_SUBSTR); 2813 subl(cnt1, stride); 2814 cmpl(cnt2, -stride); // Do not read beyond substring 2815 jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR); 2816 // Back-up strings to avoid reading beyond substring: 2817 // cnt1 = cnt1 - cnt2 + 8 2818 addl(cnt1, cnt2); // cnt2 is negative 2819 addl(cnt1, stride); 2820 movl(cnt2, stride); negptr(cnt2); 2821 bind(CONT_SCAN_SUBSTR); 2822 if (int_cnt2 < (int)G) { 2823 int tail_off1 = int_cnt2<<scale1; 2824 int tail_off2 = int_cnt2<<scale2; 2825 if (ae == StrIntrinsicNode::UL) { 2826 pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2)); 2827 } else { 2828 movdqu(vec, Address(str2, cnt2, scale2, tail_off2)); 2829 } 2830 pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode); 2831 } else { 2832 // calculate index in register to avoid integer overflow (int_cnt2*2) 2833 movl(tmp, int_cnt2); 2834 addptr(tmp, cnt2); 2835 if (ae == StrIntrinsicNode::UL) { 2836 pmovzxbw(vec, Address(str2, tmp, scale2, 0)); 2837 } else { 2838 movdqu(vec, Address(str2, tmp, scale2, 0)); 2839 } 2840 pcmpestri(vec, Address(result, tmp, scale1, 0), mode); 2841 } 2842 // Need to reload strings pointers if not matched whole vector 2843 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 2844 addptr(cnt2, stride); 2845 jcc(Assembler::negative, SCAN_SUBSTR); 2846 // Fall through if found full substring 2847 2848 } // (int_cnt2 > 8) 2849 2850 bind(RET_FOUND); 2851 // Found result if we matched full small substring. 
2852 // Compute substr offset 2853 subptr(result, str1); 2854 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 2855 shrl(result, 1); // index 2856 } 2857 bind(EXIT); 2858 2859 } // string_indexofC8 2860 2861 // Small strings are loaded through stack if they cross page boundary. 2862 void C2_MacroAssembler::string_indexof(Register str1, Register str2, 2863 Register cnt1, Register cnt2, 2864 int int_cnt2, Register result, 2865 XMMRegister vec, Register tmp, 2866 int ae) { 2867 ShortBranchVerifier sbv(this); 2868 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2869 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2870 2871 // 2872 // int_cnt2 is length of small (< 8 chars) constant substring 2873 // or (-1) for non constant substring in which case its length 2874 // is in cnt2 register. 2875 // 2876 // Note, inline_string_indexOf() generates checks: 2877 // if (substr.count > string.count) return -1; 2878 // if (substr.count == 0) return 0; 2879 // 2880 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2881 assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0"); 2882 // This method uses the pcmpestri instruction with bound registers 2883 // inputs: 2884 // xmm - substring 2885 // rax - substring length (elements count) 2886 // mem - scanned string 2887 // rdx - string length (elements count) 2888 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2889 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2890 // outputs: 2891 // rcx - matched index in string 2892 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2893 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2894 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2895 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? 
Address::times_1 : scale1; 2896 2897 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR, 2898 RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR, 2899 FOUND_CANDIDATE; 2900 2901 { //======================================================== 2902 // We don't know where these strings are located 2903 // and we can't read beyond them. Load them through stack. 2904 Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR; 2905 2906 movptr(tmp, rsp); // save old SP 2907 2908 if (int_cnt2 > 0) { // small (< 8 chars) constant substring 2909 if (int_cnt2 == (1>>scale2)) { // One byte 2910 assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding"); 2911 load_unsigned_byte(result, Address(str2, 0)); 2912 movdl(vec, result); // move 32 bits 2913 } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes 2914 // Not enough header space in 32-bit VM: 12+3 = 15. 2915 movl(result, Address(str2, -1)); 2916 shrl(result, 8); 2917 movdl(vec, result); // move 32 bits 2918 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char 2919 load_unsigned_short(result, Address(str2, 0)); 2920 movdl(vec, result); // move 32 bits 2921 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars 2922 movdl(vec, Address(str2, 0)); // move 32 bits 2923 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars 2924 movq(vec, Address(str2, 0)); // move 64 bits 2925 } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7}) 2926 // Array header size is 12 bytes in 32-bit VM 2927 // + 6 bytes for 3 chars == 18 bytes, 2928 // enough space to load vec and shift. 
2929 assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity"); 2930 if (ae == StrIntrinsicNode::UL) { 2931 int tail_off = int_cnt2-8; 2932 pmovzxbw(vec, Address(str2, tail_off)); 2933 psrldq(vec, -2*tail_off); 2934 } 2935 else { 2936 int tail_off = int_cnt2*(1<<scale2); 2937 movdqu(vec, Address(str2, tail_off-16)); 2938 psrldq(vec, 16-tail_off); 2939 } 2940 } 2941 } else { // not constant substring 2942 cmpl(cnt2, stride); 2943 jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough 2944 2945 // We can read beyond string if srt+16 does not cross page boundary 2946 // since heaps are aligned and mapped by pages. 2947 assert(os::vm_page_size() < (int)G, "default page should be small"); 2948 movl(result, str2); // We need only low 32 bits 2949 andl(result, ((int)os::vm_page_size()-1)); 2950 cmpl(result, ((int)os::vm_page_size()-16)); 2951 jccb(Assembler::belowEqual, CHECK_STR); 2952 2953 // Move small strings to stack to allow load 16 bytes into vec. 2954 subptr(rsp, 16); 2955 int stk_offset = wordSize-(1<<scale2); 2956 push(cnt2); 2957 2958 bind(COPY_SUBSTR); 2959 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) { 2960 load_unsigned_byte(result, Address(str2, cnt2, scale2, -1)); 2961 movb(Address(rsp, cnt2, scale2, stk_offset), result); 2962 } else if (ae == StrIntrinsicNode::UU) { 2963 load_unsigned_short(result, Address(str2, cnt2, scale2, -2)); 2964 movw(Address(rsp, cnt2, scale2, stk_offset), result); 2965 } 2966 decrement(cnt2); 2967 jccb(Assembler::notZero, COPY_SUBSTR); 2968 2969 pop(cnt2); 2970 movptr(str2, rsp); // New substring address 2971 } // non constant 2972 2973 bind(CHECK_STR); 2974 cmpl(cnt1, stride); 2975 jccb(Assembler::aboveEqual, BIG_STRINGS); 2976 2977 // Check cross page boundary. 
2978 movl(result, str1); // We need only low 32 bits 2979 andl(result, ((int)os::vm_page_size()-1)); 2980 cmpl(result, ((int)os::vm_page_size()-16)); 2981 jccb(Assembler::belowEqual, BIG_STRINGS); 2982 2983 subptr(rsp, 16); 2984 int stk_offset = -(1<<scale1); 2985 if (int_cnt2 < 0) { // not constant 2986 push(cnt2); 2987 stk_offset += wordSize; 2988 } 2989 movl(cnt2, cnt1); 2990 2991 bind(COPY_STR); 2992 if (ae == StrIntrinsicNode::LL) { 2993 load_unsigned_byte(result, Address(str1, cnt2, scale1, -1)); 2994 movb(Address(rsp, cnt2, scale1, stk_offset), result); 2995 } else { 2996 load_unsigned_short(result, Address(str1, cnt2, scale1, -2)); 2997 movw(Address(rsp, cnt2, scale1, stk_offset), result); 2998 } 2999 decrement(cnt2); 3000 jccb(Assembler::notZero, COPY_STR); 3001 3002 if (int_cnt2 < 0) { // not constant 3003 pop(cnt2); 3004 } 3005 movptr(str1, rsp); // New string address 3006 3007 bind(BIG_STRINGS); 3008 // Load substring. 3009 if (int_cnt2 < 0) { // -1 3010 if (ae == StrIntrinsicNode::UL) { 3011 pmovzxbw(vec, Address(str2, 0)); 3012 } else { 3013 movdqu(vec, Address(str2, 0)); 3014 } 3015 push(cnt2); // substr count 3016 push(str2); // substr addr 3017 push(str1); // string addr 3018 } else { 3019 // Small (< 8 chars) constant substrings are loaded already. 
3020 movl(cnt2, int_cnt2); 3021 } 3022 push(tmp); // original SP 3023 3024 } // Finished loading 3025 3026 //======================================================== 3027 // Start search 3028 // 3029 3030 movptr(result, str1); // string addr 3031 3032 if (int_cnt2 < 0) { // Only for non constant substring 3033 jmpb(SCAN_TO_SUBSTR); 3034 3035 // SP saved at sp+0 3036 // String saved at sp+1*wordSize 3037 // Substr saved at sp+2*wordSize 3038 // Substr count saved at sp+3*wordSize 3039 3040 // Reload substr for rescan, this code 3041 // is executed only for large substrings (> 8 chars) 3042 bind(RELOAD_SUBSTR); 3043 movptr(str2, Address(rsp, 2*wordSize)); 3044 movl(cnt2, Address(rsp, 3*wordSize)); 3045 if (ae == StrIntrinsicNode::UL) { 3046 pmovzxbw(vec, Address(str2, 0)); 3047 } else { 3048 movdqu(vec, Address(str2, 0)); 3049 } 3050 // We came here after the beginning of the substring was 3051 // matched but the rest of it was not so we need to search 3052 // again. Start from the next element after the previous match. 
3053 subptr(str1, result); // Restore counter 3054 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3055 shrl(str1, 1); 3056 } 3057 addl(cnt1, str1); 3058 decrementl(cnt1); // Shift to next element 3059 cmpl(cnt1, cnt2); 3060 jcc(Assembler::negative, RET_NOT_FOUND); // Left less then substring 3061 3062 addptr(result, (1<<scale1)); 3063 } // non constant 3064 3065 // Scan string for start of substr in 16-byte vectors 3066 bind(SCAN_TO_SUBSTR); 3067 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 3068 pcmpestri(vec, Address(result, 0), mode); 3069 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 3070 subl(cnt1, stride); 3071 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 3072 cmpl(cnt1, cnt2); 3073 jccb(Assembler::negative, RET_NOT_FOUND); // Left less then substring 3074 addptr(result, 16); 3075 3076 bind(ADJUST_STR); 3077 cmpl(cnt1, stride); // Do not read beyond string 3078 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 3079 // Back-up string to avoid reading beyond string. 3080 lea(result, Address(result, cnt1, scale1, -16)); 3081 movl(cnt1, stride); 3082 jmpb(SCAN_TO_SUBSTR); 3083 3084 // Found a potential substr 3085 bind(FOUND_CANDIDATE); 3086 // After pcmpestri tmp(rcx) contains matched element index 3087 3088 // Make sure string is still long enough 3089 subl(cnt1, tmp); 3090 cmpl(cnt1, cnt2); 3091 jccb(Assembler::greaterEqual, FOUND_SUBSTR); 3092 // Left less then substring. 3093 3094 bind(RET_NOT_FOUND); 3095 movl(result, -1); 3096 jmp(CLEANUP); 3097 3098 bind(FOUND_SUBSTR); 3099 // Compute start addr of substr 3100 lea(result, Address(result, tmp, scale1)); 3101 if (int_cnt2 > 0) { // Constant substring 3102 // Repeat search for small substring (< 8 chars) 3103 // from new point without reloading substring. 3104 // Have to check that we don't read beyond string. 3105 cmpl(tmp, stride-int_cnt2); 3106 jccb(Assembler::greater, ADJUST_STR); 3107 // Fall through if matched whole substring. 
3108 } else { // non constant 3109 assert(int_cnt2 == -1, "should be != 0"); 3110 3111 addl(tmp, cnt2); 3112 // Found result if we matched whole substring. 3113 cmpl(tmp, stride); 3114 jcc(Assembler::lessEqual, RET_FOUND); 3115 3116 // Repeat search for small substring (<= 8 chars) 3117 // from new point 'str1' without reloading substring. 3118 cmpl(cnt2, stride); 3119 // Have to check that we don't read beyond string. 3120 jccb(Assembler::lessEqual, ADJUST_STR); 3121 3122 Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG; 3123 // Compare the rest of substring (> 8 chars). 3124 movptr(str1, result); 3125 3126 cmpl(tmp, cnt2); 3127 // First 8 chars are already matched. 3128 jccb(Assembler::equal, CHECK_NEXT); 3129 3130 bind(SCAN_SUBSTR); 3131 pcmpestri(vec, Address(str1, 0), mode); 3132 // Need to reload strings pointers if not matched whole vector 3133 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 3134 3135 bind(CHECK_NEXT); 3136 subl(cnt2, stride); 3137 jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring 3138 addptr(str1, 16); 3139 if (ae == StrIntrinsicNode::UL) { 3140 addptr(str2, 8); 3141 } else { 3142 addptr(str2, 16); 3143 } 3144 subl(cnt1, stride); 3145 cmpl(cnt2, stride); // Do not read beyond substring 3146 jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR); 3147 // Back-up strings to avoid reading beyond substring. 
3148 3149 if (ae == StrIntrinsicNode::UL) { 3150 lea(str2, Address(str2, cnt2, scale2, -8)); 3151 lea(str1, Address(str1, cnt2, scale1, -16)); 3152 } else { 3153 lea(str2, Address(str2, cnt2, scale2, -16)); 3154 lea(str1, Address(str1, cnt2, scale1, -16)); 3155 } 3156 subl(cnt1, cnt2); 3157 movl(cnt2, stride); 3158 addl(cnt1, stride); 3159 bind(CONT_SCAN_SUBSTR); 3160 if (ae == StrIntrinsicNode::UL) { 3161 pmovzxbw(vec, Address(str2, 0)); 3162 } else { 3163 movdqu(vec, Address(str2, 0)); 3164 } 3165 jmp(SCAN_SUBSTR); 3166 3167 bind(RET_FOUND_LONG); 3168 movptr(str1, Address(rsp, wordSize)); 3169 } // non constant 3170 3171 bind(RET_FOUND); 3172 // Compute substr offset 3173 subptr(result, str1); 3174 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3175 shrl(result, 1); // index 3176 } 3177 bind(CLEANUP); 3178 pop(rsp); // restore SP 3179 3180 } // string_indexof 3181 3182 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3183 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3184 ShortBranchVerifier sbv(this); 3185 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3186 3187 int stride = 8; 3188 3189 Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP, 3190 SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP, 3191 RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT, 3192 FOUND_SEQ_CHAR, DONE_LABEL; 3193 3194 movptr(result, str1); 3195 if (UseAVX >= 2) { 3196 cmpl(cnt1, stride); 3197 jcc(Assembler::less, SCAN_TO_CHAR); 3198 cmpl(cnt1, 2*stride); 3199 jcc(Assembler::less, SCAN_TO_8_CHAR_INIT); 3200 movdl(vec1, ch); 3201 vpbroadcastw(vec1, vec1, Assembler::AVX_256bit); 3202 vpxor(vec2, vec2); 3203 movl(tmp, cnt1); 3204 andl(tmp, 0xFFFFFFF0); //vector count (in chars) 3205 andl(cnt1,0x0000000F); //tail count (in chars) 3206 3207 bind(SCAN_TO_16_CHAR_LOOP); 3208 vmovdqu(vec3, Address(result, 0)); 3209 vpcmpeqw(vec3, vec3, vec1, 1); 3210 vptest(vec2, vec3); 3211 
jcc(Assembler::carryClear, FOUND_CHAR); 3212 addptr(result, 32); 3213 subl(tmp, 2*stride); 3214 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); 3215 jmp(SCAN_TO_8_CHAR); 3216 bind(SCAN_TO_8_CHAR_INIT); 3217 movdl(vec1, ch); 3218 pshuflw(vec1, vec1, 0x00); 3219 pshufd(vec1, vec1, 0); 3220 pxor(vec2, vec2); 3221 } 3222 bind(SCAN_TO_8_CHAR); 3223 cmpl(cnt1, stride); 3224 jcc(Assembler::less, SCAN_TO_CHAR); 3225 if (UseAVX < 2) { 3226 movdl(vec1, ch); 3227 pshuflw(vec1, vec1, 0x00); 3228 pshufd(vec1, vec1, 0); 3229 pxor(vec2, vec2); 3230 } 3231 movl(tmp, cnt1); 3232 andl(tmp, 0xFFFFFFF8); //vector count (in chars) 3233 andl(cnt1,0x00000007); //tail count (in chars) 3234 3235 bind(SCAN_TO_8_CHAR_LOOP); 3236 movdqu(vec3, Address(result, 0)); 3237 pcmpeqw(vec3, vec1); 3238 ptest(vec2, vec3); 3239 jcc(Assembler::carryClear, FOUND_CHAR); 3240 addptr(result, 16); 3241 subl(tmp, stride); 3242 jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP); 3243 bind(SCAN_TO_CHAR); 3244 testl(cnt1, cnt1); 3245 jcc(Assembler::zero, RET_NOT_FOUND); 3246 bind(SCAN_TO_CHAR_LOOP); 3247 load_unsigned_short(tmp, Address(result, 0)); 3248 cmpl(ch, tmp); 3249 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3250 addptr(result, 2); 3251 subl(cnt1, 1); 3252 jccb(Assembler::zero, RET_NOT_FOUND); 3253 jmp(SCAN_TO_CHAR_LOOP); 3254 3255 bind(RET_NOT_FOUND); 3256 movl(result, -1); 3257 jmpb(DONE_LABEL); 3258 3259 bind(FOUND_CHAR); 3260 if (UseAVX >= 2) { 3261 vpmovmskb(tmp, vec3); 3262 } else { 3263 pmovmskb(tmp, vec3); 3264 } 3265 bsfl(ch, tmp); 3266 addptr(result, ch); 3267 3268 bind(FOUND_SEQ_CHAR); 3269 subptr(result, str1); 3270 shrl(result, 1); 3271 3272 bind(DONE_LABEL); 3273 } // string_indexof_char 3274 3275 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3276 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3277 ShortBranchVerifier sbv(this); 3278 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3279 3280 int stride = 16; 
3281 3282 Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP, 3283 SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP, 3284 RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT, 3285 FOUND_SEQ_CHAR, DONE_LABEL; 3286 3287 movptr(result, str1); 3288 if (UseAVX >= 2) { 3289 cmpl(cnt1, stride); 3290 jcc(Assembler::less, SCAN_TO_CHAR_INIT); 3291 cmpl(cnt1, stride*2); 3292 jcc(Assembler::less, SCAN_TO_16_CHAR_INIT); 3293 movdl(vec1, ch); 3294 vpbroadcastb(vec1, vec1, Assembler::AVX_256bit); 3295 vpxor(vec2, vec2); 3296 movl(tmp, cnt1); 3297 andl(tmp, 0xFFFFFFE0); //vector count (in chars) 3298 andl(cnt1,0x0000001F); //tail count (in chars) 3299 3300 bind(SCAN_TO_32_CHAR_LOOP); 3301 vmovdqu(vec3, Address(result, 0)); 3302 vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit); 3303 vptest(vec2, vec3); 3304 jcc(Assembler::carryClear, FOUND_CHAR); 3305 addptr(result, 32); 3306 subl(tmp, stride*2); 3307 jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP); 3308 jmp(SCAN_TO_16_CHAR); 3309 3310 bind(SCAN_TO_16_CHAR_INIT); 3311 movdl(vec1, ch); 3312 pxor(vec2, vec2); 3313 pshufb(vec1, vec2); 3314 } 3315 3316 bind(SCAN_TO_16_CHAR); 3317 cmpl(cnt1, stride); 3318 jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left 3319 if (UseAVX < 2) { 3320 movdl(vec1, ch); 3321 pxor(vec2, vec2); 3322 pshufb(vec1, vec2); 3323 } 3324 movl(tmp, cnt1); 3325 andl(tmp, 0xFFFFFFF0); //vector count (in bytes) 3326 andl(cnt1,0x0000000F); //tail count (in bytes) 3327 3328 bind(SCAN_TO_16_CHAR_LOOP); 3329 movdqu(vec3, Address(result, 0)); 3330 pcmpeqb(vec3, vec1); 3331 ptest(vec2, vec3); 3332 jcc(Assembler::carryClear, FOUND_CHAR); 3333 addptr(result, 16); 3334 subl(tmp, stride); 3335 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items... 
3336 3337 bind(SCAN_TO_CHAR_INIT); 3338 testl(cnt1, cnt1); 3339 jcc(Assembler::zero, RET_NOT_FOUND); 3340 bind(SCAN_TO_CHAR_LOOP); 3341 load_unsigned_byte(tmp, Address(result, 0)); 3342 cmpl(ch, tmp); 3343 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3344 addptr(result, 1); 3345 subl(cnt1, 1); 3346 jccb(Assembler::zero, RET_NOT_FOUND); 3347 jmp(SCAN_TO_CHAR_LOOP); 3348 3349 bind(RET_NOT_FOUND); 3350 movl(result, -1); 3351 jmpb(DONE_LABEL); 3352 3353 bind(FOUND_CHAR); 3354 if (UseAVX >= 2) { 3355 vpmovmskb(tmp, vec3); 3356 } else { 3357 pmovmskb(tmp, vec3); 3358 } 3359 bsfl(ch, tmp); 3360 addptr(result, ch); 3361 3362 bind(FOUND_SEQ_CHAR); 3363 subptr(result, str1); 3364 3365 bind(DONE_LABEL); 3366 } // stringL_indexof_char 3367 3368 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) { 3369 switch (eltype) { 3370 case T_BOOLEAN: return sizeof(jboolean); 3371 case T_BYTE: return sizeof(jbyte); 3372 case T_SHORT: return sizeof(jshort); 3373 case T_CHAR: return sizeof(jchar); 3374 case T_INT: return sizeof(jint); 3375 default: 3376 ShouldNotReachHere(); 3377 return -1; 3378 } 3379 } 3380 3381 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) { 3382 switch (eltype) { 3383 // T_BOOLEAN used as surrogate for unsigned byte 3384 case T_BOOLEAN: movzbl(dst, src); break; 3385 case T_BYTE: movsbl(dst, src); break; 3386 case T_SHORT: movswl(dst, src); break; 3387 case T_CHAR: movzwl(dst, src); break; 3388 case T_INT: movl(dst, src); break; 3389 default: 3390 ShouldNotReachHere(); 3391 } 3392 } 3393 3394 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) { 3395 load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8); 3396 } 3397 3398 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) { 3399 load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8); 3400 } 3401 3402 void 
C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) { 3403 const int vlen = Assembler::AVX_256bit; 3404 switch (eltype) { 3405 case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3406 case T_BYTE: vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3407 case T_SHORT: vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3408 case T_CHAR: vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3409 case T_INT: 3410 // do nothing 3411 break; 3412 default: 3413 ShouldNotReachHere(); 3414 } 3415 } 3416 3417 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result, 3418 Register index, Register tmp2, Register tmp3, XMMRegister vnext, 3419 XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3, 3420 XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3, 3421 XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3, 3422 BasicType eltype) { 3423 ShortBranchVerifier sbv(this); 3424 assert(UseAVX >= 2, "AVX2 intrinsics are required"); 3425 assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3); 3426 assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3); 3427 3428 Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN, 3429 SHORT_UNROLLED_LOOP_EXIT, 3430 UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME, 3431 UNROLLED_VECTOR_LOOP_BEGIN, 3432 END; 3433 switch (eltype) { 3434 case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break; 3435 case T_CHAR: BLOCK_COMMENT("arrays_hashcode(char) {"); break; 3436 case T_BYTE: BLOCK_COMMENT("arrays_hashcode(byte) {"); break; 3437 case T_SHORT: BLOCK_COMMENT("arrays_hashcode(short) {"); break; 3438 case T_INT: BLOCK_COMMENT("arrays_hashcode(int) {"); break; 3439 default: BLOCK_COMMENT("arrays_hashcode {"); break; 3440 } 3441 3442 // For 
"renaming" for readibility of the code 3443 const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 }, 3444 vresult[] = { vresult0, vresult1, vresult2, vresult3 }, 3445 vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 }; 3446 3447 const int elsize = arrays_hashcode_elsize(eltype); 3448 3449 /* 3450 if (cnt1 >= 2) { 3451 if (cnt1 >= 32) { 3452 UNROLLED VECTOR LOOP 3453 } 3454 UNROLLED SCALAR LOOP 3455 } 3456 SINGLE SCALAR 3457 */ 3458 3459 cmpl(cnt1, 32); 3460 jcc(Assembler::less, SHORT_UNROLLED_BEGIN); 3461 3462 // cnt1 >= 32 && generate_vectorized_loop 3463 xorl(index, index); 3464 3465 // vresult = IntVector.zero(I256); 3466 for (int idx = 0; idx < 4; idx++) { 3467 vpxor(vresult[idx], vresult[idx]); 3468 } 3469 // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]); 3470 Register bound = tmp2; 3471 Register next = tmp3; 3472 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint)))); 3473 movl(next, Address(tmp2, 0)); 3474 movdl(vnext, next); 3475 vpbroadcastd(vnext, vnext, Assembler::AVX_256bit); 3476 3477 // index = 0; 3478 // bound = cnt1 & ~(32 - 1); 3479 movl(bound, cnt1); 3480 andl(bound, ~(32 - 1)); 3481 // for (; index < bound; index += 32) { 3482 bind(UNROLLED_VECTOR_LOOP_BEGIN); 3483 // result *= next; 3484 imull(result, next); 3485 // loop fission to upfront the cost of fetching from memory, OOO execution 3486 // can then hopefully do a better job of prefetching 3487 for (int idx = 0; idx < 4; idx++) { 3488 arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype); 3489 } 3490 // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7]; 3491 for (int idx = 0; idx < 4; idx++) { 3492 vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit); 3493 arrays_hashcode_elvcast(vtmp[idx], eltype); 3494 vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit); 3495 } 3496 // index += 32; 3497 addl(index, 32); 3498 // index < bound; 3499 cmpl(index, 
bound); 3500 jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN); 3501 // } 3502 3503 lea(ary1, Address(ary1, bound, Address::times(elsize))); 3504 subl(cnt1, bound); 3505 // release bound 3506 3507 // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1); 3508 for (int idx = 0; idx < 4; idx++) { 3509 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint)))); 3510 arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT); 3511 vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit); 3512 } 3513 // result += vresult.reduceLanes(ADD); 3514 for (int idx = 0; idx < 4; idx++) { 3515 reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]); 3516 } 3517 3518 // } else if (cnt1 < 32) { 3519 3520 bind(SHORT_UNROLLED_BEGIN); 3521 // int i = 1; 3522 movl(index, 1); 3523 cmpl(index, cnt1); 3524 jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT); 3525 3526 // for (; i < cnt1 ; i += 2) { 3527 bind(SHORT_UNROLLED_LOOP_BEGIN); 3528 movl(tmp3, 961); 3529 imull(result, tmp3); 3530 arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3531 movl(tmp3, tmp2); 3532 shll(tmp3, 5); 3533 subl(tmp3, tmp2); 3534 addl(result, tmp3); 3535 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype); 3536 addl(result, tmp3); 3537 addl(index, 2); 3538 cmpl(index, cnt1); 3539 jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN); 3540 3541 // } 3542 // if (i >= cnt1) { 3543 bind(SHORT_UNROLLED_LOOP_EXIT); 3544 jccb(Assembler::greater, END); 3545 movl(tmp2, result); 3546 shll(result, 5); 3547 subl(result, tmp2); 3548 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3549 addl(result, tmp3); 3550 // } 3551 bind(END); 3552 3553 BLOCK_COMMENT("} // arrays_hashcode"); 3554 3555 } // arrays_hashcode 3556 3557 // helper function for string_compare 3558 
// Helper for string_compare: load one element from each string at 'index',
// zero-extending into 32-bit registers. Element width is selected by the
// encoding 'ae' (StrIntrinsicNode): LL loads bytes from both strings, UU
// loads shorts from both; in the mixed case a byte is loaded from str1
// (via scale1) and a short from str2 (via scale2).
void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
                                           Address::ScaleFactor scale, Address::ScaleFactor scale1,
                                           Address::ScaleFactor scale2, Register index, int ae) {
  if (ae == StrIntrinsicNode::LL) {
    load_unsigned_byte(elem1, Address(str1, index, scale, 0));
    load_unsigned_byte(elem2, Address(str2, index, scale, 0));
  } else if (ae == StrIntrinsicNode::UU) {
    load_unsigned_short(elem1, Address(str1, index, scale, 0));
    load_unsigned_short(elem2, Address(str2, index, scale, 0));
  } else {
    load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
    load_unsigned_short(elem2, Address(str2, index, scale2, 0));
  }
}

// Compare strings, used for char[] and byte[].
// On exit, 'result' holds the difference of the first mismatching elements,
// or (after the pop at LENGTH_DIFF_LABEL) the length difference when one
// string is a prefix of the other; for the UL encoding the result is negated
// at DONE_LABEL because the operands were compared in swapped width order.
// When SSE4.2 paths are used, pcmpestri imposes fixed registers:
// result == rax, cnt2 == rdx, cnt1 == rcx (asserted below).
void C2_MacroAssembler::string_compare(Register str1, Register str2,
                                       Register cnt1, Register cnt2, Register result,
                                       XMMRegister vec1, int ae, KRegister mask) {
  ShortBranchVerifier sbv(this);
  Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
  Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only AVX3
  int stride, stride2, adr_stride, adr_stride1, adr_stride2;
  int stride2x2 = 0x40;
  Address::ScaleFactor scale = Address::no_scale;
  Address::ScaleFactor scale1 = Address::no_scale;
  Address::ScaleFactor scale2 = Address::no_scale;

  if (ae != StrIntrinsicNode::LL) {
    stride2x2 = 0x20;
  }

  if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
    shrl(cnt2, 1);
  }
  // Compute the minimum of the string lengths and the
  // difference of the string lengths (stack).
  // Do the conditional move stuff
  movl(result, cnt1);
  subl(cnt1, cnt2);
  push(cnt1);
  cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2)

  // Is the minimum length zero?
  testl(cnt2, cnt2);
  jcc(Assembler::zero, LENGTH_DIFF_LABEL);
  if (ae == StrIntrinsicNode::LL) {
    // Load first bytes
    load_unsigned_byte(result, Address(str1, 0)); // result = str1[0]
    load_unsigned_byte(cnt1, Address(str2, 0)); // cnt1 = str2[0]
  } else if (ae == StrIntrinsicNode::UU) {
    // Load first characters
    load_unsigned_short(result, Address(str1, 0));
    load_unsigned_short(cnt1, Address(str2, 0));
  } else {
    load_unsigned_byte(result, Address(str1, 0));
    load_unsigned_short(cnt1, Address(str2, 0));
  }
  subl(result, cnt1);
  jcc(Assembler::notZero, POP_LABEL);

  if (ae == StrIntrinsicNode::UU) {
    // Divide length by 2 to get number of chars
    shrl(cnt2, 1);
  }
  cmpl(cnt2, 1);
  jcc(Assembler::equal, LENGTH_DIFF_LABEL);

  // Check if the strings start at the same location and setup scale and stride
  if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
    cmpptr(str1, str2);
    jcc(Assembler::equal, LENGTH_DIFF_LABEL);
    if (ae == StrIntrinsicNode::LL) {
      scale = Address::times_1;
      stride = 16;
    } else {
      scale = Address::times_2;
      stride = 8;
    }
  } else {
    scale1 = Address::times_1;
    scale2 = Address::times_2;
    // scale not used
    stride = 8;
  }

  if (UseAVX >= 2 && UseSSE42Intrinsics) {
    Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
    Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
    Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
    Label COMPARE_TAIL_LONG;
    Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only AVX3

    // pcmpestri control byte: 11000b = "equal each" with negated result;
    // low bit cleared for LL selects unsigned-byte elements, set selects
    // unsigned shorts.
    int pcmpmask = 0x19;
    if (ae == StrIntrinsicNode::LL) {
      pcmpmask &= ~0x01;
    }

    // Setup to compare 16-chars (32-bytes) vectors,
    // start from first character again because it has aligned address.
    if (ae == StrIntrinsicNode::LL) {
      stride2 = 32;
    } else {
      stride2 = 16;
    }
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      adr_stride = stride << scale;
    } else {
      adr_stride1 = 8;  //stride << scale1;
      adr_stride2 = 16; //stride << scale2;
    }

    assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
    // rax and rdx are used by pcmpestri as elements counters
    movl(result, cnt2);
    andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count
    jcc(Assembler::zero, COMPARE_TAIL_LONG);

    // fast path : compare first 2 8-char vectors.
    bind(COMPARE_16_CHARS);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, 0));
    } else {
      pmovzxbw(vec1, Address(str1, 0));
    }
    pcmpestri(vec1, Address(str2, 0), pcmpmask);
    jccb(Assembler::below, COMPARE_INDEX_CHAR);

    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, adr_stride));
      pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
    } else {
      pmovzxbw(vec1, Address(str1, adr_stride1));
      pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
    }
    jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
    addl(cnt1, stride);

    // Compare the characters at index in cnt1
    bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
    load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
    subl(result, cnt2);
    jmp(POP_LABEL);

    // Setup the registers to start vector comparison loop
    bind(COMPARE_WIDE_VECTORS);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    subl(result, stride2);
    subl(cnt2, stride2);
    jcc(Assembler::zero, COMPARE_WIDE_TAIL);
    negptr(result);

    // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
    bind(COMPARE_WIDE_VECTORS_LOOP);

    if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
      cmpl(cnt2, stride2x2);
      jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
      testl(cnt2, stride2x2-1); // cnt2 holds the vector count
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40

      bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
      if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
        evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
        evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
      } else {
        vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
        evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
      }
      kortestql(mask, mask);
      jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare
      addptr(result, stride2x2);  // update since we already compared at this addr
      subl(cnt2, stride2x2);      // and sub the size too
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);

      vpxor(vec1, vec1);
      jmpb(COMPARE_WIDE_TAIL);
    }//if (VM_Version::supports_avx512vlbw())

    bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      vmovdqu(vec1, Address(str1, result, scale));
      vpxor(vec1, Address(str2, result, scale));
    } else {
      vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
      vpxor(vec1, Address(str2, result, scale2));
    }
    vptest(vec1, vec1);
    jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
    addptr(result, stride2);
    subl(cnt2, stride2);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);

    // compare wide vectors tail
    bind(COMPARE_WIDE_TAIL);
    testptr(result, result);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);

    movl(result, stride2);
    movl(cnt2, result);
    negptr(result);
    jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);

    // Identifies the mismatching (higher or lower) 16-bytes in the 32-byte vectors.
    bind(VECTOR_NOT_EQUAL);
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    jmp(COMPARE_16_CHARS);

    // Compare tail chars, length between 1 to 15 chars
    bind(COMPARE_TAIL_LONG);
    movl(cnt2, result);
    cmpl(cnt2, stride);
    jcc(Assembler::less, COMPARE_SMALL_STR);

    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, 0));
    } else {
      pmovzxbw(vec1, Address(str1, 0));
    }
    pcmpestri(vec1, Address(str2, 0), pcmpmask);
    jcc(Assembler::below, COMPARE_INDEX_CHAR);
    subptr(cnt2, stride);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    negptr(cnt2);
    jmpb(WHILE_HEAD_LABEL);

    bind(COMPARE_SMALL_STR);
  } else if (UseSSE42Intrinsics) {
    Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
    int pcmpmask = 0x19;
    // Setup to compare 8-char (16-byte) vectors,
    // start from first character again because it has aligned address.
    movl(result, cnt2);
    andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count
    if (ae == StrIntrinsicNode::LL) {
      pcmpmask &= ~0x01;
    }
    jcc(Assembler::zero, COMPARE_TAIL);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    negptr(result);

    // pcmpestri
    //   inputs:
    //     vec1- substring
    //     rax - negative string length (elements count)
    //     mem - scanned string
    //     rdx - string length (elements count)
    //     pcmpmask - cmp mode: 11000 (string compare with negated result)
    //       + 00 (unsigned bytes) or + 01 (unsigned shorts)
    //   outputs:
    //     rcx - first mismatched element index
    assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");

    bind(COMPARE_WIDE_VECTORS);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, result, scale));
      pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
    } else {
      pmovzxbw(vec1, Address(str1, result, scale1));
      pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
    }
    // After pcmpestri cnt1(rcx) contains mismatched element index

    jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
    addptr(result, stride);
    subptr(cnt2, stride);
    jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

    // compare wide vectors tail
    testptr(result, result);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);

    movl(cnt2, stride);
    movl(result, stride);
    negptr(result);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, result, scale));
      pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
    } else {
      pmovzxbw(vec1, Address(str1, result, scale1));
      pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
    }
    jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);

    // Mismatched characters in the vectors
    bind(VECTOR_NOT_EQUAL);
    addptr(cnt1, result);
    load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
    subl(result, cnt2);
    jmpb(POP_LABEL);

    bind(COMPARE_TAIL); // limit is zero
    movl(cnt2, result);
    // Fallthru to tail compare
  }
  // Shift str2 and str1 to the end of the arrays, negate min
  if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
    lea(str1, Address(str1, cnt2, scale));
    lea(str2, Address(str2, cnt2, scale));
  } else {
    lea(str1, Address(str1, cnt2, scale1));
    lea(str2, Address(str2, cnt2, scale2));
  }
  decrementl(cnt2);  // first character was compared already
  negptr(cnt2);

  // Compare the rest of the elements
  bind(WHILE_HEAD_LABEL);
  load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
  subl(result, cnt1);
  jccb(Assembler::notZero, POP_LABEL);
  increment(cnt2);
  jccb(Assembler::notZero, WHILE_HEAD_LABEL);

  // Strings are equal up to min length. Return the length difference.
  bind(LENGTH_DIFF_LABEL);
  pop(result);
  if (ae == StrIntrinsicNode::UU) {
    // Divide diff by 2 to get number of chars
    sarl(result, 1);
  }
  jmpb(DONE_LABEL);

  if (VM_Version::supports_avx512vlbw()) {

    bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);

    // The 64-byte AVX3 loop found a miscompare: invert the equality mask and
    // scan for the first zero bit to locate the mismatching element.
    kmovql(cnt1, mask);
    notq(cnt1);
    bsfq(cnt2, cnt1);
    if (ae != StrIntrinsicNode::LL) {
      // Divide diff by 2 to get number of chars
      sarl(cnt2, 1);
    }
    addq(result, cnt2);
    if (ae == StrIntrinsicNode::LL) {
      load_unsigned_byte(cnt1, Address(str2, result));
      load_unsigned_byte(result, Address(str1, result));
    } else if (ae == StrIntrinsicNode::UU) {
      load_unsigned_short(cnt1, Address(str2, result, scale));
      load_unsigned_short(result, Address(str1, result, scale));
    } else {
      load_unsigned_short(cnt1, Address(str2, result, scale2));
      load_unsigned_byte(result, Address(str1, result, scale1));
    }
    subl(result, cnt1);
    jmpb(POP_LABEL);
  }//if (VM_Version::supports_avx512vlbw())

  // Discard the stored length difference
  bind(POP_LABEL);
  pop(cnt1);

  // That's it
  bind(DONE_LABEL);
  if (ae == StrIntrinsicNode::UL) {
    negl(result);
  }

}

// Search for Non-ASCII character (Negative byte value) in a byte array,
// return the index of the first such character, otherwise the length
// of the array segment searched.
// ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
// @IntrinsicCandidate
// public static int countPositives(byte[] ba, int off, int len) {
//   for (int i = off; i < off + len; i++) {
//     if (ba[i] < 0) {
//       return i - off;
//     }
//   }
//   return len;
// }
void C2_MacroAssembler::count_positives(Register ary1, Register len,
  Register result, Register tmp1,
  XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
  // rsi: byte array
  // rcx: len
  // rax: result
  ShortBranchVerifier sbv(this);
  assert_different_registers(ary1, len, result, tmp1);
  assert_different_registers(vec1, vec2);
  // NOTE(review): label ADJUST appears unused below — declared but never bound.
  Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;

  movl(result, len); // copy
  // len == 0
  testl(len, len);
  jcc(Assembler::zero, DONE);

  if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
    VM_Version::supports_avx512vlbw() &&
    VM_Version::supports_bmi2()) {

    Label test_64_loop, test_tail, BREAK_LOOP;
    movl(tmp1, len);
    // vec2 = all zeroes; used as the comparand for signed byte > tests.
    vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);

    andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F
    andl(len, 0xffffffc0); // vector count (in chars)
    jccb(Assembler::zero, test_tail);

    lea(ary1, Address(ary1, len, Address::times_1));
    negptr(len);

    bind(test_64_loop);
    // Check whether our 64 elements of size byte contain negatives
    evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
    kortestql(mask1, mask1);
    jcc(Assembler::notZero, BREAK_LOOP);

    addptr(len, 64);
    jccb(Assembler::notZero, test_64_loop);

    bind(test_tail);
    // bail out when there is nothing to be done
    testl(tmp1, -1);
    jcc(Assembler::zero, DONE);


    // check the tail for absence of negatives
    // ~(~0 << len) applied up to two times (for 32-bit scenario)
    {
      Register tmp3_aliased = len;
      mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
      shlxq(tmp3_aliased, tmp3_aliased, tmp1);
      notq(tmp3_aliased);
      kmovql(mask2, tmp3_aliased);
    }

    evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
    ktestq(mask1, mask2);
    jcc(Assembler::zero, DONE);

    // do a full check for negative registers in the tail
    movl(len, tmp1); // tmp1 holds low 6-bit from original len;
                     // ary1 already pointing to the right place
    jmpb(TAIL_START);

    bind(BREAK_LOOP);
    // At least one byte in the last 64 byte block was negative.
    // Set up to look at the last 64 bytes as if they were a tail
    lea(ary1, Address(ary1, len, Address::times_1));
    addptr(result, len);
    // Ignore the very last byte: if all others are positive,
    // it must be negative, so we can skip right to the 2+1 byte
    // end comparison at this point
    orl(result, 63);
    movl(len, 63);
    // Fallthru to tail compare
  } else {

    if (UseAVX >= 2) {
      // With AVX2, use 32-byte vector compare
      Label COMPARE_WIDE_VECTORS, BREAK_LOOP;

      // Compare 32-byte vectors
      testl(len, 0xffffffe0);   // vector count (in bytes)
      jccb(Assembler::zero, TAIL_START);

      andl(len, 0xffffffe0);
      lea(ary1, Address(ary1, len, Address::times_1));
      negptr(len);

      movl(tmp1, 0x80808080);   // create mask to test for Unicode chars in vector
      movdl(vec2, tmp1);
      vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);

      bind(COMPARE_WIDE_VECTORS);
      vmovdqu(vec1, Address(ary1, len, Address::times_1));
      vptest(vec1, vec2);
      jccb(Assembler::notZero, BREAK_LOOP);
      addptr(len, 32);
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

      testl(result, 0x0000001f);   // any bytes remaining?
      jcc(Assembler::zero, DONE);

      // Quick test using the already prepared vector mask
      movl(len, result);
      andl(len, 0x0000001f);
      vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
      vptest(vec1, vec2);
      jcc(Assembler::zero, DONE);
      // There are zeros, jump to the tail to determine exactly where
      jmpb(TAIL_START);

      bind(BREAK_LOOP);
      // At least one byte in the last 32-byte vector is negative.
      // Set up to look at the last 32 bytes as if they were a tail
      lea(ary1, Address(ary1, len, Address::times_1));
      addptr(result, len);
      // Ignore the very last byte: if all others are positive,
      // it must be negative, so we can skip right to the 2+1 byte
      // end comparison at this point
      orl(result, 31);
      movl(len, 31);
      // Fallthru to tail compare
    } else if (UseSSE42Intrinsics) {
      // With SSE4.2, use double quad vector compare
      Label COMPARE_WIDE_VECTORS, BREAK_LOOP;

      // Compare 16-byte vectors
      testl(len, 0xfffffff0);   // vector count (in bytes)
      jcc(Assembler::zero, TAIL_START);

      andl(len, 0xfffffff0);
      lea(ary1, Address(ary1, len, Address::times_1));
      negptr(len);

      movl(tmp1, 0x80808080);
      movdl(vec2, tmp1);
      pshufd(vec2, vec2, 0);

      bind(COMPARE_WIDE_VECTORS);
      movdqu(vec1, Address(ary1, len, Address::times_1));
      ptest(vec1, vec2);
      jccb(Assembler::notZero, BREAK_LOOP);
      addptr(len, 16);
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

      testl(result, 0x0000000f); // len is zero, any bytes remaining?
      jcc(Assembler::zero, DONE);

      // Quick test using the already prepared vector mask
      movl(len, result);
      andl(len, 0x0000000f);   // tail count (in bytes)
      movdqu(vec1, Address(ary1, len, Address::times_1, -16));
      ptest(vec1, vec2);
      jcc(Assembler::zero, DONE);
      jmpb(TAIL_START);

      bind(BREAK_LOOP);
      // At least one byte in the last 16-byte vector is negative.
      // Set up and look at the last 16 bytes as if they were a tail
      lea(ary1, Address(ary1, len, Address::times_1));
      addptr(result, len);
      // Ignore the very last byte: if all others are positive,
      // it must be negative, so we can skip right to the 2+1 byte
      // end comparison at this point
      orl(result, 15);
      movl(len, 15);
      // Fallthru to tail compare
    }
  }

  bind(TAIL_START);
  // Compare 4-byte vectors
  andl(len, 0xfffffffc); // vector count (in bytes)
  jccb(Assembler::zero, COMPARE_CHAR);

  lea(ary1, Address(ary1, len, Address::times_1));
  negptr(len);

  bind(COMPARE_VECTORS);
  movl(tmp1, Address(ary1, len, Address::times_1));
  andl(tmp1, 0x80808080);
  jccb(Assembler::notZero, TAIL_ADJUST);
  addptr(len, 4);
  jccb(Assembler::notZero, COMPARE_VECTORS);

  // Compare trailing char (final 2-3 bytes), if any
  bind(COMPARE_CHAR);

  testl(result, 0x2);   // tail char
  jccb(Assembler::zero, COMPARE_BYTE);
  load_unsigned_short(tmp1, Address(ary1, 0));
  andl(tmp1, 0x00008080);
  jccb(Assembler::notZero, CHAR_ADJUST);
  lea(ary1, Address(ary1, 2));

  bind(COMPARE_BYTE);
  testl(result, 0x1);   // tail byte
  jccb(Assembler::zero, DONE);
  load_unsigned_byte(tmp1, Address(ary1, 0));
  testl(tmp1, 0x00000080);
  jccb(Assembler::zero, DONE);
  subptr(result, 1);
  jmpb(DONE);

  bind(TAIL_ADJUST);
  // there are negative bits in the last 4 byte block.
  // Adjust result and check the next three bytes
  addptr(result, len);
  orl(result, 3);
  lea(ary1, Address(ary1, len, Address::times_1));
  jmpb(COMPARE_CHAR);

  bind(CHAR_ADJUST);
  // We are looking at a char + optional byte tail, and found that one
  // of the bytes in the char is negative. Adjust the result, check the
  // first byte and readjust if needed.
  andl(result, 0xfffffffc);
  testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
  jccb(Assembler::notZero, DONE);
  addptr(result, 1);

  // That's it
  bind(DONE);
  if (UseAVX >= 2) {
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    vpxor(vec2, vec2);
  }
}

// Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
// is_array_equ selects whole-array mode (reference/null/length checks and
// header skip via base_offset) vs. substring mode (operands already point at
// the data). expand_ary2 compares a char array (ary1) against a byte array
// (ary2) by zero-extending ary2's bytes to shorts (vpmovzxbw / movzbl+cmpw).
// On exit result is 1 if equal, 0 otherwise.
void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
                                      Register limit, Register result, Register chr,
                                      XMMRegister vec1, XMMRegister vec2, bool is_char,
                                      KRegister mask, bool expand_ary2) {
  // for expand_ary2, limit is the (smaller) size of the second array.
  ShortBranchVerifier sbv(this);
  Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;

  assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)),
         "Expansion only implemented for AVX2");

  int length_offset = arrayOopDesc::length_offset_in_bytes();
  int base_offset = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);

  Address::ScaleFactor scaleFactor = expand_ary2 ? Address::times_2 : Address::times_1;
  int scaleIncr = expand_ary2 ? 8 : 16;

  if (is_array_equ) {
    // Check the input args
    cmpoop(ary1, ary2);
    jcc(Assembler::equal, TRUE_LABEL);

    // Need additional checks for arrays_equals.
    testptr(ary1, ary1);
    jcc(Assembler::zero, FALSE_LABEL);
    testptr(ary2, ary2);
    jcc(Assembler::zero, FALSE_LABEL);

    // Check the lengths
    movl(limit, Address(ary1, length_offset));
    cmpl(limit, Address(ary2, length_offset));
    jcc(Assembler::notEqual, FALSE_LABEL);
  }

  // count == 0
  testl(limit, limit);
  jcc(Assembler::zero, TRUE_LABEL);

  if (is_array_equ) {
    // Load array address
    lea(ary1, Address(ary1, base_offset));
    lea(ary2, Address(ary2, base_offset));
  }

  if (is_array_equ && is_char) {
    // arrays_equals when used for char[].
    shll(limit, 1);      // byte count != 0
  }
  movl(result, limit); // copy

  if (UseAVX >= 2) {
    // With AVX2, use 32-byte vector compare
    Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16;

    // Compare 32-byte vectors
    if (expand_ary2) {
      andl(result, 0x0000000f);  //   tail count (in bytes)
      andl(limit, 0xfffffff0);   // vector count (in bytes)
      jcc(Assembler::zero, COMPARE_TAIL);
    } else {
      andl(result, 0x0000001f);  //   tail count (in bytes)
      andl(limit, 0xffffffe0);   // vector count (in bytes)
      jcc(Assembler::zero, COMPARE_TAIL_16);
    }

    lea(ary1, Address(ary1, limit, scaleFactor));
    lea(ary2, Address(ary2, limit, Address::times_1));
    negptr(limit);

    if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
      Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;

      cmpl(limit, -64);
      jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);

      bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop

      evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
      evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
      kortestql(mask, mask);
      jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
      addptr(limit, 64);  // update since we already compared at this addr
      cmpl(limit, -64);
      jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);

      // At this point we may still need to compare -limit+result bytes.
      // We could execute the next two instruction and just continue via non-wide path:
      //  cmpl(limit, 0);
      //  jcc(Assembler::equal, COMPARE_TAIL);  // true
      // But since we stopped at the points ary{1,2}+limit which are
      // not farther than 64 bytes from the ends of arrays ary{1,2}+result
      // (|limit| <= 32 and result < 32),
      // we may just compare the last 64 bytes.
      //
      addptr(result, -64);   // it is safe, bc we just came from this area
      evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
      evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
      kortestql(mask, mask);
      jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare

      jmp(TRUE_LABEL);

      bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);

    }//if (VM_Version::supports_avx512vlbw())

    bind(COMPARE_WIDE_VECTORS);
    vmovdqu(vec1, Address(ary1, limit, scaleFactor));
    if (expand_ary2) {
      vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit);
    } else {
      vmovdqu(vec2, Address(ary2, limit, Address::times_1));
    }
    vpxor(vec1, vec2);

    vptest(vec1, vec1);
    jcc(Assembler::notZero, FALSE_LABEL);
    addptr(limit, scaleIncr * 2);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);

    testl(result, result);
    jcc(Assembler::zero, TRUE_LABEL);

    // Compare the (possibly overlapping) last full vector ending at the tail.
    vmovdqu(vec1, Address(ary1, result, scaleFactor, -32));
    if (expand_ary2) {
      vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit);
    } else {
      vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
    }
    vpxor(vec1, vec2);

    vptest(vec1, vec1);
    jcc(Assembler::notZero, FALSE_LABEL);
    jmp(TRUE_LABEL);

    bind(COMPARE_TAIL_16); // limit is zero
    movl(limit, result);

    // Compare 16-byte chunks
    andl(result, 0x0000000f);  //   tail count (in bytes)
    andl(limit, 0xfffffff0);   // vector count (in bytes)
    jcc(Assembler::zero, COMPARE_TAIL);

    lea(ary1, Address(ary1, limit, scaleFactor));
    lea(ary2, Address(ary2, limit, Address::times_1));
    negptr(limit);

    bind(COMPARE_WIDE_VECTORS_16);
    movdqu(vec1, Address(ary1, limit, scaleFactor));
    if (expand_ary2) {
      vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit);
    } else {
      movdqu(vec2, Address(ary2, limit, Address::times_1));
    }
    pxor(vec1, vec2);

    ptest(vec1, vec1);
    jcc(Assembler::notZero, FALSE_LABEL);
    addptr(limit, scaleIncr);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16);

    bind(COMPARE_TAIL); // limit is zero
    movl(limit, result);
    // Fallthru to tail compare
  } else if (UseSSE42Intrinsics) {
    // With SSE4.2, use double quad vector compare
    Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;

    // Compare 16-byte vectors
    andl(result, 0x0000000f);  //   tail count (in bytes)
    andl(limit, 0xfffffff0);   // vector count (in bytes)
    jcc(Assembler::zero, COMPARE_TAIL);

    lea(ary1, Address(ary1, limit, Address::times_1));
    lea(ary2, Address(ary2, limit, Address::times_1));
    negptr(limit);

    bind(COMPARE_WIDE_VECTORS);
    movdqu(vec1, Address(ary1, limit, Address::times_1));
    movdqu(vec2, Address(ary2, limit, Address::times_1));
    pxor(vec1, vec2);

    ptest(vec1, vec1);
    jcc(Assembler::notZero, FALSE_LABEL);
    addptr(limit, 16);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);

    testl(result, result);
    jcc(Assembler::zero, TRUE_LABEL);

    movdqu(vec1, Address(ary1, result, Address::times_1, -16));
    movdqu(vec2, Address(ary2, result, Address::times_1, -16));
    pxor(vec1, vec2);

    ptest(vec1, vec1);
    jccb(Assembler::notZero, FALSE_LABEL);
    jmpb(TRUE_LABEL);

    bind(COMPARE_TAIL); // limit is zero
    movl(limit, result);
    // Fallthru to tail compare
  }

  // Compare 4-byte vectors
  if (expand_ary2) {
    testl(result, result);
    jccb(Assembler::zero, TRUE_LABEL);
  } else {
    andl(limit, 0xfffffffc); // vector count (in bytes)
    jccb(Assembler::zero, COMPARE_CHAR);
  }

  lea(ary1, Address(ary1, limit, scaleFactor));
  lea(ary2, Address(ary2, limit, Address::times_1));
  negptr(limit);

  bind(COMPARE_VECTORS);
  if (expand_ary2) {
    // There are no "vector" operations for bytes to shorts
    movzbl(chr, Address(ary2, limit, Address::times_1));
    cmpw(Address(ary1, limit, Address::times_2), chr);
    jccb(Assembler::notEqual, FALSE_LABEL);
    addptr(limit, 1);
    jcc(Assembler::notZero, COMPARE_VECTORS);
    jmp(TRUE_LABEL);
  } else {
    movl(chr, Address(ary1, limit, Address::times_1));
    cmpl(chr, Address(ary2, limit, Address::times_1));
    jccb(Assembler::notEqual, FALSE_LABEL);
    addptr(limit, 4);
    jcc(Assembler::notZero, COMPARE_VECTORS);
  }

  // Compare trailing char (final 2 bytes), if any
  bind(COMPARE_CHAR);
  testl(result, 0x2);   // tail char
  jccb(Assembler::zero, COMPARE_BYTE);
  load_unsigned_short(chr, Address(ary1, 0));
  load_unsigned_short(limit, Address(ary2, 0));
  cmpl(chr, limit);
  jccb(Assembler::notEqual, FALSE_LABEL);

  if (is_array_equ && is_char) {
    bind(COMPARE_BYTE);
  } else {
    lea(ary1, Address(ary1, 2));
    lea(ary2, Address(ary2, 2));

    bind(COMPARE_BYTE);
    testl(result, 0x1);   // tail byte
    jccb(Assembler::zero, TRUE_LABEL);
    load_unsigned_byte(chr, Address(ary1, 0));
    load_unsigned_byte(limit, Address(ary2, 0));
    cmpl(chr, limit);
    jccb(Assembler::notEqual, FALSE_LABEL);
  }
  bind(TRUE_LABEL);
  movl(result, 1);   // return true
  jmpb(DONE);

  bind(FALSE_LABEL);
  xorl(result, result); // return false

  // That's it
  bind(DONE);
  if (UseAVX >= 2) {
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    vpxor(vec2, vec2);
  }
}

// Out-of-line slow path for convertF2I: spills the source XMM register to the
// stack, calls the appropriate fixup stub (f2i/f2l/d2i/d2l), and pops the
// corrected result into dst before resuming at the stub's continuation.
static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) {
#define __ masm.
  Register dst = stub.data<0>();
  XMMRegister src = stub.data<1>();
  address target = stub.data<2>();
  __ bind(stub.entry());
  __ subptr(rsp, 8);
  __ movdbl(Address(rsp), src);
  __ call(RuntimeAddress(target));
  // APX REX2 encoding for pop(dst) increases the stub size by 1 byte.
  __ pop(dst);
  __ jmp(stub.continuation());
#undef __
}

// Convert float/double in 'src' to int/long in 'dst' with Java semantics.
// The cvtt* instructions produce the "integer indefinite" value
// (0x80000000 / sign-flip pattern) for NaN or out-of-range input; when the
// result compares equal to that sentinel, branch to the out-of-line fixup
// stub (convertF2I_slowpath) which computes the correct Java result.
void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) {
  assert(dst_bt == T_INT || dst_bt == T_LONG, "");
  assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, "");

  address slowpath_target;
  if (dst_bt == T_INT) {
    if (src_bt == T_FLOAT) {
      cvttss2sil(dst, src);
      cmpl(dst, 0x80000000);
      slowpath_target = StubRoutines::x86::f2i_fixup();
    } else {
      cvttsd2sil(dst, src);
      cmpl(dst, 0x80000000);
      slowpath_target = StubRoutines::x86::d2i_fixup();
    }
  } else {
    if (src_bt == T_FLOAT) {
      cvttss2siq(dst, src);
      cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
      slowpath_target = StubRoutines::x86::f2l_fixup();
    } else {
      cvttsd2siq(dst, src);
      cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
      slowpath_target = StubRoutines::x86::d2l_fixup();
    }
  }

  // Using the APX extended general purpose registers increases the instruction encoding size by 1 byte.
  int max_size = 23 + (UseAPX ? 1 : 0);
  auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, max_size, convertF2I_slowpath);
  jcc(Assembler::equal, stub->entry());
  bind(stub->continuation());
}

// Dispatch a masked vector shift/rotate with an immediate count to the
// matching AVX-512 instruction, based on the ideal (C2) opcode.
void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
                                    XMMRegister src1, int imm8, bool merge, int vlen_enc) {
  switch(ideal_opc) {
    case Op_LShiftVS:
      Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_LShiftVI:
      Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_LShiftVL:
      Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_RShiftVS:
      Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_RShiftVI:
      Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_RShiftVL:
      Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_URShiftVS:
      Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_URShiftVI:
      Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_URShiftVL:
      Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_RotateRightV:
      evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_RotateLeftV:
      evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
    default:
      fatal("Unsupported operation %s", NodeClassNames[ideal_opc]);
      break;
  }
}

// Masked saturating add/sub (register-register form): route to the signed or
// unsigned helper based on 'is_unsigned'.
void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
                                               XMMRegister src2, bool is_unsigned, bool merge, int vlen_enc) {
  if (is_unsigned) {
    evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
  } else {
    evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
  }
}

// Masked signed saturating add/sub for byte/short lanes (evpadds*/evpsubs*).
void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
                                                      XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
  switch (elem_bt) {
    case T_BYTE:
      if (ideal_opc == Op_SaturatingAddV) {
        evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
      }
      break;
    case T_SHORT:
      if (ideal_opc == Op_SaturatingAddV) {
        evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
      }
      break;
    default:
      fatal("Unsupported type %s", type2name(elem_bt));
      break;
  }
}

// Masked unsigned saturating add/sub for byte/short lanes (evpaddus*/evpsubus*).
void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
                                                        XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
  switch (elem_bt) {
    case T_BYTE:
      if (ideal_opc == Op_SaturatingAddV) {
        evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
      }
      break;
    case T_SHORT:
      if (ideal_opc == Op_SaturatingAddV) {
        evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
      }
      break;
    default:
      fatal("Unsupported type %s", type2name(elem_bt));
      break;
  }
}

// Masked saturating add/sub (register-memory form): route to the signed or
// unsigned helper based on 'is_unsigned'.
void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
                                               Address src2, bool is_unsigned, bool merge, int vlen_enc) {
  if (is_unsigned) {
    evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2,
merge, vlen_enc); 4609 } else { 4610 evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4611 } 4612 } 4613 4614 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4615 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4616 switch (elem_bt) { 4617 case T_BYTE: 4618 if (ideal_opc == Op_SaturatingAddV) { 4619 evpaddsb(dst, mask, src1, src2, merge, vlen_enc); 4620 } else { 4621 assert(ideal_opc == Op_SaturatingSubV, ""); 4622 evpsubsb(dst, mask, src1, src2, merge, vlen_enc); 4623 } 4624 break; 4625 case T_SHORT: 4626 if (ideal_opc == Op_SaturatingAddV) { 4627 evpaddsw(dst, mask, src1, src2, merge, vlen_enc); 4628 } else { 4629 assert(ideal_opc == Op_SaturatingSubV, ""); 4630 evpsubsw(dst, mask, src1, src2, merge, vlen_enc); 4631 } 4632 break; 4633 default: 4634 fatal("Unsupported type %s", type2name(elem_bt)); 4635 break; 4636 } 4637 } 4638 4639 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4640 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4641 switch (elem_bt) { 4642 case T_BYTE: 4643 if (ideal_opc == Op_SaturatingAddV) { 4644 evpaddusb(dst, mask, src1, src2, merge, vlen_enc); 4645 } else { 4646 assert(ideal_opc == Op_SaturatingSubV, ""); 4647 evpsubusb(dst, mask, src1, src2, merge, vlen_enc); 4648 } 4649 break; 4650 case T_SHORT: 4651 if (ideal_opc == Op_SaturatingAddV) { 4652 evpaddusw(dst, mask, src1, src2, merge, vlen_enc); 4653 } else { 4654 assert(ideal_opc == Op_SaturatingSubV, ""); 4655 evpsubusw(dst, mask, src1, src2, merge, vlen_enc); 4656 } 4657 break; 4658 default: 4659 fatal("Unsupported type %s", type2name(elem_bt)); 4660 break; 4661 } 4662 } 4663 4664 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4665 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc, 4666 bool is_varshift) { 4667 
switch (ideal_opc) { 4668 case Op_AddVB: 4669 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4670 case Op_AddVS: 4671 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4672 case Op_AddVI: 4673 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4674 case Op_AddVL: 4675 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4676 case Op_AddVF: 4677 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4678 case Op_AddVD: 4679 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4680 case Op_SubVB: 4681 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4682 case Op_SubVS: 4683 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4684 case Op_SubVI: 4685 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4686 case Op_SubVL: 4687 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4688 case Op_SubVF: 4689 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4690 case Op_SubVD: 4691 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4692 case Op_MulVS: 4693 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4694 case Op_MulVI: 4695 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4696 case Op_MulVL: 4697 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4698 case Op_MulVF: 4699 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4700 case Op_MulVD: 4701 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4702 case Op_DivVF: 4703 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4704 case Op_DivVD: 4705 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4706 case Op_SqrtVF: 4707 evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break; 4708 case Op_SqrtVD: 4709 evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break; 4710 case Op_AbsVB: 4711 evpabsb(dst, mask, src2, merge, vlen_enc); break; 4712 case Op_AbsVS: 4713 evpabsw(dst, mask, src2, merge, vlen_enc); break; 4714 case Op_AbsVI: 4715 evpabsd(dst, mask, src2, merge, vlen_enc); break; 4716 case Op_AbsVL: 4717 evpabsq(dst, mask, 
src2, merge, vlen_enc); break; 4718 case Op_FmaVF: 4719 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4720 case Op_FmaVD: 4721 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4722 case Op_VectorRearrange: 4723 evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break; 4724 case Op_LShiftVS: 4725 evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4726 case Op_LShiftVI: 4727 evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4728 case Op_LShiftVL: 4729 evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4730 case Op_RShiftVS: 4731 evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4732 case Op_RShiftVI: 4733 evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4734 case Op_RShiftVL: 4735 evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4736 case Op_URShiftVS: 4737 evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4738 case Op_URShiftVI: 4739 evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4740 case Op_URShiftVL: 4741 evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4742 case Op_RotateLeftV: 4743 evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4744 case Op_RotateRightV: 4745 evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4746 case Op_MaxV: 4747 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4748 case Op_MinV: 4749 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4750 case Op_UMinV: 4751 evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4752 case Op_UMaxV: 4753 evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4754 case Op_XorV: 4755 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4756 case Op_OrV: 4757 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4758 case Op_AndV: 4759 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4760 default: 4761 
fatal("Unsupported operation %s", NodeClassNames[ideal_opc]); 4762 break; 4763 } 4764 } 4765 4766 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4767 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4768 switch (ideal_opc) { 4769 case Op_AddVB: 4770 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4771 case Op_AddVS: 4772 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4773 case Op_AddVI: 4774 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4775 case Op_AddVL: 4776 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4777 case Op_AddVF: 4778 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4779 case Op_AddVD: 4780 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4781 case Op_SubVB: 4782 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4783 case Op_SubVS: 4784 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4785 case Op_SubVI: 4786 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4787 case Op_SubVL: 4788 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4789 case Op_SubVF: 4790 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4791 case Op_SubVD: 4792 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4793 case Op_MulVS: 4794 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4795 case Op_MulVI: 4796 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4797 case Op_MulVL: 4798 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4799 case Op_MulVF: 4800 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4801 case Op_MulVD: 4802 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4803 case Op_DivVF: 4804 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4805 case Op_DivVD: 4806 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4807 case Op_FmaVF: 4808 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4809 case Op_FmaVD: 4810 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); 
break; 4811 case Op_MaxV: 4812 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4813 case Op_MinV: 4814 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4815 case Op_UMaxV: 4816 evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4817 case Op_UMinV: 4818 evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4819 case Op_XorV: 4820 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4821 case Op_OrV: 4822 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4823 case Op_AndV: 4824 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4825 default: 4826 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]); 4827 break; 4828 } 4829 } 4830 4831 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst, 4832 KRegister src1, KRegister src2) { 4833 BasicType etype = T_ILLEGAL; 4834 switch(mask_len) { 4835 case 2: 4836 case 4: 4837 case 8: etype = T_BYTE; break; 4838 case 16: etype = T_SHORT; break; 4839 case 32: etype = T_INT; break; 4840 case 64: etype = T_LONG; break; 4841 default: fatal("Unsupported type"); break; 4842 } 4843 assert(etype != T_ILLEGAL, ""); 4844 switch(ideal_opc) { 4845 case Op_AndVMask: 4846 kand(etype, dst, src1, src2); break; 4847 case Op_OrVMask: 4848 kor(etype, dst, src1, src2); break; 4849 case Op_XorVMask: 4850 kxor(etype, dst, src1, src2); break; 4851 default: 4852 fatal("Unsupported masked operation"); break; 4853 } 4854 } 4855 4856 /* 4857 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 4858 * If src is NaN, the result is 0. 4859 * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE, 4860 * the result is equal to the value of Integer.MIN_VALUE. 4861 * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE, 4862 * the result is equal to the value of Integer.MAX_VALUE. 
4863 */ 4864 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4865 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 4866 Register rscratch, AddressLiteral float_sign_flip, 4867 int vec_enc) { 4868 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4869 Label done; 4870 vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch); 4871 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 4872 vptest(xtmp2, xtmp2, vec_enc); 4873 jccb(Assembler::equal, done); 4874 4875 vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc); 4876 vpxor(xtmp1, xtmp1, xtmp4, vec_enc); 4877 4878 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 4879 vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc); 4880 vblendvps(dst, dst, xtmp4, xtmp3, vec_enc); 4881 4882 // Recompute the mask for remaining special value. 4883 vpxor(xtmp2, xtmp2, xtmp3, vec_enc); 4884 // Extract SRC values corresponding to TRUE mask lanes. 4885 vpand(xtmp4, xtmp2, src, vec_enc); 4886 // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special 4887 // values are set. 
4888 vpxor(xtmp3, xtmp2, xtmp4, vec_enc); 4889 4890 vblendvps(dst, dst, xtmp1, xtmp3, vec_enc); 4891 bind(done); 4892 } 4893 4894 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4895 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4896 Register rscratch, AddressLiteral float_sign_flip, 4897 int vec_enc) { 4898 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4899 Label done; 4900 evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 4901 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 4902 kortestwl(ktmp1, ktmp1); 4903 jccb(Assembler::equal, done); 4904 4905 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4906 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4907 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 4908 4909 kxorwl(ktmp1, ktmp1, ktmp2); 4910 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4911 vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4912 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 4913 bind(done); 4914 } 4915 4916 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4917 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4918 Register rscratch, AddressLiteral double_sign_flip, 4919 int vec_enc) { 4920 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 4921 4922 Label done; 4923 evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 4924 Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc); 4925 kortestwl(ktmp1, ktmp1); 4926 jccb(Assembler::equal, done); 4927 4928 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4929 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4930 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 4931 4932 kxorwl(ktmp1, ktmp1, ktmp2); 4933 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4934 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4935 evmovdquq(dst, ktmp1, xtmp2, true, 
vec_enc); 4936 bind(done); 4937 } 4938 4939 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4940 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4941 Register rscratch, AddressLiteral float_sign_flip, 4942 int vec_enc) { 4943 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4944 Label done; 4945 evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 4946 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 4947 kortestwl(ktmp1, ktmp1); 4948 jccb(Assembler::equal, done); 4949 4950 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4951 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4952 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 4953 4954 kxorwl(ktmp1, ktmp1, ktmp2); 4955 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4956 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4957 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 4958 bind(done); 4959 } 4960 4961 /* 4962 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 4963 * If src is NaN, the result is 0. 4964 * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE, 4965 * the result is equal to the value of Long.MIN_VALUE. 4966 * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE, 4967 * the result is equal to the value of Long.MAX_VALUE. 
4968 */ 4969 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4970 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4971 Register rscratch, AddressLiteral double_sign_flip, 4972 int vec_enc) { 4973 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 4974 4975 Label done; 4976 evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 4977 evpcmpeqq(ktmp1, xtmp1, dst, vec_enc); 4978 kortestwl(ktmp1, ktmp1); 4979 jccb(Assembler::equal, done); 4980 4981 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4982 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4983 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 4984 4985 kxorwl(ktmp1, ktmp1, ktmp2); 4986 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4987 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4988 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); 4989 bind(done); 4990 } 4991 4992 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero, 4993 XMMRegister xtmp, int index, int vec_enc) { 4994 assert(vec_enc < Assembler::AVX_512bit, ""); 4995 if (vec_enc == Assembler::AVX_256bit) { 4996 vextractf128_high(xtmp, src); 4997 vshufps(dst, src, xtmp, index, vec_enc); 4998 } else { 4999 vshufps(dst, src, zero, index, vec_enc); 5000 } 5001 } 5002 5003 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 5004 XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch, 5005 AddressLiteral float_sign_flip, int src_vec_enc) { 5006 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 5007 5008 Label done; 5009 // Compare the destination lanes with float_sign_flip 5010 // value to get mask for all special values. 
5011 movdqu(xtmp1, float_sign_flip, rscratch); 5012 vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit); 5013 ptest(xtmp2, xtmp2); 5014 jccb(Assembler::equal, done); 5015 5016 // Flip float_sign_flip to get max integer value. 5017 vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit); 5018 pxor(xtmp1, xtmp4); 5019 5020 // Set detination lanes corresponding to unordered source lanes as zero. 5021 vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc); 5022 vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc); 5023 5024 // Shuffle mask vector and pack lower doubles word from each quadword lane. 5025 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc); 5026 vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit); 5027 5028 // Recompute the mask for remaining special value. 5029 pxor(xtmp2, xtmp3); 5030 // Extract mask corresponding to non-negative source lanes. 5031 vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc); 5032 5033 // Shuffle mask vector and pack lower doubles word from each quadword lane. 5034 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc); 5035 pand(xtmp3, xtmp2); 5036 5037 // Replace destination lanes holding special value(0x80000000) with max int 5038 // if corresponding source lane holds a +ve value. 
5039 vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit); 5040 bind(done); 5041 } 5042 5043 5044 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero, 5045 XMMRegister xtmp, Register rscratch, int vec_enc) { 5046 switch(to_elem_bt) { 5047 case T_SHORT: 5048 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing"); 5049 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch); 5050 vpackusdw(dst, dst, zero, vec_enc); 5051 if (vec_enc == Assembler::AVX_256bit) { 5052 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc); 5053 } 5054 break; 5055 case T_BYTE: 5056 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing"); 5057 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch); 5058 vpackusdw(dst, dst, zero, vec_enc); 5059 if (vec_enc == Assembler::AVX_256bit) { 5060 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc); 5061 } 5062 vpackuswb(dst, dst, zero, vec_enc); 5063 break; 5064 default: assert(false, "Unexpected basic type for target of vector cast int to subword: %s", type2name(to_elem_bt)); 5065 } 5066 } 5067 5068 /* 5069 * Algorithm for vector D2L and F2I conversions (AVX 10.2 unsupported):- 5070 * a) Perform vector D2L/F2I cast. 5071 * b) Choose fast path if none of the result vector lane contains 0x80000000 value. 5072 * It signifies that source value could be any of the special floating point 5073 * values(NaN,-Inf,Inf,Max,-Min). 5074 * c) Set destination to zero if source is NaN value. 5075 * d) Replace 0x80000000 with MaxInt if source lane contains a +ve value. 
5076 */ 5077 5078 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5079 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 5080 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 5081 int to_elem_sz = type2aelembytes(to_elem_bt); 5082 assert(to_elem_sz <= 4, ""); 5083 vcvttps2dq(dst, src, vec_enc); 5084 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc); 5085 if (to_elem_sz < 4) { 5086 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 5087 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc); 5088 } 5089 } 5090 5091 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5092 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip, 5093 Register rscratch, int vec_enc) { 5094 int to_elem_sz = type2aelembytes(to_elem_bt); 5095 assert(to_elem_sz <= 4, ""); 5096 vcvttps2dq(dst, src, vec_enc); 5097 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc); 5098 switch(to_elem_bt) { 5099 case T_INT: 5100 break; 5101 case T_SHORT: 5102 evpmovdw(dst, dst, vec_enc); 5103 break; 5104 case T_BYTE: 5105 evpmovdb(dst, dst, vec_enc); 5106 break; 5107 default: assert(false, "Unexpected basic type for target of vector castF2X EVEX: %s", type2name(to_elem_bt)); 5108 } 5109 } 5110 5111 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 5112 KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip, 5113 Register rscratch, int vec_enc) { 5114 evcvttps2qq(dst, src, vec_enc); 5115 vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc); 5116 } 5117 5118 // Handling for downcasting from double to integer or sub-word types on AVX2. 
5119 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5120 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, 5121 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 5122 int to_elem_sz = type2aelembytes(to_elem_bt); 5123 assert(to_elem_sz < 8, ""); 5124 vcvttpd2dq(dst, src, vec_enc); 5125 vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch, 5126 float_sign_flip, vec_enc); 5127 if (to_elem_sz < 4) { 5128 // xtmp4 holds all zero lanes. 5129 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit); 5130 } 5131 } 5132 5133 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, 5134 XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, 5135 KRegister ktmp2, AddressLiteral sign_flip, 5136 Register rscratch, int vec_enc) { 5137 if (VM_Version::supports_avx512dq()) { 5138 evcvttpd2qq(dst, src, vec_enc); 5139 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 5140 switch(to_elem_bt) { 5141 case T_LONG: 5142 break; 5143 case T_INT: 5144 evpmovsqd(dst, dst, vec_enc); 5145 break; 5146 case T_SHORT: 5147 evpmovsqd(dst, dst, vec_enc); 5148 evpmovdw(dst, dst, vec_enc); 5149 break; 5150 case T_BYTE: 5151 evpmovsqd(dst, dst, vec_enc); 5152 evpmovdb(dst, dst, vec_enc); 5153 break; 5154 default: assert(false, "Unexpected basic type for target of vector castD2X AVX512DQ EVEX: %s", type2name(to_elem_bt)); 5155 } 5156 } else { 5157 assert(type2aelembytes(to_elem_bt) <= 4, ""); 5158 vcvttpd2dq(dst, src, vec_enc); 5159 vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 5160 switch(to_elem_bt) { 5161 case T_INT: 5162 break; 5163 case T_SHORT: 5164 evpmovdw(dst, dst, vec_enc); 5165 break; 5166 case T_BYTE: 5167 evpmovdb(dst, dst, 
vec_enc); 5168 break; 5169 default: assert(false, "Unexpected basic type for target of vector castD2X EVEX: %s", type2name(to_elem_bt)); 5170 } 5171 } 5172 } 5173 5174 void C2_MacroAssembler::vector_castF2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vec_enc) { 5175 switch(to_elem_bt) { 5176 case T_LONG: 5177 evcvttps2qqs(dst, src, vec_enc); 5178 break; 5179 case T_INT: 5180 evcvttps2dqs(dst, src, vec_enc); 5181 break; 5182 case T_SHORT: 5183 evcvttps2dqs(dst, src, vec_enc); 5184 evpmovdw(dst, dst, vec_enc); 5185 break; 5186 case T_BYTE: 5187 evcvttps2dqs(dst, src, vec_enc); 5188 evpmovdb(dst, dst, vec_enc); 5189 break; 5190 default: assert(false, "Unexpected basic type for target of vector castF2X AVX10 (reg src): %s", type2name(to_elem_bt)); 5191 } 5192 } 5193 5194 void C2_MacroAssembler::vector_castF2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, Address src, int vec_enc) { 5195 switch(to_elem_bt) { 5196 case T_LONG: 5197 evcvttps2qqs(dst, src, vec_enc); 5198 break; 5199 case T_INT: 5200 evcvttps2dqs(dst, src, vec_enc); 5201 break; 5202 case T_SHORT: 5203 evcvttps2dqs(dst, src, vec_enc); 5204 evpmovdw(dst, dst, vec_enc); 5205 break; 5206 case T_BYTE: 5207 evcvttps2dqs(dst, src, vec_enc); 5208 evpmovdb(dst, dst, vec_enc); 5209 break; 5210 default: assert(false, "Unexpected basic type for target of vector castF2X AVX10 (mem src): %s", type2name(to_elem_bt)); 5211 } 5212 } 5213 5214 void C2_MacroAssembler::vector_castD2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vec_enc) { 5215 switch(to_elem_bt) { 5216 case T_LONG: 5217 evcvttpd2qqs(dst, src, vec_enc); 5218 break; 5219 case T_INT: 5220 evcvttpd2dqs(dst, src, vec_enc); 5221 break; 5222 case T_SHORT: 5223 evcvttpd2dqs(dst, src, vec_enc); 5224 evpmovdw(dst, dst, vec_enc); 5225 break; 5226 case T_BYTE: 5227 evcvttpd2dqs(dst, src, vec_enc); 5228 evpmovdb(dst, dst, vec_enc); 5229 break; 5230 default: assert(false, "Unexpected basic type for target of vector castD2X 
AVX10 (reg src): %s", type2name(to_elem_bt)); 5231 } 5232 } 5233 5234 void C2_MacroAssembler::vector_castD2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, Address src, int vec_enc) { 5235 switch(to_elem_bt) { 5236 case T_LONG: 5237 evcvttpd2qqs(dst, src, vec_enc); 5238 break; 5239 case T_INT: 5240 evcvttpd2dqs(dst, src, vec_enc); 5241 break; 5242 case T_SHORT: 5243 evcvttpd2dqs(dst, src, vec_enc); 5244 evpmovdw(dst, dst, vec_enc); 5245 break; 5246 case T_BYTE: 5247 evcvttpd2dqs(dst, src, vec_enc); 5248 evpmovdb(dst, dst, vec_enc); 5249 break; 5250 default: assert(false, "Unexpected basic type for target of vector castD2X AVX10 (mem src): %s", type2name(to_elem_bt)); 5251 } 5252 } 5253 5254 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src, 5255 AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5256 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 5257 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5258 // and re-instantiate original MXCSR.RC mode after that. 5259 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5260 5261 mov64(tmp, julong_cast(0.5L)); 5262 evpbroadcastq(xtmp1, tmp, vec_enc); 5263 vaddpd(xtmp1, src , xtmp1, vec_enc); 5264 evcvtpd2qq(dst, xtmp1, vec_enc); 5265 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 5266 double_sign_flip, vec_enc);; 5267 5268 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5269 } 5270 5271 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src, 5272 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5273 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 5274 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5275 // and re-instantiate original MXCSR.RC mode after that. 
  ldmxcsr(new_mxcsr, tmp /*rscratch*/);

  movl(tmp, jint_cast(0.5));
  movq(xtmp1, tmp);
  vbroadcastss(xtmp1, xtmp1, vec_enc);
  vaddps(xtmp1, src , xtmp1, vec_enc);
  vcvtps2dq(dst, xtmp1, vec_enc);
  vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
                                              float_sign_flip, vec_enc);

  ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
}

// AVX flavor of vector float rounding: computes floor(val + 0.5f) by adding a
// broadcast 0.5f and converting under a caller-supplied MXCSR rounding mode,
// then fixes up special-case lanes and restores the standard MXCSR state.
void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
                                               AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
                                               Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
  // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf.
  // and re-instantiate original MXCSR.RC mode after that.
  ldmxcsr(new_mxcsr, tmp /*rscratch*/);

  // Broadcast 0.5f into every lane and add it to the source before the
  // float->int conversion.
  movl(tmp, jint_cast(0.5));
  movq(xtmp1, tmp);
  vbroadcastss(xtmp1, xtmp1, vec_enc);
  vaddps(xtmp1, src , xtmp1, vec_enc);
  vcvtps2dq(dst, xtmp1, vec_enc);
  // Patch lanes that hit special cases (handled by the shared AVX helper).
  vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc);

  // Restore the standard MXCSR state.
  ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
}

// Widening vector cast with zero extension between integral element types
// (byte/short/int sources), dispatching to the matching VPMOVZX* instruction.
void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
                                             BasicType from_elem_bt, BasicType to_elem_bt) {
  switch (from_elem_bt) {
    case T_BYTE:
      switch (to_elem_bt) {
        case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
        case T_INT: vpmovzxbd(dst, src, vlen_enc); break;
        case T_LONG: vpmovzxbq(dst, src, vlen_enc); break;
        default: ShouldNotReachHere();
      }
      break;
    case T_SHORT:
      switch (to_elem_bt) {
        case T_INT: vpmovzxwd(dst, src, vlen_enc); break;
        case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
        default: ShouldNotReachHere();
      }
      break;
    case T_INT:
      assert(to_elem_bt == T_LONG, "");
      vpmovzxdq(dst, src, vlen_enc);
      break;
    default:
      ShouldNotReachHere();
  }
}

// Widening vector cast with sign extension between integral element types,
// dispatching to the matching VPMOVSX* instruction.
void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
                                           BasicType from_elem_bt, BasicType to_elem_bt) {
  switch (from_elem_bt) {
    case T_BYTE:
      switch (to_elem_bt) {
        case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break;
        case T_INT: vpmovsxbd(dst, src, vlen_enc); break;
        case T_LONG: vpmovsxbq(dst, src, vlen_enc); break;
        default: ShouldNotReachHere();
      }
      break;
    case T_SHORT:
      switch (to_elem_bt) {
        case T_INT: vpmovsxwd(dst, src, vlen_enc); break;
        case T_LONG: vpmovsxwq(dst, src, vlen_enc); break;
        default: ShouldNotReachHere();
      }
      break;
    case T_INT:
      assert(to_elem_bt == T_LONG, "");
      vpmovsxdq(dst, src, vlen_enc);
      break;
    default:
      ShouldNotReachHere();
  }
}

// Re-size a vector mask held in an XMM register between element widths.
// Widening uses sign extension (mask lanes are all-ones/all-zeros); narrowing
// uses saturating pack instructions, with a cross-lane VPERMQ fix-up for
// 256-bit encodings. AVX-512 encodings are handled elsewhere (see assert).
void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src,
                                         BasicType dst_bt, BasicType src_bt, int vlen) {
  int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen);
  assert(vlen_enc != AVX_512bit, "");

  int dst_bt_size = type2aelembytes(dst_bt);
  int src_bt_size = type2aelembytes(src_bt);
  if (dst_bt_size > src_bt_size) {
    // Widening: one sign-extending move selected by the width ratio.
    switch (dst_bt_size / src_bt_size) {
      case 2: vpmovsxbw(dst, src, vlen_enc); break;
      case 4: vpmovsxbd(dst, src, vlen_enc); break;
      case 8: vpmovsxbq(dst, src, vlen_enc); break;
      default: ShouldNotReachHere();
    }
  } else {
    assert(dst_bt_size < src_bt_size, "");
    // Narrowing: saturating packs; for 256-bit vectors the pack operates
    // per 128-bit lane, so VPERMQ gathers the valid quadwords.
    switch (src_bt_size / dst_bt_size) {
      case 2: {
        if (vlen_enc == AVX_128bit) {
          vpacksswb(dst, src, src, vlen_enc);
        } else {
          vpacksswb(dst, src, src, vlen_enc);
          vpermq(dst, dst, 0x08, vlen_enc);
        }
        break;
      }
      case 4: {
        if (vlen_enc == AVX_128bit) {
          vpackssdw(dst, src, src, vlen_enc);
          vpacksswb(dst, dst, dst, vlen_enc);
        } else {
          vpackssdw(dst, src, src, vlen_enc);
          vpermq(dst, dst, 0x08, vlen_enc);
          vpacksswb(dst, dst, dst, AVX_128bit);
        }
        break;
      }
      case 8: {
        if (vlen_enc == AVX_128bit) {
          vpshufd(dst, src, 0x08, vlen_enc);
          vpackssdw(dst, dst, dst, vlen_enc);
          vpacksswb(dst, dst, dst, vlen_enc);
        } else {
          vpshufd(dst, src, 0x08, vlen_enc);
          vpermq(dst, dst, 0x08, vlen_enc);
          vpackssdw(dst, dst, dst, AVX_128bit);
          vpacksswb(dst, dst, dst, AVX_128bit);
        }
        break;
      }
      default: ShouldNotReachHere();
    }
  }
}

// Ternary logic (VPTERNLOG) dispatch on element type, register third operand.
void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
                                   bool merge, BasicType bt, int vlen_enc) {
  if (bt == T_INT) {
    evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
  } else {
    assert(bt == T_LONG, "");
    evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
  }
}

// Ternary logic (VPTERNLOG) dispatch on element type, memory third operand.
void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
                                   bool merge, BasicType bt, int vlen_enc) {
  if (bt == T_INT) {
    evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
  } else {
    assert(bt == T_LONG, "");
    evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
  }
}

// Expand a mask held in a GPR (one bit per lane) into a byte-per-lane vector,
// 8 lanes at a time, using PDEP to scatter the mask bits into byte positions.
void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
                                               Register rtmp2, XMMRegister xtmp, int mask_len,
                                               int vec_enc) {
  int index = 0;
  int vindex = 0;
  // Scatter the low 8 mask bits, one bit into the LSB of each byte.
  mov64(rtmp1, 0x0101010101010101L);
  pdepq(rtmp1, src, rtmp1);
  if (mask_len > 8) {
    movq(rtmp2, src);
    vpxor(xtmp, xtmp, xtmp, vec_enc);
    movq(xtmp, rtmp1);
  }
  movq(dst, rtmp1);

  mask_len -= 8;
  while (mask_len > 0) {
    assert ((mask_len & 0x7) == 0, "mask must be multiple of 8");
    index++;
    if ((index % 2) == 0) {
      pxor(xtmp, xtmp);
    }
    mov64(rtmp1, 0x0101010101010101L);
    shrq(rtmp2, 8);
    pdepq(rtmp1, rtmp2, rtmp1);
    pinsrq(xtmp, rtmp1, index % 2);
    vindex = index / 2;
    if (vindex) {
      // Write entire 16 byte vector when both 64 bit
      // lanes are update to save redundant instructions.
      if (index % 2) {
        vinsertf128(dst, dst, xtmp, vindex);
      }
    } else {
      vmovdqu(dst, xtmp);
    }
    mask_len -= 8;
  }
}

// Final scalar step of a vector-mask reduction: tmp holds the mask bits in a
// GPR; compute true-count / first-true / last-true / to-long into dst.
void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
  switch(opc) {
    case Op_VectorMaskTrueCount:
      popcntq(dst, tmp);
      break;
    case Op_VectorMaskLastTrue:
      if (VM_Version::supports_lzcnt()) {
        // last true index = 63 - lzcnt(mask)
        lzcntq(tmp, tmp);
        movl(dst, 63);
        subl(dst, tmp);
      } else {
        // BSR is undefined for zero input, so preload -1 and conditionally move.
        movl(dst, -1);
        bsrq(tmp, tmp);
        cmov32(Assembler::notZero, dst, tmp);
      }
      break;
    case Op_VectorMaskFirstTrue:
      if (VM_Version::supports_bmi1()) {
        if (masklen < 32) {
          // Set a sentinel bit just past the mask so tzcnt yields masklen
          // when no bit is set.
          orl(tmp, 1 << masklen);
          tzcntl(dst, tmp);
        } else if (masklen == 32) {
          tzcntl(dst, tmp);
        } else {
          assert(masklen == 64, "");
          tzcntq(dst, tmp);
        }
      } else {
        if (masklen < 32) {
          orl(tmp, 1 << masklen);
          bsfl(dst, tmp);
        } else {
          assert(masklen == 32 || masklen == 64, "");
          // BSF is undefined for zero input; preload masklen and cmov.
          movl(dst, masklen);
          if (masklen == 32) {
            bsfl(tmp, tmp);
          } else {
            bsfq(tmp, tmp);
          }
          cmov32(Assembler::notZero, dst, tmp);
        }
      }
      break;
    case Op_VectorMaskToLong:
      assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
      break;
    default: assert(false, "Unhandled mask operation");
  }
}

// Mask reduction for an opmask (KRegister) source: move the mask bits into a
// GPR, clip to the logical mask length, then reduce via the helper above.
void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
                                              int masklen, int masksize, int vec_enc) {
  assert(VM_Version::supports_popcnt(), "");

  if(VM_Version::supports_avx512bw()) {
    kmovql(tmp, mask);
  } else {
    assert(masklen <= 16, "");
    kmovwl(tmp, mask);
  }

  // Mask generated out of partial vector comparisons/replicate/mask manipulation
  // operations needs to be clipped.
  if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
    andq(tmp, (1 << masklen) - 1);
  }

  vector_mask_operation_helper(opc, dst, tmp, masklen);
}

// Mask reduction for a vector (XMM/YMM) mask source: extract the per-lane
// sign bits into a GPR with the movmsk family, clip to the logical mask
// length, then reduce via vector_mask_operation_helper.
void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
                                              Register tmp, int masklen, BasicType bt, int vec_enc) {
  assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) ||
         (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), "");
  assert(VM_Version::supports_popcnt(), "");

  bool need_clip = false;
  switch(bt) {
    case T_BOOLEAN:
      // While masks of other types contain 0, -1; boolean masks contain lane values of 0, 1
      // Negate (0 - lane) to turn 1 into -1 so the sign bit is set.
      vpxor(xtmp, xtmp, xtmp, vec_enc);
      vpsubb(xtmp, xtmp, mask, vec_enc);
      vpmovmskb(tmp, xtmp, vec_enc);
      need_clip = masklen < 16;
      break;
    case T_BYTE:
      vpmovmskb(tmp, mask, vec_enc);
      need_clip = masklen < 16;
      break;
    case T_SHORT:
      // Narrow word mask lanes to bytes before extracting sign bits.
      vpacksswb(xtmp, mask, mask, vec_enc);
      if (masklen >= 16) {
        vpermpd(xtmp, xtmp, 8, vec_enc);
      }
      vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
      need_clip = masklen < 16;
      break;
    case T_INT:
    case T_FLOAT:
      vmovmskps(tmp, mask, vec_enc);
      need_clip = masklen < 4;
      break;
    case T_LONG:
    case T_DOUBLE:
      vmovmskpd(tmp, mask, vec_enc);
      need_clip = masklen < 2;
      break;
    default: assert(false, "Unhandled type, %s", type2name(bt));
  }

  // Mask generated out of partial vector comparisons/replicate/mask manipulation
  // operations needs to be clipped.
  if (need_clip && opc != Op_VectorMaskFirstTrue) {
    // need_clip implies masklen < 32
    andq(tmp, (1 << masklen) - 1);
  }

  vector_mask_operation_helper(opc, dst, tmp, masklen);
}

// Compress the set bits of an opmask towards the least significant end
// using PEXT: dst gets a mask with popcount(src) low bits set.
void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
                                             Register rtmp2, int mask_len) {
  kmov(rtmp1, src);
  // Keep only the low mask_len bits of the source mask.
  andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
  mov64(rtmp2, -1L);
  pextq(rtmp2, rtmp2, rtmp1);
  kmov(dst, rtmp2);
}

// AVX2 fallback for vector compress/expand (no VPCOMPRESS/VPEXPAND):
// uses movmsk to index a precomputed permutation table, permutes the source
// lanes accordingly, and blends the unused lanes with zero.
void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src,
                                                    XMMRegister mask, Register rtmp, Register rscratch,
                                                    XMMRegister permv, XMMRegister xtmp, BasicType bt,
                                                    int vec_enc) {
  assert(type2aelembytes(bt) >= 4, "");
  assert(opcode == Op_CompressV || opcode == Op_ExpandV, "");
  address compress_perm_table = nullptr;
  address expand_perm_table = nullptr;
  if (type2aelembytes(bt) == 8) {
    compress_perm_table = StubRoutines::x86::compress_perm_table64();
    expand_perm_table = StubRoutines::x86::expand_perm_table64();
    vmovmskpd(rtmp, mask, vec_enc);
  } else {
    compress_perm_table = StubRoutines::x86::compress_perm_table32();
    expand_perm_table = StubRoutines::x86::expand_perm_table32();
    vmovmskps(rtmp, mask, vec_enc);
  }
  shlq(rtmp, 5); // for 32 byte permute row.
  if (opcode == Op_CompressV) {
    lea(rscratch, ExternalAddress(compress_perm_table));
  } else {
    lea(rscratch, ExternalAddress(expand_perm_table));
  }
  addptr(rtmp, rscratch);
  vmovdqu(permv, Address(rtmp));
  vpermps(dst, permv, src, Assembler::AVX_256bit);
  vpxor(xtmp, xtmp, xtmp, vec_enc);
  // Blend the result with zero vector using permute mask, each column entry
  // in a permute table row contains either a valid permute index or a -1 (default)
  // value, this can potentially be used as a blending mask after
  // compressing/expanding the source vector lanes.
  vblendvps(dst, dst, xtmp, permv, vec_enc, true, permv);
}

// AVX-512 vector compress/expand: dispatch to the element-type-specific
// VPCOMPRESS*/VPEXPAND* (or VCOMPRESSPS/PD, VEXPANDPS/PD) instruction.
void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
                                               bool merge, BasicType bt, int vec_enc) {
  if (opcode == Op_CompressV) {
    switch(bt) {
      case T_BYTE:
        evpcompressb(dst, mask, src, merge, vec_enc);
        break;
      case T_CHAR:
      case T_SHORT:
        evpcompressw(dst, mask, src, merge, vec_enc);
        break;
      case T_INT:
        evpcompressd(dst, mask, src, merge, vec_enc);
        break;
      case T_FLOAT:
        evcompressps(dst, mask, src, merge, vec_enc);
        break;
      case T_LONG:
        evpcompressq(dst, mask, src, merge, vec_enc);
        break;
      case T_DOUBLE:
        evcompresspd(dst, mask, src, merge, vec_enc);
        break;
      default:
        fatal("Unsupported type %s", type2name(bt));
        break;
    }
  } else {
    assert(opcode == Op_ExpandV, "");
    switch(bt) {
      case T_BYTE:
        evpexpandb(dst, mask, src, merge, vec_enc);
        break;
      case T_CHAR:
      case T_SHORT:
        evpexpandw(dst, mask, src, merge, vec_enc);
        break;
      case T_INT:
        evpexpandd(dst, mask, src, merge, vec_enc);
        break;
      case T_FLOAT:
        evexpandps(dst, mask, src, merge, vec_enc);
        break;
      case T_LONG:
        evpexpandq(dst, mask, src, merge, vec_enc);
        break;
      case T_DOUBLE:
        evexpandpd(dst, mask, src, merge, vec_enc);
        break;
      default:
        fatal("Unsupported type %s", type2name(bt));
        break;
    }
  }
}

// AVX-512 vector signum: dst = -1.0 / +1.0 / src depending on the sign of
// src, with NaN and signed zero lanes passed through unchanged.
void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
                                           KRegister ktmp1, int vec_enc) {
  if (opcode == Op_SignumVD) {
    vsubpd(dst, zero, one, vec_enc);  // dst = -1.0 in all lanes
    // if src < 0 ? -1 : 1
    evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
    evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
    // if src == NaN, -0.0 or 0.0 return src.
    evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
    evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
  } else {
    assert(opcode == Op_SignumVF, "");
    vsubps(dst, zero, one, vec_enc);  // dst = -1.0f in all lanes
    // if src < 0 ? -1 : 1
    evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
    evblendmps(dst, ktmp1, one, dst, true, vec_enc);
    // if src == NaN, -0.0 or 0.0 return src.
    evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
    evblendmps(dst, ktmp1, dst, src, true, vec_enc);
  }
}

// AVX vector signum: same contract as the EVEX variant, implemented with
// variable blends keyed off the source sign bit.
void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
                                          XMMRegister xtmp1, int vec_enc) {
  if (opcode == Op_SignumVD) {
    vsubpd(dst, zero, one, vec_enc);  // dst = -1.0 in all lanes
    // if src < 0 ? -1 : 1
    vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1);
    // if src == NaN, -0.0 or 0.0 return src.
    vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
    vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
  } else {
    assert(opcode == Op_SignumVF, "");
    vsubps(dst, zero, one, vec_enc);  // dst = -1.0f in all lanes
    // if src < 0 ? -1 : 1
    vblendvps(dst, one, dst, src, vec_enc, true, xtmp1);
    // if src == NaN, -0.0 or 0.0 return src.
5730 vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc); 5731 vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1); 5732 } 5733 } 5734 5735 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) { 5736 if (VM_Version::supports_avx512bw()) { 5737 if (mask_len > 32) { 5738 kmovql(dst, src); 5739 } else { 5740 kmovdl(dst, src); 5741 if (mask_len != 32) { 5742 kshiftrdl(dst, dst, 32 - mask_len); 5743 } 5744 } 5745 } else { 5746 assert(mask_len <= 16, ""); 5747 kmovwl(dst, src); 5748 if (mask_len != 16) { 5749 kshiftrwl(dst, dst, 16 - mask_len); 5750 } 5751 } 5752 } 5753 5754 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) { 5755 int lane_size = type2aelembytes(bt); 5756 if ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) || 5757 (is_subword_type(bt) && VM_Version::supports_avx512vlbw())) { 5758 movptr(rtmp, imm32); 5759 switch(lane_size) { 5760 case 1 : evpbroadcastb(dst, rtmp, vec_enc); break; 5761 case 2 : evpbroadcastw(dst, rtmp, vec_enc); break; 5762 case 4 : evpbroadcastd(dst, rtmp, vec_enc); break; 5763 case 8 : evpbroadcastq(dst, rtmp, vec_enc); break; 5764 fatal("Unsupported lane size %d", lane_size); 5765 break; 5766 } 5767 } else { 5768 movptr(rtmp, imm32); 5769 movq(dst, rtmp); 5770 switch(lane_size) { 5771 case 1 : vpbroadcastb(dst, dst, vec_enc); break; 5772 case 2 : vpbroadcastw(dst, dst, vec_enc); break; 5773 case 4 : vpbroadcastd(dst, dst, vec_enc); break; 5774 case 8 : vpbroadcastq(dst, dst, vec_enc); break; 5775 fatal("Unsupported lane size %d", lane_size); 5776 break; 5777 } 5778 } 5779 } 5780 5781 // 5782 // Following is lookup table based popcount computation algorithm:- 5783 // Index Bit set count 5784 // [ 0000 -> 0, 5785 // 0001 -> 1, 5786 // 0010 -> 1, 5787 // 0011 -> 2, 5788 // 0100 -> 1, 5789 // 0101 -> 2, 5790 // 0110 -> 2, 5791 // 0111 -> 3, 5792 // 1000 -> 1, 5793 // 1001 -> 2, 5794 // 1010 -> 3, 5795 // 1011 -> 3, 5796 // 
1100 -> 2, 5797 // 1101 -> 3, 5798 // 1111 -> 4 ] 5799 // a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as 5800 // shuffle indices for lookup table access. 5801 // b. Right shift each byte of vector lane by 4 positions. 5802 // c. Count the number of 1s in 4 MSB bits each byte. These bits are used as 5803 // shuffle indices for lookup table access. 5804 // d. Add the bitset count of upper and lower 4 bits of each byte. 5805 // e. Unpack double words to quad words and compute sum of absolute difference of bitset 5806 // count of all the bytes of a quadword. 5807 // f. Perform step e. for upper 128bit vector lane. 5808 // g. Pack the bitset count of quadwords back to double word. 5809 // h. Unpacking and packing operations are not needed for 64bit vector lane. 5810 5811 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5812 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5813 assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), ""); 5814 vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc); 5815 vpsrlw(dst, src, 4, vec_enc); 5816 vpand(dst, dst, xtmp1, vec_enc); 5817 vpand(xtmp1, src, xtmp1, vec_enc); 5818 vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg); 5819 vpshufb(xtmp1, xtmp2, xtmp1, vec_enc); 5820 vpshufb(dst, xtmp2, dst, vec_enc); 5821 vpaddb(dst, dst, xtmp1, vec_enc); 5822 } 5823 5824 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5825 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5826 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5827 // Following code is as per steps e,f,g and h of above algorithm. 
5828 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5829 vpunpckhdq(dst, xtmp1, xtmp2, vec_enc); 5830 vpsadbw(dst, dst, xtmp2, vec_enc); 5831 vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc); 5832 vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc); 5833 vpackuswb(dst, xtmp1, dst, vec_enc); 5834 } 5835 5836 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5837 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5838 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5839 // Add the popcount of upper and lower bytes of word. 5840 vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc); 5841 vpsrlw(dst, xtmp1, 8, vec_enc); 5842 vpand(xtmp1, xtmp1, xtmp2, vec_enc); 5843 vpaddw(dst, dst, xtmp1, vec_enc); 5844 } 5845 5846 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5847 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5848 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5849 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5850 vpsadbw(dst, xtmp1, xtmp2, vec_enc); 5851 } 5852 5853 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5854 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5855 switch(bt) { 5856 case T_LONG: 5857 vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5858 break; 5859 case T_INT: 5860 vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5861 break; 5862 case T_CHAR: 5863 case T_SHORT: 5864 vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5865 break; 5866 case T_BYTE: 5867 case T_BOOLEAN: 5868 vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5869 break; 5870 default: 5871 fatal("Unsupported type %s", type2name(bt)); 5872 break; 5873 } 5874 } 5875 5876 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src, 5877 KRegister mask, bool merge, int vec_enc) { 5878 assert(VM_Version::supports_avx512vl() || vec_enc == 
Assembler::AVX_512bit, "");
  switch(bt) {
    case T_LONG:
      assert(VM_Version::supports_avx512_vpopcntdq(), "");
      evpopcntq(dst, mask, src, merge, vec_enc);
      break;
    case T_INT:
      assert(VM_Version::supports_avx512_vpopcntdq(), "");
      evpopcntd(dst, mask, src, merge, vec_enc);
      break;
    case T_CHAR:
    case T_SHORT:
      assert(VM_Version::supports_avx512_bitalg(), "");
      evpopcntw(dst, mask, src, merge, vec_enc);
      break;
    case T_BYTE:
    case T_BOOLEAN:
      assert(VM_Version::supports_avx512_bitalg(), "");
      evpopcntb(dst, mask, src, merge, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}

// Bit reversal algorithm first reverses the bits of each byte followed by
// a byte level reversal for multi-byte primitive types (short/int/long).
// Algorithm performs a lookup table access to get reverse bit sequence
// corresponding to a 4 bit value. Thus a reverse bit sequence for a byte
// is obtained by swapping the reverse bit sequences of upper and lower
// nibble of a byte.
void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                           XMMRegister xtmp2, Register rtmp, int vec_enc) {
  if (VM_Version::supports_avx512vlbw()) {

    // Get the reverse bit sequence of lower nibble of each byte.
    vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
    vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
    evpandq(dst, xtmp2, src, vec_enc);
    vpshufb(dst, xtmp1, dst, vec_enc);
    vpsllq(dst, dst, 4, vec_enc);

    // Get the reverse bit sequence of upper nibble of each byte.
    vpandn(xtmp2, xtmp2, src, vec_enc);
    vpsrlq(xtmp2, xtmp2, 4, vec_enc);
    vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);

    // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
    // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
    evporq(xtmp2, dst, xtmp2, vec_enc);
    vector_reverse_byte(bt, dst, xtmp2, vec_enc);

  } else if(vec_enc == Assembler::AVX_512bit) {
    // Shift based bit reversal.
    assert(bt == T_LONG || bt == T_INT, "");

    // Swap lower and upper nibble of each byte.
    vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);

    // Swap two least and most significant bits of each nibble.
    vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);

    // Swap adjacent pair of bits.
    evmovdqul(xtmp1, k0, dst, true, vec_enc);
    vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);

    evmovdqul(xtmp1, k0, dst, true, vec_enc);
    vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
  } else {
    // AVX2 path: same lookup-table scheme as the EVEX branch but with
    // VEX-encoded logic instructions.
    vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
    vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);

    // Get the reverse bit sequence of lower nibble of each byte.
    vpand(dst, xtmp2, src, vec_enc);
    vpshufb(dst, xtmp1, dst, vec_enc);
    vpsllq(dst, dst, 4, vec_enc);

    // Get the reverse bit sequence of upper nibble of each byte.
    vpandn(xtmp2, xtmp2, src, vec_enc);
    vpsrlq(xtmp2, xtmp2, 4, vec_enc);
    vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);

    // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
    // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
    vpor(xtmp2, dst, xtmp2, vec_enc);
    vector_reverse_byte(bt, dst, xtmp2, vec_enc);
  }
}

// GFNI-based per-byte bit reversal, followed by byte reversal for
// multi-byte element types.
void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
                                                XMMRegister xtmp, Register rscratch) {
  assert(VM_Version::supports_gfni(), "");
  assert(rscratch != noreg || always_reachable(mask), "missing");

  // Galois field instruction based bit reversal based on following algorithm.
  // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
  vpbroadcastq(xtmp, mask, vec_enc, rscratch);
  vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
  vector_reverse_byte(bt, dst, xtmp, vec_enc);
}

// Swap adjacent nbits-wide bit groups selected by bitmask:
// dst = ((src & mask) << nbits) | ((src & ~mask) >> nbits).
void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
                                          XMMRegister xtmp1, Register rtmp, int vec_enc) {
  vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
  evpandq(dst, xtmp1, src, vec_enc);
  vpsllq(dst, dst, nbits, vec_enc);
  vpandn(xtmp1, xtmp1, src, vec_enc);
  vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
  evporq(dst, dst, xtmp1, vec_enc);
}

// Shift/rotate based byte reversal (EVEX): progressively swap halves down to
// byte granularity according to the element type.
void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
  // Shift based bit reversal.
  assert(VM_Version::supports_evex(), "");
  switch(bt) {
    case T_LONG:
      // Swap upper and lower double word of each quad word.
      evprorq(xtmp1, k0, src, 32, true, vec_enc);
      evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
      vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
      break;
    case T_INT:
      // Swap upper and lower word of each double word.
      evprord(xtmp1, k0, src, 16, true, vec_enc);
      vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
      break;
    case T_CHAR:
    case T_SHORT:
      // Swap upper and lower byte of each word.
      vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
      break;
    case T_BYTE:
      evmovdquq(dst, k0, src, true, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}

// Byte reversal via VPSHUFB with a precomputed per-type permutation mask;
// T_BYTE degenerates to a plain copy.
void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
  if (bt == T_BYTE) {
    if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
      evmovdquq(dst, k0, src, true, vec_enc);
    } else {
      vmovdqu(dst, src);
    }
    return;
  }
  // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
  // pre-computed shuffle indices.
  switch(bt) {
    case T_LONG:
      vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
      break;
    case T_INT:
      vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
      break;
    case T_CHAR:
    case T_SHORT:
      vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
  vpshufb(dst, src, dst, vec_enc);
}

// AVX-512 CD leading zero count: native VPLZCNT for int/long; shorts go
// through dword lzcnt via zero-interleave; bytes use a nibble lookup table.
void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
                                                        KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
  assert(is_integral_type(bt), "");
  assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
  assert(VM_Version::supports_avx512cd(), "");
  switch(bt) {
    case T_LONG:
      evplzcntq(dst, ktmp, src, merge, vec_enc);
      break;
    case T_INT:
      evplzcntd(dst, ktmp, src, merge, vec_enc);
      break;
    case T_SHORT:
      // Interleave each word with an all-ones word so the dword lzcnt of the
      // pair equals the word's lzcnt, then re-pack to words.
      vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
      vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
      evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
      vpunpckhwd(dst, xtmp1, src, vec_enc);
      evplzcntd(dst, ktmp, dst, merge, vec_enc);
      vpackusdw(dst, xtmp2, dst, vec_enc);
      break;
    case T_BYTE:
      // T1 = Compute leading zero counts of 4 LSB bits of each byte by
      // accessing the lookup table.
      // T2 = Compute leading zero counts of 4 MSB bits of each byte by
      // accessing the lookup table.
      // Add T1 to T2 if 4 MSB bits of byte are all zeros.
      assert(VM_Version::supports_avx512bw(), "");
      evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
      vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
      vpand(xtmp2, dst, src, vec_enc);
      vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
      vpsrlw(xtmp3, src, 4, vec_enc);
      vpand(xtmp3, dst, xtmp3, vec_enc);
      vpshufb(dst, xtmp1, xtmp3, vec_enc);
      vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
      evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
      evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}

// AVX2 per-byte leading zero count via nibble lookup table.
// Post-condition relied on by callers: xtmp1 is left zeroed.
void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                            XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
  vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
  // T1 = Compute leading zero counts of 4 LSB bits of each byte by
  // accessing the lookup table.
  vpand(dst, xtmp2, src, vec_enc);
  vpshufb(dst, xtmp1, dst, vec_enc);
  // T2 = Compute leading zero counts of 4 MSB bits of each byte by
  // accessing the lookup table.
  vpsrlw(xtmp3, src, 4, vec_enc);
  vpand(xtmp3, xtmp2, xtmp3, vec_enc);
  vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
  // Add T1 to T2 if 4 MSB bits of byte are all zeros.
  vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
  vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
  vpaddb(dst, dst, xtmp2, vec_enc);
  vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
}

// AVX2 per-short leading zero count built on the per-byte version.
void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
  // Add zero counts of lower byte and upper byte of a word if
  // upper byte holds a zero value.
  vpsrlw(xtmp3, src, 8, vec_enc);
  // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
  vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
  vpsllw(xtmp2, dst, 8, vec_enc);
  vpaddw(xtmp2, xtmp2, dst, vec_enc);
  vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
  vpsrlw(dst, dst, 8, vec_enc);
}

void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                           XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
  // By converting the integer to a float, we can obtain the number of leading zeros based on the exponent of the float.
  // As the float exponent contains a bias of 127 for nonzero values, the bias must be removed before interpreting the
  // exponent as the leading zero count.

  // Remove the bit to the right of the highest set bit ensuring that the conversion to float cannot round up to a higher
  // power of 2, which has a higher exponent than the input. This transformation is valid as only the highest set bit
  // contributes to the leading number of zeros.
  vpsrld(dst, src, 1, vec_enc);
  vpandn(dst, dst, src, vec_enc);

  vcvtdq2ps(dst, dst, vec_enc);

  // By comparing the register to itself, all the bits in the destination are set.
  vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);

  // Move the biased exponent to the low end of the lane and mask with 0xFF to discard the sign bit.
  vpsrld(xtmp2, xtmp1, 24, vec_enc);
  vpsrld(dst, dst, 23, vec_enc);
  vpand(dst, xtmp2, dst, vec_enc);

  // Subtract 127 from the exponent, which removes the bias from the exponent.
  vpsrld(xtmp2, xtmp1, 25, vec_enc);
  vpsubd(dst, dst, xtmp2, vec_enc);

  vpsrld(xtmp2, xtmp1, 27, vec_enc);

  // If the original value is 0 the exponent would not have bias, so the subtraction creates a negative number. If this
  // is found in any of the lanes, replace the lane with -1 from xtmp1.
  vblendvps(dst, dst, xtmp1, dst, vec_enc, true, xtmp3);

  // If the original value is negative, replace the lane with 31.
  vblendvps(dst, dst, xtmp2, src, vec_enc, true, xtmp3);

  // Subtract the exponent from 31, giving the final result. For 0, the result is 32 as the exponent was replaced with -1,
  // and for negative numbers the result is 0 as the exponent was replaced with 31.
  vpsubd(dst, xtmp2, dst, vec_enc);
}

void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                            XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  // Find the leading zeros of the top and bottom halves of the long individually.
  vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);

  // Move the top half result to the bottom half of xtmp1, setting the top half to 0.
  vpsrlq(xtmp1, dst, 32, vec_enc);
  // By moving the top half result to the right by 6 bits, if the top half was empty (i.e. 32 is returned) the result bit will
  // be in the most significant position of the bottom half.
  vpsrlq(xtmp2, dst, 6, vec_enc);

  // In the bottom half, add the top half and bottom half results.
  vpaddq(dst, xtmp1, dst, vec_enc);

  // For the bottom half, choose between the values using the most significant bit of xtmp2.
  // If the MSB is set, then bottom+top in dst is the resulting value. If the top half is less than 32 xtmp1 is chosen,
  // which contains only the top half result.
  // In the top half the MSB is always zero, so the value in xtmp1 is always chosen. This value is always 0, which clears
  // the lane as required.
  vblendvps(dst, xtmp1, dst, xtmp2, vec_enc, true, xtmp3);
}

// Element-type dispatch for the AVX (non-512-bit) leading-zero-count paths.
void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
                                                       XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
                                                       Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "unexpected type");
  assert(vec_enc < Assembler::AVX_512bit, "");
  switch(bt) {
    case T_LONG:
      vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
      break;
    case T_INT:
      vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
      break;
    case T_SHORT:
      vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
      break;
    case T_BYTE:
      vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}

// Vector subtraction dispatch on element type (dst = src1 - src2).
void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
  switch(bt) {
    case T_BYTE:
      vpsubb(dst, src1, src2, vec_enc);
      break;
    case T_SHORT:
      vpsubw(dst, src1, src2, vec_enc);
      break;
    case T_INT:
      vpsubd(dst, src1, src2, vec_enc);
      break;
    case T_LONG:
      vpsubq(dst, src1, src2, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}

// Trailing zero count computation is based on leading zero
count operation as per 6235 // following equation. All AVX3 targets support AVX512CD feature which offers 6236 // direct vector instruction to compute leading zero count. 6237 // CTZ = PRIM_TYPE_WIDHT - CLZ((x - 1) & ~x) 6238 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src, 6239 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 6240 XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) { 6241 assert(is_integral_type(bt), ""); 6242 // xtmp = -1 6243 vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc); 6244 // xtmp = xtmp + src 6245 vpadd(bt, xtmp4, xtmp4, src, vec_enc); 6246 // xtmp = xtmp & ~src 6247 vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc); 6248 vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc); 6249 vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc); 6250 vpsub(bt, dst, xtmp4, dst, vec_enc); 6251 } 6252 6253 // Trailing zero count computation for AVX2 targets is based on popcount operation as per following equation 6254 // CTZ = PRIM_TYPE_WIDHT - POPC(x | -x) 6255 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6256 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6257 assert(is_integral_type(bt), ""); 6258 // xtmp = 0 6259 vpxor(xtmp3 , xtmp3, xtmp3, vec_enc); 6260 // xtmp = 0 - src 6261 vpsub(bt, xtmp3, xtmp3, src, vec_enc); 6262 // xtmp = xtmp | src 6263 vpor(xtmp3, xtmp3, src, vec_enc); 6264 vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc); 6265 vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc); 6266 vpsub(bt, dst, xtmp1, dst, vec_enc); 6267 } 6268 6269 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) { 6270 Label done; 6271 Label neg_divisor_fastpath; 6272 cmpl(divisor, 0); 6273 jccb(Assembler::less, neg_divisor_fastpath); 6274 xorl(rdx, rdx); 6275 divl(divisor); 6276 jmpb(done); 
6277 bind(neg_divisor_fastpath); 6278 // Fastpath for divisor < 0: 6279 // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1) 6280 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned() 6281 movl(rdx, rax); 6282 subl(rdx, divisor); 6283 if (VM_Version::supports_bmi1()) { 6284 andnl(rax, rdx, rax); 6285 } else { 6286 notl(rdx); 6287 andl(rax, rdx); 6288 } 6289 shrl(rax, 31); 6290 bind(done); 6291 } 6292 6293 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) { 6294 Label done; 6295 Label neg_divisor_fastpath; 6296 cmpl(divisor, 0); 6297 jccb(Assembler::less, neg_divisor_fastpath); 6298 xorl(rdx, rdx); 6299 divl(divisor); 6300 jmpb(done); 6301 bind(neg_divisor_fastpath); 6302 // Fastpath when divisor < 0: 6303 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor) 6304 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned() 6305 movl(rdx, rax); 6306 subl(rax, divisor); 6307 if (VM_Version::supports_bmi1()) { 6308 andnl(rax, rax, rdx); 6309 } else { 6310 notl(rax); 6311 andl(rax, rdx); 6312 } 6313 sarl(rax, 31); 6314 andl(rax, divisor); 6315 subl(rdx, rax); 6316 bind(done); 6317 } 6318 6319 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) { 6320 Label done; 6321 Label neg_divisor_fastpath; 6322 6323 cmpl(divisor, 0); 6324 jccb(Assembler::less, neg_divisor_fastpath); 6325 xorl(rdx, rdx); 6326 divl(divisor); 6327 jmpb(done); 6328 bind(neg_divisor_fastpath); 6329 // Fastpath for divisor < 0: 6330 // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1) 6331 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor) 6332 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in 6333 // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned() 6334 movl(rdx, rax); 6335 
subl(rax, divisor); 6336 if (VM_Version::supports_bmi1()) { 6337 andnl(rax, rax, rdx); 6338 } else { 6339 notl(rax); 6340 andl(rax, rdx); 6341 } 6342 movl(tmp, rax); 6343 shrl(rax, 31); // quotient 6344 sarl(tmp, 31); 6345 andl(tmp, divisor); 6346 subl(rdx, tmp); // remainder 6347 bind(done); 6348 } 6349 6350 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1, 6351 XMMRegister xtmp2, Register rtmp) { 6352 if(VM_Version::supports_gfni()) { 6353 // Galois field instruction based bit reversal based on following algorithm. 6354 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 6355 mov64(rtmp, 0x8040201008040201L); 6356 movq(xtmp1, src); 6357 movq(xtmp2, rtmp); 6358 gf2p8affineqb(xtmp1, xtmp2, 0); 6359 movq(dst, xtmp1); 6360 } else { 6361 // Swap even and odd numbered bits. 6362 movl(rtmp, src); 6363 andl(rtmp, 0x55555555); 6364 shll(rtmp, 1); 6365 movl(dst, src); 6366 andl(dst, 0xAAAAAAAA); 6367 shrl(dst, 1); 6368 orl(dst, rtmp); 6369 6370 // Swap LSB and MSB 2 bits of each nibble. 6371 movl(rtmp, dst); 6372 andl(rtmp, 0x33333333); 6373 shll(rtmp, 2); 6374 andl(dst, 0xCCCCCCCC); 6375 shrl(dst, 2); 6376 orl(dst, rtmp); 6377 6378 // Swap LSB and MSB 4 bits of each byte. 6379 movl(rtmp, dst); 6380 andl(rtmp, 0x0F0F0F0F); 6381 shll(rtmp, 4); 6382 andl(dst, 0xF0F0F0F0); 6383 shrl(dst, 4); 6384 orl(dst, rtmp); 6385 } 6386 bswapl(dst); 6387 } 6388 6389 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1, 6390 XMMRegister xtmp2, Register rtmp1, Register rtmp2) { 6391 if(VM_Version::supports_gfni()) { 6392 // Galois field instruction based bit reversal based on following algorithm. 6393 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 6394 mov64(rtmp1, 0x8040201008040201L); 6395 movq(xtmp1, src); 6396 movq(xtmp2, rtmp1); 6397 gf2p8affineqb(xtmp1, xtmp2, 0); 6398 movq(dst, xtmp1); 6399 } else { 6400 // Swap even and odd numbered bits. 
6401 movq(rtmp1, src); 6402 mov64(rtmp2, 0x5555555555555555L); 6403 andq(rtmp1, rtmp2); 6404 shlq(rtmp1, 1); 6405 movq(dst, src); 6406 notq(rtmp2); 6407 andq(dst, rtmp2); 6408 shrq(dst, 1); 6409 orq(dst, rtmp1); 6410 6411 // Swap LSB and MSB 2 bits of each nibble. 6412 movq(rtmp1, dst); 6413 mov64(rtmp2, 0x3333333333333333L); 6414 andq(rtmp1, rtmp2); 6415 shlq(rtmp1, 2); 6416 notq(rtmp2); 6417 andq(dst, rtmp2); 6418 shrq(dst, 2); 6419 orq(dst, rtmp1); 6420 6421 // Swap LSB and MSB 4 bits of each byte. 6422 movq(rtmp1, dst); 6423 mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL); 6424 andq(rtmp1, rtmp2); 6425 shlq(rtmp1, 4); 6426 notq(rtmp2); 6427 andq(dst, rtmp2); 6428 shrq(dst, 4); 6429 orq(dst, rtmp1); 6430 } 6431 bswapq(dst); 6432 } 6433 6434 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) { 6435 Label done; 6436 Label neg_divisor_fastpath; 6437 cmpq(divisor, 0); 6438 jccb(Assembler::less, neg_divisor_fastpath); 6439 xorl(rdx, rdx); 6440 divq(divisor); 6441 jmpb(done); 6442 bind(neg_divisor_fastpath); 6443 // Fastpath for divisor < 0: 6444 // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1) 6445 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned() 6446 movq(rdx, rax); 6447 subq(rdx, divisor); 6448 if (VM_Version::supports_bmi1()) { 6449 andnq(rax, rdx, rax); 6450 } else { 6451 notq(rdx); 6452 andq(rax, rdx); 6453 } 6454 shrq(rax, 63); 6455 bind(done); 6456 } 6457 6458 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) { 6459 Label done; 6460 Label neg_divisor_fastpath; 6461 cmpq(divisor, 0); 6462 jccb(Assembler::less, neg_divisor_fastpath); 6463 xorq(rdx, rdx); 6464 divq(divisor); 6465 jmp(done); 6466 bind(neg_divisor_fastpath); 6467 // Fastpath when divisor < 0: 6468 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor) 6469 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in 
java.lang.Long.remainderUnsigned() 6470 movq(rdx, rax); 6471 subq(rax, divisor); 6472 if (VM_Version::supports_bmi1()) { 6473 andnq(rax, rax, rdx); 6474 } else { 6475 notq(rax); 6476 andq(rax, rdx); 6477 } 6478 sarq(rax, 63); 6479 andq(rax, divisor); 6480 subq(rdx, rax); 6481 bind(done); 6482 } 6483 6484 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) { 6485 Label done; 6486 Label neg_divisor_fastpath; 6487 cmpq(divisor, 0); 6488 jccb(Assembler::less, neg_divisor_fastpath); 6489 xorq(rdx, rdx); 6490 divq(divisor); 6491 jmp(done); 6492 bind(neg_divisor_fastpath); 6493 // Fastpath for divisor < 0: 6494 // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1) 6495 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor) 6496 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in 6497 // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned() 6498 movq(rdx, rax); 6499 subq(rax, divisor); 6500 if (VM_Version::supports_bmi1()) { 6501 andnq(rax, rax, rdx); 6502 } else { 6503 notq(rax); 6504 andq(rax, rdx); 6505 } 6506 movq(tmp, rax); 6507 shrq(rax, 63); // quotient 6508 sarq(tmp, 63); 6509 andq(tmp, divisor); 6510 subq(rdx, tmp); // remainder 6511 bind(done); 6512 } 6513 6514 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1, 6515 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp, 6516 int vlen_enc) { 6517 assert(VM_Version::supports_avx512bw(), ""); 6518 // Byte shuffles are inlane operations and indices are determined using 6519 // lower 4 bit of each shuffle lane, thus all shuffle indices are 6520 // normalized to index range 0-15. This makes sure that all the multiples 6521 // of an index value are placed at same relative position in 128 bit 6522 // lane i.e. 
elements corresponding to shuffle indices 16, 32 and 64 6523 // will be 16th element in their respective 128 bit lanes. 6524 movl(rtmp, 16); 6525 evpbroadcastb(xtmp1, rtmp, vlen_enc); 6526 6527 // Compute a mask for shuffle vector by comparing indices with expression INDEX < 16, 6528 // Broadcast first 128 bit lane across entire vector, shuffle the vector lanes using 6529 // original shuffle indices and move the shuffled lanes corresponding to true 6530 // mask to destination vector. 6531 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc); 6532 evshufi64x2(xtmp2, src, src, 0x0, vlen_enc); 6533 evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc); 6534 6535 // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32 6536 // and broadcasting second 128 bit lane. 6537 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc); 6538 vpsllq(xtmp2, xtmp1, 0x1, vlen_enc); 6539 evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc); 6540 evshufi64x2(xtmp3, src, src, 0x55, vlen_enc); 6541 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc); 6542 6543 // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48 6544 // and broadcasting third 128 bit lane. 6545 evpcmpb(ktmp, k0, shuffle, xtmp2, Assembler::nlt, true, vlen_enc); 6546 vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc); 6547 evpcmpb(ktmp, ktmp, shuffle, xtmp1, Assembler::lt, true, vlen_enc); 6548 evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc); 6549 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc); 6550 6551 // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64 6552 // and broadcasting third 128 bit lane. 
6553 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc); 6554 vpsllq(xtmp2, xtmp2, 0x1, vlen_enc); 6555 evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc); 6556 evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc); 6557 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc); 6558 } 6559 6560 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst, 6561 XMMRegister shuffle, XMMRegister src, int vlen_enc) { 6562 if (vlen_enc == AVX_128bit) { 6563 vpermilps(dst, src, shuffle, vlen_enc); 6564 } else if (bt == T_INT) { 6565 vpermd(dst, shuffle, src, vlen_enc); 6566 } else { 6567 assert(bt == T_FLOAT, ""); 6568 vpermps(dst, shuffle, src, vlen_enc); 6569 } 6570 } 6571 6572 void C2_MacroAssembler::efp16sh(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 6573 switch(opcode) { 6574 case Op_AddHF: vaddsh(dst, src1, src2); break; 6575 case Op_SubHF: vsubsh(dst, src1, src2); break; 6576 case Op_MulHF: vmulsh(dst, src1, src2); break; 6577 case Op_DivHF: vdivsh(dst, src1, src2); break; 6578 default: assert(false, "%s", NodeClassNames[opcode]); break; 6579 } 6580 } 6581 6582 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { 6583 switch(elem_bt) { 6584 case T_BYTE: 6585 if (ideal_opc == Op_SaturatingAddV) { 6586 vpaddsb(dst, src1, src2, vlen_enc); 6587 } else { 6588 assert(ideal_opc == Op_SaturatingSubV, ""); 6589 vpsubsb(dst, src1, src2, vlen_enc); 6590 } 6591 break; 6592 case T_SHORT: 6593 if (ideal_opc == Op_SaturatingAddV) { 6594 vpaddsw(dst, src1, src2, vlen_enc); 6595 } else { 6596 assert(ideal_opc == Op_SaturatingSubV, ""); 6597 vpsubsw(dst, src1, src2, vlen_enc); 6598 } 6599 break; 6600 default: 6601 fatal("Unsupported type %s", type2name(elem_bt)); 6602 break; 6603 } 6604 } 6605 6606 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, 
XMMRegister src2, int vlen_enc) { 6607 switch(elem_bt) { 6608 case T_BYTE: 6609 if (ideal_opc == Op_SaturatingAddV) { 6610 vpaddusb(dst, src1, src2, vlen_enc); 6611 } else { 6612 assert(ideal_opc == Op_SaturatingSubV, ""); 6613 vpsubusb(dst, src1, src2, vlen_enc); 6614 } 6615 break; 6616 case T_SHORT: 6617 if (ideal_opc == Op_SaturatingAddV) { 6618 vpaddusw(dst, src1, src2, vlen_enc); 6619 } else { 6620 assert(ideal_opc == Op_SaturatingSubV, ""); 6621 vpsubusw(dst, src1, src2, vlen_enc); 6622 } 6623 break; 6624 default: 6625 fatal("Unsupported type %s", type2name(elem_bt)); 6626 break; 6627 } 6628 } 6629 6630 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, 6631 XMMRegister src2, KRegister ktmp, int vlen_enc) { 6632 // For unsigned subtraction, overflow happens when magnitude of second input is greater than first input. 6633 // overflow_mask = Inp1 <u Inp2 6634 evpcmpu(elem_bt, ktmp, src2, src1, Assembler::lt, vlen_enc); 6635 // Res = overflow_mask ? Zero : INP1 - INP2 (non-commutative and non-associative) 6636 evmasked_op(elem_bt == T_INT ? Op_SubVI : Op_SubVL, elem_bt, ktmp, dst, src1, src2, false, vlen_enc, false); 6637 } 6638 6639 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, 6640 XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) { 6641 // Emulate unsigned comparison using signed comparison 6642 // Mask = Inp1 <u Inp2 => Inp1 + MIN_VALUE < Inp2 + MIN_VALUE 6643 vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc, true); 6644 vpadd(elem_bt, xtmp2, src1, xtmp1, vlen_enc); 6645 vpadd(elem_bt, xtmp1, src2, xtmp1, vlen_enc); 6646 6647 vpcmpgt(elem_bt, xtmp2, xtmp1, xtmp2, vlen_enc); 6648 6649 // Res = INP1 - INP2 (non-commutative and non-associative) 6650 vpsub(elem_bt, dst, src1, src2, vlen_enc); 6651 // Res = Mask ? 
Zero : Res 6652 vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); 6653 vpblendvb(dst, dst, xtmp1, xtmp2, vlen_enc); 6654 } 6655 6656 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, 6657 XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp, int vlen_enc) { 6658 // Unsigned values ranges comprise of only +ve numbers, thus there exist only an upper bound saturation. 6659 // overflow_mask = (SRC1 + SRC2) <u (SRC1 | SRC2) 6660 // Res = Signed Add INP1, INP2 6661 vpadd(elem_bt, dst, src1, src2, vlen_enc); 6662 // T1 = SRC1 | SRC2 6663 vpor(xtmp1, src1, src2, vlen_enc); 6664 // Max_Unsigned = -1 6665 vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc); 6666 // Unsigned compare: Mask = Res <u T1 6667 evpcmpu(elem_bt, ktmp, dst, xtmp1, Assembler::lt, vlen_enc); 6668 // res = Mask ? Max_Unsigned : Res 6669 evpblend(elem_bt, dst, ktmp, dst, xtmp2, true, vlen_enc); 6670 } 6671 6672 // 6673 // Section 2-13 Hacker's Delight list following overflow detection check for saturating 6674 // unsigned addition operation. 6675 // overflow_mask = ((a & b) | ((a | b) & ~( a + b))) >>> 31 == 1 6676 // 6677 // We empirically determined its semantic equivalence to following reduced expression 6678 // overflow_mask = (a + b) <u (a | b) 6679 // 6680 // and also verified it though Alive2 solver. 6681 // (https://alive2.llvm.org/ce/z/XDQ7dY) 6682 // 6683 6684 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, 6685 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, int vlen_enc) { 6686 // Res = Signed Add INP1, INP2 6687 vpadd(elem_bt, dst, src1, src2, vlen_enc); 6688 // Compute T1 = INP1 | INP2 6689 vpor(xtmp3, src1, src2, vlen_enc); 6690 // T1 = Minimum signed value. 
6691 vpgenmin_value(elem_bt, xtmp2, xtmp1, vlen_enc, true); 6692 // Convert T1 to signed value, T1 = T1 + MIN_VALUE 6693 vpadd(elem_bt, xtmp3, xtmp3, xtmp2, vlen_enc); 6694 // Convert Res to signed value, Res<s> = Res + MIN_VALUE 6695 vpadd(elem_bt, xtmp2, xtmp2, dst, vlen_enc); 6696 // Compute overflow detection mask = Res<1> <s T1 6697 if (elem_bt == T_INT) { 6698 vpcmpgtd(xtmp3, xtmp3, xtmp2, vlen_enc); 6699 } else { 6700 assert(elem_bt == T_LONG, ""); 6701 vpcmpgtq(xtmp3, xtmp3, xtmp2, vlen_enc); 6702 } 6703 vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc); 6704 } 6705 6706 void C2_MacroAssembler::evpmovq2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 6707 int vlen_enc, bool xtmp2_hold_M1) { 6708 if (VM_Version::supports_avx512dq()) { 6709 evpmovq2m(ktmp, src, vlen_enc); 6710 } else { 6711 assert(VM_Version::supports_evex(), ""); 6712 if (!xtmp2_hold_M1) { 6713 vpternlogq(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc); 6714 } 6715 evpsraq(xtmp1, src, 63, vlen_enc); 6716 evpcmpeqq(ktmp, k0, xtmp1, xtmp2, vlen_enc); 6717 } 6718 } 6719 6720 void C2_MacroAssembler::evpmovd2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 6721 int vlen_enc, bool xtmp2_hold_M1) { 6722 if (VM_Version::supports_avx512dq()) { 6723 evpmovd2m(ktmp, src, vlen_enc); 6724 } else { 6725 assert(VM_Version::supports_evex(), ""); 6726 if (!xtmp2_hold_M1) { 6727 vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc); 6728 } 6729 vpsrad(xtmp1, src, 31, vlen_enc); 6730 Assembler::evpcmpeqd(ktmp, k0, xtmp1, xtmp2, vlen_enc); 6731 } 6732 } 6733 6734 6735 void C2_MacroAssembler::vpsign_extend_dq(BasicType elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) { 6736 if (elem_bt == T_LONG) { 6737 if (VM_Version::supports_evex()) { 6738 evpsraq(dst, src, 63, vlen_enc); 6739 } else { 6740 vpsrad(dst, src, 31, vlen_enc); 6741 vpshufd(dst, dst, 0xF5, vlen_enc); 6742 } 6743 } else { 6744 assert(elem_bt == T_INT, ""); 6745 vpsrad(dst, src, 31, vlen_enc); 6746 } 6747 } 

// Generate the maximum signed value (0x7FFF...) in every int/long lane of dst
// by shifting an all-ones vector right by one. If compute_allones is true,
// the allones register is (re)filled with -1 first; either way it holds -1 on
// return and can be reused by the caller.
void C2_MacroAssembler::vpgenmax_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
  if (compute_allones) {
    if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
      vpternlogd(allones, 0xff, allones, allones, vlen_enc);
    } else {
      vpcmpeqq(allones, allones, allones, vlen_enc);
    }
  }
  if (elem_bt == T_LONG) {
    vpsrlq(dst, allones, 1, vlen_enc);
  } else {
    assert(elem_bt == T_INT, "");
    vpsrld(dst, allones, 1, vlen_enc);
  }
}

// Generate the minimum signed value (0x8000...) in every int/long lane of dst
// by shifting an all-ones vector left by lane_width - 1. Same allones /
// compute_allones contract as vpgenmax_value.
void C2_MacroAssembler::vpgenmin_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
  if (compute_allones) {
    if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
      vpternlogd(allones, 0xff, allones, allones, vlen_enc);
    } else {
      vpcmpeqq(allones, allones, allones, vlen_enc);
    }
  }
  if (elem_bt == T_LONG) {
    vpsllq(dst, allones, 63, vlen_enc);
  } else {
    assert(elem_bt == T_INT, "");
    vpslld(dst, allones, 31, vlen_enc);
  }
}

// Element-type-dispatched unsigned vector compare into a mask register.
void C2_MacroAssembler::evpcmpu(BasicType elem_bt, KRegister kmask, XMMRegister src1, XMMRegister src2,
                                Assembler::ComparisonPredicate cond, int vlen_enc) {
  switch(elem_bt) {
    case T_LONG:  evpcmpuq(kmask, src1, src2, cond, vlen_enc); break;
    case T_INT:   evpcmpud(kmask, src1, src2, cond, vlen_enc); break;
    case T_SHORT: evpcmpuw(kmask, src1, src2, cond, vlen_enc); break;
    case T_BYTE:  evpcmpub(kmask, src1, src2, cond, vlen_enc); break;
    default: fatal("Unsupported type %s", type2name(elem_bt)); break;
  }
}

// Element-type-dispatched signed greater-than compare: dst = src1 >s src2.
void C2_MacroAssembler::vpcmpgt(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
  switch(elem_bt) {
    case T_LONG:  vpcmpgtq(dst, src1, src2, vlen_enc); break;
    case T_INT:   vpcmpgtd(dst, src1, src2, vlen_enc); break;
    case T_SHORT: vpcmpgtw(dst, src1, src2, vlen_enc); break;
    case T_BYTE:  vpcmpgtb(dst, src1, src2, vlen_enc); break;
    default: fatal("Unsupported type %s", type2name(elem_bt)); break;
  }
}

// Move the sign bits of int/long lanes into a mask register, using the
// AVX512DQ instruction when available or the emulation otherwise.
void C2_MacroAssembler::evpmov_vec_to_mask(BasicType elem_bt, KRegister ktmp, XMMRegister src, XMMRegister xtmp1,
                                           XMMRegister xtmp2, int vlen_enc, bool xtmp2_hold_M1) {
  if (elem_bt == T_LONG) {
    evpmovq2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
  } else {
    assert(elem_bt == T_INT, "");
    evpmovd2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
  }
}

// Signed saturating add/subtract for int/long lanes on EVEX targets.
// On overflow a lane saturates to MAX_VALUE or MIN_VALUE depending on the
// polarity of the first input.
void C2_MacroAssembler::vector_addsub_dq_saturating_evex(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                         XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
                                                         KRegister ktmp1, KRegister ktmp2, int vlen_enc) {
  assert(elem_bt == T_INT || elem_bt == T_LONG, "");
  // Addition/Subtraction happens over two's complement representation of numbers and is agnostic to signedness.
  // Overflow detection based on Hacker's Delight section 2-13.
  if (ideal_opc == Op_SaturatingAddV) {
    // res = src1 + src2
    vpadd(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs if result polarity does not comply with equivalent polarity inputs.
    // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
    vpxor(xtmp1, dst, src1, vlen_enc);
    vpxor(xtmp2, dst, src2, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  } else {
    assert(ideal_opc == Op_SaturatingSubV, "");
    // res = src1 - src2
    vpsub(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs when both inputs have opposite polarity and
    // result polarity does not comply with first input polarity.
    // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
    vpxor(xtmp1, src1, src2, vlen_enc);
    vpxor(xtmp2, dst, src1, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  }

  // Compute overflow detection mask.
  evpmov_vec_to_mask(elem_bt, ktmp1, xtmp2, xtmp2, xtmp1, vlen_enc);
  // Note: xtmp1 hold -1 in all its lanes after above call.

  // Compute mask based on first input polarity.
  evpmov_vec_to_mask(elem_bt, ktmp2, src1, xtmp2, xtmp1, vlen_enc, true);

  vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);

  // Compose a vector of saturating (MAX/MIN) values, where lanes corresponding to
  // set bits in first input polarity mask holds a min value.
  evpblend(elem_bt, xtmp2, ktmp2, xtmp2, xtmp1, true, vlen_enc);
  // Blend destination lanes with saturated values using overflow detection mask.
  evpblend(elem_bt, dst, ktmp1, dst, xtmp2, true, vlen_enc);
}


// Signed saturating add/subtract for int/long lanes on AVX2 targets
// (vector-mask variant of vector_addsub_dq_saturating_evex).
void C2_MacroAssembler::vector_addsub_dq_saturating_avx(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                        XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
                                                        XMMRegister xtmp3, XMMRegister xtmp4, int vlen_enc) {
  assert(elem_bt == T_INT || elem_bt == T_LONG, "");
  // Addition/Subtraction happens over two's complement representation of numbers and is agnostic to signedness.
  // Overflow detection based on Hacker's Delight section 2-13.
  if (ideal_opc == Op_SaturatingAddV) {
    // res = src1 + src2
    vpadd(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs if result polarity does not comply with equivalent polarity inputs.
    // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
    vpxor(xtmp1, dst, src1, vlen_enc);
    vpxor(xtmp2, dst, src2, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  } else {
    assert(ideal_opc == Op_SaturatingSubV, "");
    // res = src1 - src2
    vpsub(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs when both inputs have opposite polarity and
    // result polarity does not comply with first input polarity.
    // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
    vpxor(xtmp1, src1, src2, vlen_enc);
    vpxor(xtmp2, dst, src1, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  }

  // Sign-extend to compute overflow detection mask.
  vpsign_extend_dq(elem_bt, xtmp3, xtmp2, vlen_enc);

  vpcmpeqd(xtmp1, xtmp1, xtmp1, vlen_enc);
  vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc);
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);

  // Compose saturating min/max vector using first input polarity mask.
  vpsign_extend_dq(elem_bt, xtmp4, src1, vlen_enc);
  vpblendvb(xtmp1, xtmp2, xtmp1, xtmp4, vlen_enc);

  // Blend result with saturating vector using overflow detection mask.
  vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
}

// Signed saturating add/subtract for byte/short lanes (memory operand).
void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
  switch(elem_bt) {
    case T_BYTE:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddsb(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubsb(dst, src1, src2, vlen_enc);
      }
      break;
    case T_SHORT:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddsw(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubsw(dst, src1, src2, vlen_enc);
      }
      break;
    default:
      fatal("Unsupported type %s", type2name(elem_bt));
      break;
  }
}

// Unsigned saturating add/subtract for byte/short lanes (memory operand).
void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
  switch(elem_bt) {
    case T_BYTE:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddusb(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubusb(dst, src1, src2, vlen_enc);
      }
      break;
    case T_SHORT:
      if (ideal_opc == Op_SaturatingAddV) {
        vpaddusw(dst, src1, src2, vlen_enc);
      } else {
        assert(ideal_opc == Op_SaturatingSubV, "");
        vpsubusw(dst, src1, src2, vlen_enc);
      }
      break;
    default:
      fatal("Unsupported type %s", type2name(elem_bt));
      break;
  }
}

// Two-table permute (VPERMI2*) dispatch: dst (holding indices) selects
// elements from the concatenation of src1 and src2.
void C2_MacroAssembler::select_from_two_vectors_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                     XMMRegister src2, int vlen_enc) {
  switch(elem_bt) {
    case T_BYTE:
      evpermi2b(dst, src1, src2, vlen_enc);
      break;
    case T_SHORT:
      evpermi2w(dst, src1, src2, vlen_enc);
      break;
    case T_INT:
      evpermi2d(dst, src1, src2, vlen_enc);
      break;
    case T_LONG:
      evpermi2q(dst, src1, src2, vlen_enc);
      break;
    case T_FLOAT:
      evpermi2ps(dst, src1, src2, vlen_enc);
      break;
    case T_DOUBLE:
      evpermi2pd(dst, src1, src2, vlen_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(elem_bt));
      break;
  }
}

// Signedness dispatch for saturating ops (register operand).
void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, bool is_unsigned, int vlen_enc) {
  if (is_unsigned) {
    vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
  } else {
    vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
  }
}

// Signedness dispatch for saturating ops (memory operand).
void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, bool is_unsigned, int vlen_enc) {
  if (is_unsigned) {
    vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
  } else {
    vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
  }
}

// Vector half-precision (FP16) arithmetic, dispatched on the ideal opcode
// (register operand).
void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
  switch(opcode) {
    case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
    case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
    case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
    case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
    default: assert(false, "%s", NodeClassNames[opcode]); break;
  }
}

// Vector half-precision (FP16) arithmetic, dispatched on the ideal opcode
// (memory operand).
void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
  switch(opcode) {
    case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
    case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
    case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
    case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
    default: assert(false, "%s", NodeClassNames[opcode]); break;
  }
}

// Scalar FP16 max/min: implemented as the 128-bit vector variant.
void C2_MacroAssembler::scalar_max_min_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                            KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2) {
  vector_max_min_fp16(opcode, dst, src1, src2, ktmp, xtmp1, xtmp2, Assembler::AVX_128bit);
}

// Vector FP16 max/min with Java semantics for signed zeros and NaNs:
// the operands are swapped per sign so that the instruction's "second operand
// wins on equal-0.0 / NaN" behavior produces the Java-specified result, then
// NaN lanes of the first (post-swap) operand are patched back in.
void C2_MacroAssembler::vector_max_min_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                            KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  if (opcode == Op_MaxVHF || opcode == Op_MaxHF) {
    // Move sign bits of src2 to mask register.
    evpmovw2m(ktmp, src2, vlen_enc);
    // xtmp1 = src2 < 0 ? src2 : src1
    evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
    // xtmp2 = src2 < 0 ? src1 : src2
    evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
    // Idea behind above swapping is to make second source operand a +ve value.
    // As per instruction semantic, if the values being compared are both 0.0s (of either sign), the value in
    // the second source operand is returned. If only one value is a NaN (SNaN or QNaN) for this instruction,
    // the second source operand, either a NaN or a valid floating-point value, is returned
    // dst = max(xtmp1, xtmp2)
    evmaxph(dst, xtmp1, xtmp2, vlen_enc);
    // isNaN = is_unordered_quiet(xtmp1)
    evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
    // Final result is same as first source if it's a NaN value,
    // in case second operand holds a NaN value then as per above semantics
    // result is same as second operand.
    Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
  } else {
    assert(opcode == Op_MinVHF || opcode == Op_MinHF, "");
    // Move sign bits of src1 to mask register.
    evpmovw2m(ktmp, src1, vlen_enc);
    // xtmp1 = src1 < 0 ? src2 : src1
    evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
    // xtmp2 = src1 < 0 ? src1 : src2
    evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
    // Idea behind above swapping is to make second source operand a -ve value.
    // As per instruction semantics, if the values being compared are both 0.0s (of either sign), the value in
    // the second source operand is returned.
    // If only one value is a NaN (SNaN or QNaN) for this instruction, the second source operand, either a NaN
    // or a valid floating-point value, is written to the result.
    // dst = min(xtmp1, xtmp2)
    evminph(dst, xtmp1, xtmp2, vlen_enc);
    // isNaN = is_unordered_quiet(xtmp1)
    evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
    // Final result is same as first source if it's a NaN value,
    // in case second operand holds a NaN value then as per above semantics
    // result is same as second operand.
    Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
  }
}