/*
 * Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "oops/methodData.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/output.hpp"
#include "opto/opcodes.hpp"
#include "opto/subnode.hpp"
#include "runtime/globals.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/checkedCast.hpp"
#include "utilities/globalDefinitions.hpp"
#include "utilities/powerOfTwo.hpp"
#include "utilities/sizes.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

// C2 compiled method's prolog code.
void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {
  assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");

  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  // Remove word for return addr
  framesize -= wordSize;
  stack_bang_size -= wordSize;

  // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers must bang for them. But be careful, because
  // some VM calls (such as call site linkage) can use several kilobytes of
  // stack. But the stack safety zone should account for that.
  // See bugs 4446381, 4468289, 4497237.
  if (stack_bang_size > 0) {
    generate_stack_overflow_check(stack_bang_size);

    // We always push rbp so that on return to the interpreter rbp will be
    // restored correctly and we can correct the stack.
    push(rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      mov(rbp, rsp);
    }
    // Remove word for ebp
    framesize -= wordSize;

    // Create frame
    if (framesize) {
      subptr(rsp, framesize);
    }
  } else {
    subptr(rsp, framesize);

    // Save RBP register now.
    framesize -= wordSize;
    movptr(Address(rsp, framesize), rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      movptr(rbp, rsp);
      if (framesize > 0) {
        addptr(rbp, framesize);
      }
    }
  }

  if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
    framesize -= wordSize;
    movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
  }

#ifdef ASSERT
  if (VerifyStackAtCalls) {
    Label L;
    push(rax);
    mov(rax, rsp);
    andptr(rax, StackAlignmentInBytes-1);
    cmpptr(rax, StackAlignmentInBytes-wordSize);
    pop(rax);
    jcc(Assembler::equal, L);
    STOP("Stack is not properly aligned!");
    bind(L);
  }
#endif

  if (!is_stub) {
    BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
    // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
    Label dummy_slow_path;
    Label dummy_continuation;
    Label* slow_path = &dummy_slow_path;
    Label* continuation = &dummy_continuation;
    if (!Compile::current()->output()->in_scratch_emit_size()) {
      // Use real labels from actual stub when not emitting code for the purpose of measuring its size
      C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
      Compile::current()->output()->add_stub(stub);
      slow_path = &stub->entry();
      continuation = &stub->continuation();
    }
    bs->nmethod_entry_barrier(this, slow_path, continuation);
  }
}

inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
  switch (vlen_in_bytes) {
    case  4: // fall-through
    case  8: // fall-through
    case 16: return Assembler::AVX_128bit;
    case 32: return Assembler::AVX_256bit;
    case 64: return Assembler::AVX_512bit;

    default: {
      ShouldNotReachHere();
      return Assembler::AVX_NoVec;
    }
  }
}

// fast_lock and fast_unlock used by C2

// Because the transitions from emitted code to the runtime
// monitorenter/exit helper stubs are so slow it's critical that
// we inline both the stack-locking fast path and the inflated fast path.
//
// See also: cmpFastLock and cmpFastUnlock.
//
// What follows is a specialized inline transliteration of the code
// in enter() and exit(). If we're concerned about I$ bloat another
// option would be to emit TrySlowEnter and TrySlowExit methods
// at startup-time. These methods would accept arguments as
// (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
// indications in the icc.ZFlag. fast_lock and fast_unlock would simply
// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
// In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
//
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods. That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads.
// We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) add explicit barriers or fence operations.
//
// TODO:
//
// *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
//    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
//    Given TLAB allocation, Self is usually manifested in a register, so passing it into
//    the lock operators would typically be faster than reifying Self.
//
// *  Ideally I'd define the primitives as:
//       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
//       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
//    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with the rather awkward and brittle register assignments below.
//    Furthermore the register assignments are overconstrained, possibly resulting in
//    sub-optimal code near the synchronization site.
//
// *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
//    Alternately, use a better sp-proximity test.
//
// *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
//    Either one is sufficient to uniquely identify a thread.
//    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
//
// *  Intrinsify notify() and notifyAll() for the common cases where the
//    object is locked by the calling thread but the waitlist is empty.
//    Avoid the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
//
// *  Use jccb and jmpb instead of jcc and jmp to improve code density.
//    But beware of excessive branch density on AMD Opterons.
//
// *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
//    or failure of the fast path. If the fast path fails then we pass
//    control to the slow path, typically in C. In fast_lock and
//    fast_unlock we often branch to DONE_LABEL, just to find that C2
//    will emit a conditional branch immediately after the node.
//    So we have branches to branches and lots of ICC.ZF games.
//    Instead, it might be better to have C2 pass a "FailureLabel"
//    into fast_lock and fast_unlock. In the case of success, control
//    will drop through the node. ICC.ZF is undefined at exit.
//    In the case of failure, the node will branch directly to the
//    FailureLabel.


// obj: object to lock
// box: on-stack box address -- KILLED
// rax: tmp -- KILLED
// t  : tmp -- KILLED
void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register rax_reg,
                                              Register t, Register thread) {
  assert(rax_reg == rax, "Used for CAS");
  assert_different_registers(obj, box, rax_reg, t, thread);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. ZF value is irrelevant.
  Label locked;
  // Finish fast lock unsuccessfully. MUST jump with ZF == 0
  Label slow_path;

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds or we need to take the slow-path.
    movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0);
  }

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(rax_reg, obj, t);
    testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
    jcc(Assembler::notZero, slow_path);
  }

  const Register mark = t;

  { // Lightweight Lock

    Label push;

    const Register top = UseObjectMonitorTable ? rax_reg : box;

    // Load the mark.
    movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Prefetch top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    // Check for monitor (0b10).
    testptr(mark, markWord::monitor_value);
    jcc(Assembler::notZero, inflated);

    // Check if lock-stack is full.
    cmpl(top, LockStack::end_offset() - 1);
    jcc(Assembler::greater, slow_path);

    // Check if recursive.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    jccb(Assembler::equal, push);

    // Try to lock. Transition lock bits 0b01 => 0b00
    movptr(rax_reg, mark);
    orptr(rax_reg, markWord::unlocked_value);
    andptr(mark, ~(int32_t)markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    jcc(Assembler::notEqual, slow_path);

    if (UseObjectMonitorTable) {
      // Need to reload top, clobbered by CAS.
      movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
    }
    bind(push);
    // After successful lock, push object on lock-stack.
    movptr(Address(thread, top), obj);
    addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
    jmpb(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    const Register monitor = t;

    if (!UseObjectMonitorTable) {
      assert(mark == monitor, "should be the same here");
    } else {
      // Uses ObjectMonitorTable. Look for the monitor in the om_cache.
      // Fetch ObjectMonitor* from the cache or take the slow-path.
      Label monitor_found;

      // Load cache address
      lea(t, Address(thread, JavaThread::om_cache_oops_offset()));

      const int num_unrolled = 2;
      for (int i = 0; i < num_unrolled; i++) {
        cmpptr(obj, Address(t));
        jccb(Assembler::equal, monitor_found);
        increment(t, in_bytes(OMCache::oop_to_oop_difference()));
      }

      Label loop;

      // Search for obj in cache.
      bind(loop);

      // Check for match.
      cmpptr(obj, Address(t));
      jccb(Assembler::equal, monitor_found);

      // Search until null encountered, guaranteed _null_sentinel at end.
      cmpptr(Address(t), 1);
      jcc(Assembler::below, slow_path); // 0 check, but with ZF=0 when *t == 0
      increment(t, in_bytes(OMCache::oop_to_oop_difference()));
      jmpb(loop);

      // Cache hit.
      bind(monitor_found);
      movptr(monitor, Address(t, OMCache::oop_to_monitor_difference()));
    }
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag);
    const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag);

    Label monitor_locked;
    // Lock the monitor.

    if (UseObjectMonitorTable) {
      // Cache the monitor for unlock before trashing box. On failure to acquire
      // the lock, the slow path will reset the entry accordingly (see CacheSetter).
      movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor);
    }

    // Try to CAS owner (no owner => current thread's _monitor_owner_id).
    xorptr(rax_reg, rax_reg);
    movptr(box, Address(thread, JavaThread::monitor_owner_id_offset()));
    lock(); cmpxchgptr(box, owner_address);
    jccb(Assembler::equal, monitor_locked);

    // Check if recursive.
    cmpptr(box, rax_reg);
    jccb(Assembler::notEqual, slow_path);

    // Recursive.
    increment(recursions_address);

    bind(monitor_locked);
  }

  bind(locked);
  // Set ZF = 1
  xorl(rax_reg, rax_reg);

#ifdef ASSERT
  // Check that locked label is reached with ZF set.
  Label zf_correct;
  Label zf_bad_zero;
  jcc(Assembler::zero, zf_correct);
  jmp(zf_bad_zero);
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with ZF not set.
  jcc(Assembler::notZero, zf_correct);
  stop("Fast Lock ZF != 0");
  bind(zf_bad_zero);
  stop("Fast Lock ZF != 1");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}

// obj: object to unlock
// rax: tmp -- KILLED
// t  : tmp - cannot be obj nor rax -- KILLED
//
// Some commentary on balanced locking:
//
// fast_lock and fast_unlock are emitted only for provably balanced lock sites.
// Methods that don't have provably balanced locking are forced to run in the
// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
// The interpreter provides two properties:
// I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired in the current activation (frame). Recall that the
//      interpreter maintains an on-stack list of locks currently held by
//      a frame.
// I2:  If a method attempts to unlock an object that is not held by the
//      frame the interpreter throws IMSX.
//
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
// B() doesn't have provably balanced locking so it runs in the interpreter.
// Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
// is still locked by A().
//
// The only other source of unbalanced locking would be JNI. The "Java Native Interface
// Specification" states that an object locked by JNI's MonitorEnter should not be
// unlocked by "normal" java-level locking and vice-versa. The specification doesn't
// specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably, given that the spec legislates the JNI case as undefined, our implementation
// could reasonably *avoid* checking owner in fast_unlock().
// In the interest of performance we elide the m->Owner==Self check in unlock.
// A perfectly viable alternative is to elide the owner check except when
// Xcheck:jni is enabled.

void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register reg_rax, Register t, Register thread) {
  assert(reg_rax == rax, "Used for CAS");
  assert_different_registers(obj, reg_rax, t);

  // Handle inflated monitor.
  Label inflated, inflated_check_lock_stack;
  // Finish fast unlock successfully. MUST jump with ZF == 1
  Label unlocked, slow_path;

  const Register mark = t;
  const Register monitor = t;
  const Register top = UseObjectMonitorTable ?
                                               t : reg_rax;
  const Register box = reg_rax;

  Label dummy;
  C2FastUnlockLightweightStub* stub = nullptr;

  if (!Compile::current()->output()->in_scratch_emit_size()) {
    stub = new (Compile::current()->comp_arena()) C2FastUnlockLightweightStub(obj, mark, reg_rax, thread);
    Compile::current()->output()->add_stub(stub);
  }

  Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();

  { // Lightweight Unlock

    // Load top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    if (!UseObjectMonitorTable) {
      // Prefetch mark.
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }

    // Check if obj is top of lock-stack.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    // Top of lock stack was not obj. Must be monitor.
    jcc(Assembler::notEqual, inflated_check_lock_stack);

    // Pop lock-stack.
    DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
    subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);

    // Check if recursive.
    cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
    jcc(Assembler::equal, unlocked);

    // We elide the monitor check, let the CAS fail instead.

    if (UseObjectMonitorTable) {
      // Load mark.
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }

    // Try to unlock. Transition lock bits 0b00 => 0b01
    movptr(reg_rax, mark);
    andptr(reg_rax, ~(int32_t)markWord::lock_mask);
    orptr(mark, markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    jcc(Assembler::notEqual, push_and_slow_path);
    jmp(unlocked);
  }


  { // Handle inflated monitor.
    bind(inflated_check_lock_stack);
#ifdef ASSERT
    Label check_done;
    subl(top, oopSize);
    cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
    jcc(Assembler::below, check_done);
    cmpptr(obj, Address(thread, top));
    jccb(Assembler::notEqual, inflated_check_lock_stack);
    stop("Fast Unlock lock on stack");
    bind(check_done);
    if (UseObjectMonitorTable) {
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }
    testptr(mark, markWord::monitor_value);
    jccb(Assembler::notZero, inflated);
    stop("Fast Unlock not monitor");
#endif

    bind(inflated);

    if (!UseObjectMonitorTable) {
      assert(mark == monitor, "should be the same here");
    } else {
      // Uses ObjectMonitorTable. Look for the monitor in our BasicLock on the stack.
      movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
      // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*)
      cmpptr(monitor, alignof(ObjectMonitor*));
      jcc(Assembler::below, slow_path);
    }
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag};
    const Address succ_address{monitor, ObjectMonitor::succ_offset() - monitor_tag};
    const Address entry_list_address{monitor, ObjectMonitor::entry_list_offset() - monitor_tag};
    const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag};

    Label recursive;

    // Check if recursive.
    cmpptr(recursions_address, 0);
    jccb(Assembler::notZero, recursive);

    // Set owner to null.
    // Release to satisfy the JMM
    movptr(owner_address, NULL_WORD);
    // We need a full fence after clearing owner to avoid stranding.
    // StoreLoad achieves this.
    membar(StoreLoad);

    // Check if the entry_list is empty.
    cmpptr(entry_list_address, NULL_WORD);
    jccb(Assembler::zero, unlocked);    // If so we are done.

    // Check if there is a successor.
    cmpptr(succ_address, NULL_WORD);
    jccb(Assembler::notZero, unlocked); // If so we are done.

    // Save the monitor pointer in the current thread, so we can try to
    // reacquire the lock in SharedRuntime::monitor_exit_helper().
    if (!UseObjectMonitorTable) {
      andptr(monitor, ~(int32_t)markWord::monitor_value);
    }
    movptr(Address(thread, JavaThread::unlocked_inflated_monitor_offset()), monitor);

    orl(t, 1); // Fast Unlock ZF = 0
    jmpb(slow_path);

    // Recursive unlock.
    bind(recursive);
    decrement(recursions_address);
  }

  bind(unlocked);
  xorl(t, t); // Fast Unlock ZF = 1

#ifdef ASSERT
  // Check that unlocked label is reached with ZF set.
  Label zf_correct;
  Label zf_bad_zero;
  jcc(Assembler::zero, zf_correct);
  jmp(zf_bad_zero);
#endif

  bind(slow_path);
  if (stub != nullptr) {
    bind(stub->slow_path_continuation());
  }
#ifdef ASSERT
  // Check that stub->continuation() label is reached with ZF not set.
  jcc(Assembler::notZero, zf_correct);
  stop("Fast Unlock ZF != 0");
  bind(zf_bad_zero);
  stop("Fast Unlock ZF != 1");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}

static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) {
  fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi);
}

static void reconstruct_frame_pointer_helper(MacroAssembler* masm, Register dst) {
  const int framesize = Compile::current()->output()->frame_size_in_bytes();
  masm->movptr(dst, rsp);
  if (framesize > 2 * wordSize) {
    masm->addptr(dst, framesize - 2 * wordSize);
  }
}

void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) {
  if (PreserveFramePointer) {
    // frame pointer is valid
#ifdef ASSERT
    // Verify frame pointer value in rbp.
    reconstruct_frame_pointer_helper(this, rtmp);
    Label L_success;
    cmpq(rbp, rtmp);
    jccb(Assembler::equal, L_success);
    STOP("frame pointer mismatch");
    bind(L_success);
#endif // ASSERT
  } else {
    reconstruct_frame_pointer_helper(this, rbp);
  }
}

void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register val) {
  jint lo = t->_lo;
  jint hi = t->_hi;
  assert(lo < hi, "type should not be empty or constant, idx: %u, lo: %d, hi: %d", idx, lo, hi);
  if (t == TypeInt::INT) {
    return;
  }

  BLOCK_COMMENT("CastII {");
  Label fail;
  Label succeed;
  if (hi == max_jint) {
    cmpl(val, lo);
    jccb(Assembler::greaterEqual, succeed);
  } else {
    if (lo != min_jint) {
      cmpl(val, lo);
      jccb(Assembler::less, fail);
    }
    cmpl(val, hi);
    jccb(Assembler::lessEqual, succeed);
  }

  bind(fail);
  movl(c_rarg0, idx);
  movl(c_rarg1, val);
  movl(c_rarg2, lo);
  movl(c_rarg3, hi);
  reconstruct_frame_pointer(rscratch1);
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_int_in_range)));
  hlt();
  bind(succeed);
  BLOCK_COMMENT("} // CastII");
}

static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) {
  fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi);
}

void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register val, Register tmp) {
  jlong lo = t->_lo;
  jlong hi = t->_hi;
  assert(lo < hi, "type should not be empty or constant, idx: %u, lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, lo, hi);
  if (t == TypeLong::LONG) {
    return;
  }

  BLOCK_COMMENT("CastLL {");
  Label fail;
  Label succeed;

  auto cmp_val = [&](jlong bound) {
    if (is_simm32(bound)) {
      cmpq(val, checked_cast<int>(bound));
    } else {
      mov64(tmp, bound);
      cmpq(val, tmp);
    }
  };

  if (hi == max_jlong) {
    cmp_val(lo);
    jccb(Assembler::greaterEqual, succeed);
  } else {
    if (lo != min_jlong) {
      cmp_val(lo);
      jccb(Assembler::less, fail);
    }
    cmp_val(hi);
    jccb(Assembler::lessEqual, succeed);
  }

  bind(fail);
  movl(c_rarg0, idx);
  movq(c_rarg1, val);
  mov64(c_rarg2, lo);
  mov64(c_rarg3, hi);
  reconstruct_frame_pointer(rscratch1);
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_long_in_range)));
  hlt();
  bind(succeed);
  BLOCK_COMMENT("} // CastLL");
}

//-------------------------------------------------------------------------------------------
// Generic instructions support for use in .ad files C2 code generation

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVD) {
    andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
  } else {
    assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
  }
}

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
  if (opcode == Op_AbsVD) {
    vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
  } else {
    assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
    vxorpd(dst, src,
           ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVF) {
    andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
  } else {
    assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
  if (opcode == Op_AbsVF) {
    vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
  } else {
    assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
    vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
  }
}

void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
  assert(tmp == xnoreg || elem_bt == T_LONG, "unused");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      pminsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pminsw(dst, src);
    } else if (elem_bt == T_INT) {
      pminsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, dst);
      pcmpgtq(xmm0, src);
      blendvpd(dst, src); // xmm0 as mask
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      pmaxsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pmaxsw(dst, src);
    } else if (elem_bt == T_INT) {
      pmaxsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, src);
      pcmpgtq(xmm0, dst);
      blendvpd(dst, src); // xmm0 as mask
    }
  }
}

void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
                                  XMMRegister src1, Address src2, int vlen_enc) {
  assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
  if (opcode == Op_UMinV) {
    switch(elem_bt) {
      case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
      case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
      case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
      case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
      default: fatal("Unsupported type %s", type2name(elem_bt)); break;
    }
  } else {
    assert(opcode == Op_UMaxV, "required");
    switch(elem_bt) {
      case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
      case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
      case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
      case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
      default: fatal("Unsupported type %s", type2name(elem_bt)); break;
    }
  }
}

void C2_MacroAssembler::vpuminmaxq(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  // For optimality, leverage a full vector width of 512 bits
  // for operations over smaller vector sizes on AVX512 targets.
  if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
    if (opcode == Op_UMaxV) {
      evpmaxuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
    } else {
      assert(opcode == Op_UMinV, "required");
      evpminuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
    }
  } else {
    // T1 = -1
    vpcmpeqq(xtmp1, xtmp1, xtmp1, vlen_enc);
    // T1 = -1 << 63
    vpsllq(xtmp1, xtmp1, 63, vlen_enc);
    // Convert SRC2 to signed value i.e. T2 = T1 + SRC2
    vpaddq(xtmp2, xtmp1, src2, vlen_enc);
    // Convert SRC1 to signed value i.e. T1 = T1 + SRC1
    vpaddq(xtmp1, xtmp1, src1, vlen_enc);
    // Mask = T2 > T1
    vpcmpgtq(xtmp1, xtmp2, xtmp1, vlen_enc);
    if (opcode == Op_UMaxV) {
      // Res = Mask ? Src2 : Src1
      vpblendvb(dst, src1, src2, xtmp1, vlen_enc);
    } else {
      // Res = Mask ? Src1 : Src2
      vpblendvb(dst, src2, src1, xtmp1, vlen_enc);
    }
  }
}

void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
                                  XMMRegister src1, XMMRegister src2, int vlen_enc) {
  assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
  if (opcode == Op_UMinV) {
    switch(elem_bt) {
      case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
      case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
      case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
      case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
      default: fatal("Unsupported type %s", type2name(elem_bt)); break;
    }
  } else {
    assert(opcode == Op_UMaxV, "required");
    switch(elem_bt) {
      case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
      case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
      case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
      case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
      default: fatal("Unsupported type %s", type2name(elem_bt)); break;
    }
  }
}

void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
                                 XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                 int vlen_enc) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      vpminsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpminsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpminsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpminsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src1, src2, dst, vlen_enc);
      }
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      vpmaxsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpmaxsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpmaxsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpmaxsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src2, src1, dst, vlen_enc);
      }
    }
  }
}

// Float/Double min max

void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
                                   XMMRegister dst, XMMRegister a, XMMRegister b,
                                   XMMRegister tmp,
                                   XMMRegister atmp, XMMRegister btmp,
                                   int vlen_enc) {
  assert(UseAVX > 0, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(a, tmp, atmp, btmp);
  assert_different_registers(b, tmp, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);

  /* Note on 'non-obvious' assembly sequence:
   *
   * While there are vminps/vmaxps instructions, there are two important differences between hardware
   * and Java on how they handle floats:
   *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
   *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
   *
   * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
   *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
   *                (only useful when signs differ, noop otherwise)
   *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
   *
   * The following pseudo code describes the algorithm for max[FD] (the min algorithm is on similar lines):
   *   btmp = (b < +0.0) ? a : b
   *   atmp = (b < +0.0) ? b : a
   *   Tmp  = Max_Float(atmp, btmp)
   *   Res  = (atmp == NaN) ? atmp : Tmp
   */

  void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
  void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
  void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
  XMMRegister mask;

  if (!is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vminps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (!is_double_word && !is_min) {
    mask = b;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vmaxps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vminpd;
    vcmp = &MacroAssembler::vcmppd;
  } else {
    assert(is_double_word && !is_min, "sanity");
    mask = b;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vmaxpd;
    vcmp = &MacroAssembler::vcmppd;
  }

  // Make sure EnableX86ECoreOpts isn't disabled on register overlaps
  XMMRegister maxmin, scratch;
  if (dst == btmp) {
    maxmin = btmp;
    scratch = tmp;
  } else {
    maxmin = tmp;
    scratch = btmp;
  }

  bool precompute_mask = EnableX86ECoreOpts && UseAVX > 1;
  if (precompute_mask && !is_double_word) {
    vpsrad(tmp, mask, 32, vlen_enc);
    mask = tmp;
  } else if (precompute_mask && is_double_word) {
    vpxor(tmp, tmp, tmp, vlen_enc);
    vpcmpgtq(tmp, tmp, mask, vlen_enc);
    mask = tmp;
  }

  (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
  (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
  (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
  (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
  (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
}

void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
                                    XMMRegister dst, XMMRegister a,
                                    XMMRegister b,
                                    KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
                                    int vlen_enc) {
  assert(UseAVX > 2, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(dst, a, atmp, btmp);
  assert_different_registers(dst, b, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);
  bool merge = true;

  if (!is_double_word && is_min) {
    evpmovd2m(ktmp, a, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vminps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (!is_double_word && !is_min) {
    evpmovd2m(ktmp, b, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (is_double_word && is_min) {
    evpmovq2m(ktmp, a, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vminpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    evpmovq2m(ktmp, b, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  }
}

void C2_MacroAssembler::vminmax_fp(int opc, BasicType elem_bt, XMMRegister dst, KRegister mask,
                                   XMMRegister src1, XMMRegister src2, int vlen_enc) {
  assert(opc == Op_MinV || opc == Op_MinReductionV ||
         opc == Op_MaxV || opc == Op_MaxReductionV, "sanity");

  int imm8 = (opc == Op_MinV || opc == Op_MinReductionV) ?
                 AVX10_MINMAX_MIN_COMPARE_SIGN
               : AVX10_MINMAX_MAX_COMPARE_SIGN;
  if (elem_bt == T_FLOAT) {
    evminmaxps(dst, mask, src1, src2, true, imm8, vlen_enc);
  } else {
    assert(elem_bt == T_DOUBLE, "");
    evminmaxpd(dst, mask, src1, src2, true, imm8, vlen_enc);
  }
}

// Float/Double signum
void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
  assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");

  Label DONE_LABEL;

  if (opcode == Op_SignumF) {
    ucomiss(dst, zero);
    jcc(Assembler::equal, DONE_LABEL);  // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
    jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN
    movflt(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
  } else if (opcode == Op_SignumD) {
    ucomisd(dst, zero);
    jcc(Assembler::equal, DONE_LABEL);  // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
    jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN
    movdbl(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
  }

  bind(DONE_LABEL);
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
  if (sign) {
    pmovsxbw(dst, src);
  } else {
    pmovzxbw(dst, src);
  }
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbw(dst, src, vector_len);
  } else {
    vpmovzxbw(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbd(dst, src, vector_len);
  } else {
    vpmovzxbd(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxwd(dst, src, vector_len);
  } else {
    vpmovzxwd(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
                                     int shift, int vector_len) {
  if (opcode == Op_RotateLeftV) {
    if (etype == T_INT) {
      evprold(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprolq(dst, src, shift, vector_len);
    }
  } else {
    assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
    if (etype == T_INT) {
      evprord(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprorq(dst, src, shift, vector_len);
    }
  }
}

void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
                                     XMMRegister shift, int vector_len) {
  if (opcode == Op_RotateLeftV) {
    if (etype == T_INT) {
      evprolvd(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprolvq(dst, src, shift, vector_len);
    }
  } else {
    assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
    if (etype == T_INT) {
      evprorvd(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
T_LONG"); 1138 evprorvq(dst, src, shift, vector_len); 1139 } 1140 } 1141 } 1142 1143 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) { 1144 if (opcode == Op_RShiftVI) { 1145 psrad(dst, shift); 1146 } else if (opcode == Op_LShiftVI) { 1147 pslld(dst, shift); 1148 } else { 1149 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI"); 1150 psrld(dst, shift); 1151 } 1152 } 1153 1154 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) { 1155 switch (opcode) { 1156 case Op_RShiftVI: psrad(dst, shift); break; 1157 case Op_LShiftVI: pslld(dst, shift); break; 1158 case Op_URShiftVI: psrld(dst, shift); break; 1159 1160 default: assert(false, "%s", NodeClassNames[opcode]); 1161 } 1162 } 1163 1164 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 1165 if (opcode == Op_RShiftVI) { 1166 vpsrad(dst, nds, shift, vector_len); 1167 } else if (opcode == Op_LShiftVI) { 1168 vpslld(dst, nds, shift, vector_len); 1169 } else { 1170 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI"); 1171 vpsrld(dst, nds, shift, vector_len); 1172 } 1173 } 1174 1175 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1176 switch (opcode) { 1177 case Op_RShiftVI: vpsrad(dst, src, shift, vlen_enc); break; 1178 case Op_LShiftVI: vpslld(dst, src, shift, vlen_enc); break; 1179 case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break; 1180 1181 default: assert(false, "%s", NodeClassNames[opcode]); 1182 } 1183 } 1184 1185 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) { 1186 switch (opcode) { 1187 case Op_RShiftVB: // fall-through 1188 case Op_RShiftVS: psraw(dst, shift); break; 1189 1190 case Op_LShiftVB: // fall-through 1191 case Op_LShiftVS: psllw(dst, shift); break; 1192 1193 case Op_URShiftVS: // fall-through 1194 case Op_URShiftVB: psrlw(dst, shift); break; 1195 1196 default: assert(false, "%s", NodeClassNames[opcode]); 1197 } 1198 } 1199 1200 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1201 switch (opcode) { 1202 case Op_RShiftVB: // fall-through 1203 case Op_RShiftVS: vpsraw(dst, src, shift, vlen_enc); break; 1204 1205 case Op_LShiftVB: // fall-through 1206 case Op_LShiftVS: vpsllw(dst, src, shift, vlen_enc); break; 1207 1208 case Op_URShiftVS: // fall-through 1209 case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break; 1210 1211 default: assert(false, "%s", NodeClassNames[opcode]); 1212 } 1213 } 1214 1215 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) { 1216 switch (opcode) { 1217 case Op_RShiftVL: psrlq(dst, shift); break; // using srl to implement sra on pre-avs512 systems 1218 case Op_LShiftVL: psllq(dst, shift); break; 1219 case Op_URShiftVL: psrlq(dst, shift); break; 1220 1221 default: assert(false, "%s", NodeClassNames[opcode]); 1222 } 1223 } 1224 1225 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) { 1226 if (opcode == Op_RShiftVL) { 1227 psrlq(dst, shift); // using srl to implement sra on pre-avs512 systems 1228 } else if (opcode == Op_LShiftVL) { 1229 psllq(dst, shift); 1230 } else { 1231 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL"); 1232 psrlq(dst, shift); 1233 } 1234 } 1235 1236 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1237 switch (opcode) { 1238 case 
    case Op_RShiftVL:  evpsraq(dst, src, shift, vlen_enc); break;
    case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
    case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
  if (opcode == Op_RShiftVL) {
    evpsraq(dst, nds, shift, vector_len);
  } else if (opcode == Op_LShiftVL) {
    vpsllq(dst, nds, shift, vector_len);
  } else {
    assert((opcode == Op_URShiftVL), "opcode should be Op_URShiftVL");
    vpsrlq(dst, nds, shift, vector_len);
  }
}

void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  // fall-through
    case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  // fall-through
    case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;

    case Op_URShiftVB: // fall-through
    case Op_URShiftVS: // fall-through
    case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;

    case Op_URShiftVB: // fall-through
    case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
  assert(UseAVX >= 2, "required");
  switch (opcode) {
    case Op_RShiftVL: {
      if (UseAVX > 2) {
        assert(tmp == xnoreg, "not used");
        if (!VM_Version::supports_avx512vl()) {
          vlen_enc = Assembler::AVX_512bit;
        }
        evpsravq(dst, src, shift, vlen_enc);
      } else {
        vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
        vpsrlvq(dst, src, shift, vlen_enc);
        vpsrlvq(tmp, tmp, shift, vlen_enc);
        vpxor(dst, dst, tmp, vlen_enc);
        vpsubq(dst, dst, tmp, vlen_enc);
      }
      break;
    }
    case Op_LShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsllvq(dst, src, shift, vlen_enc);
      break;
    }
    case Op_URShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsrlvq(dst, src, shift, vlen_enc);
      break;
    }
    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

// Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst
void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
  assert(opcode == Op_LShiftVB ||
         opcode == Op_RShiftVB ||
         opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
  bool sign = (opcode != Op_URShiftVB);
  assert(vector_len == 0, "required");
  vextendbd(sign, dst, src, 1);
  vpmovzxbd(vtmp, shift, 1);
  varshiftd(opcode, dst, dst, vtmp, 1);
  vpand(dst, dst,
        ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
  vextracti128_high(vtmp, dst);
  vpackusdw(dst, dst, vtmp, 0);
}

// Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst
void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
  assert(opcode == Op_LShiftVB ||
         opcode == Op_RShiftVB ||
         opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
  bool sign = (opcode != Op_URShiftVB);
  int ext_vector_len = vector_len + 1;
  vextendbw(sign, dst, src, ext_vector_len);
  vpmovzxbw(vtmp, shift, ext_vector_len);
  varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
  if (vector_len == 0) {
    vextracti128_high(vtmp, dst);
    vpackuswb(dst, dst, vtmp, vector_len);
  } else {
    vextracti64x4_high(vtmp, dst);
    vpackuswb(dst, dst, vtmp, vector_len);
    vpermq(dst, dst, 0xD8, vector_len);
  }
}

void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
  switch(typ) {
    case T_BYTE:
      pinsrb(dst, val, idx);
      break;
    case T_SHORT:
      pinsrw(dst, val, idx);
      break;
    case T_INT:
      pinsrd(dst, val, idx);
      break;
    case T_LONG:
      pinsrq(dst, val, idx);
      break;
    default:
      assert(false, "Should not reach here.");
      break;
  }
}

void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
  switch(typ) {
    case T_BYTE:
      vpinsrb(dst, src, val, idx);
      break;
    case T_SHORT:
      vpinsrw(dst, src, val, idx);
      break;
    case T_INT:
      vpinsrd(dst, src, val, idx);
      break;
    case T_LONG:
      vpinsrq(dst, src, val, idx);
      break;
    default:
      assert(false, "Should not reach here.");
      break;
  }
}

void C2_MacroAssembler::vgather8b_masked(BasicType elem_bt, XMMRegister dst,
                                         Register base, Register idx_base,
                                         Register mask, Register mask_idx,
                                         Register rtmp, int vlen_enc) {
  vpxor(dst, dst, dst, vlen_enc);
  if (elem_bt == T_SHORT) {
    for (int i = 0; i < 4; i++) {
      // dst[i] = mask[i] ? src[idx_base[i]] : 0
      Label skip_load;
      btq(mask, mask_idx);
      jccb(Assembler::carryClear, skip_load);
      movl(rtmp, Address(idx_base, i * 4));
      pinsrw(dst, Address(base, rtmp, Address::times_2), i);
      bind(skip_load);
      incq(mask_idx);
    }
  } else {
    assert(elem_bt == T_BYTE, "");
    for (int i = 0; i < 8; i++) {
      // dst[i] = mask[i] ? src[idx_base[i]] : 0
      Label skip_load;
      btq(mask, mask_idx);
      jccb(Assembler::carryClear, skip_load);
      movl(rtmp, Address(idx_base, i * 4));
      pinsrb(dst, Address(base, rtmp), i);
      bind(skip_load);
      incq(mask_idx);
    }
  }
}

void C2_MacroAssembler::vgather8b(BasicType elem_bt, XMMRegister dst,
                                  Register base, Register idx_base,
                                  Register rtmp, int vlen_enc) {
  vpxor(dst, dst, dst, vlen_enc);
  if (elem_bt == T_SHORT) {
    for (int i = 0; i < 4; i++) {
      // dst[i] = src[idx_base[i]]
      movl(rtmp, Address(idx_base, i * 4));
      pinsrw(dst, Address(base, rtmp, Address::times_2), i);
    }
  } else {
    assert(elem_bt == T_BYTE, "");
    for (int i = 0; i < 8; i++) {
      // dst[i] = src[idx_base[i]]
      movl(rtmp, Address(idx_base, i * 4));
      pinsrb(dst, Address(base, rtmp), i);
    }
  }
}

/*
 * Gather using a hybrid algorithm: first partially unroll a scalar loop
 * to accumulate values from the gather indices into a quad-word (64-bit) slice.
 * A slice may hold 8 bytes or 4 short values. This is followed by a vector
 * permutation to place the slice into the appropriate vector lane
 * locations in the destination vector. The following pseudo code describes the
 * algorithm in detail:
 *
 * DST_VEC = ZERO_VEC
 * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..}
 * TWO_VEC    = {2, 2, 2, 2, 2, 2, 2, 2, 2..}
 * FOREACH_ITER:
 *   TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES
 *   TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX
 *   DST_VEC = DST_VEC OR TEMP_PERM_VEC
 *   PERM_INDEX = PERM_INDEX - TWO_VEC
 *
 * With each iteration, the doubleword permute indices (0,1) corresponding
 * to the gathered quadword get right-shifted by two lane positions.
 *
 */
void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst,
                                        Register base, Register idx_base,
                                        Register mask, XMMRegister xtmp1,
                                        XMMRegister xtmp2, XMMRegister temp_dst,
                                        Register rtmp, Register mask_idx,
                                        Register length, int vector_len, int vlen_enc) {
  Label GATHER8_LOOP;
  assert(is_subword_type(elem_ty), "");
  movl(length, vector_len);
  vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...}
  vpxor(dst, dst, dst, vlen_enc);       // dst = {0, ...}
  vallones(xtmp2, vlen_enc);
  vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc);
  vpslld(xtmp2, xtmp2, 1, vlen_enc);    // xtmp2 = {2, 2, ...}
  load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...}

  bind(GATHER8_LOOP);
  // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES
  if (mask == noreg) {
    vgather8b(elem_ty, temp_dst, base, idx_base, rtmp, vlen_enc);
  } else {
    vgather8b_masked(elem_ty, temp_dst, base, idx_base, mask, mask_idx, rtmp, vlen_enc);
  }
  // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1)
  vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ?
                                    vlen_enc : Assembler::AVX_256bit);
  // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2)
  vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc);
  // DST_VEC = DST_VEC OR TEMP_PERM_VEC
  vpor(dst, dst, temp_dst, vlen_enc);
  addptr(idx_base, 32 >> (type2aelembytes(elem_ty) - 1));
  subl(length, 8 >> (type2aelembytes(elem_ty) - 1));
  jcc(Assembler::notEqual, GATHER8_LOOP);
}

void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
  switch(typ) {
    case T_INT:
      vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
      break;
    case T_FLOAT:
      vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
      break;
    case T_LONG:
      vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
      break;
    case T_DOUBLE:
      vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
      break;
    default:
      assert(false, "Should not reach here.");
      break;
  }
}

void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
  switch(typ) {
    case T_INT:
      evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
      break;
    case T_FLOAT:
      evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
      break;
    case T_LONG:
      evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
      break;
    case T_DOUBLE:
      evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
      break;
    default:
      assert(false, "Should not reach here.");
      break;
  }
}

void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
  switch(typ) {
    case T_INT:
      evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
      break;
    case T_FLOAT:
      evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
      break;
    case T_LONG:
      evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
      break;
    case T_DOUBLE:
      evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
      break;
    default:
      assert(false, "Should not reach here.");
      break;
  }
}

void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
  if (vlen_in_bytes <= 16) {
    pxor (dst, dst);
    psubb(dst, src);
    switch (elem_bt) {
      case T_BYTE:   /* nothing to do */ break;
      case T_SHORT:  pmovsxbw(dst, dst); break;
      case T_INT:    pmovsxbd(dst, dst); break;
      case T_FLOAT:  pmovsxbd(dst, dst); break;
      case T_LONG:   pmovsxbq(dst, dst); break;
      case T_DOUBLE: pmovsxbq(dst, dst); break;

      default: assert(false, "%s", type2name(elem_bt));
    }
  } else {
    assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
    int vlen_enc = vector_length_encoding(vlen_in_bytes);

    vpxor (dst, dst, dst, vlen_enc);
    vpsubb(dst, dst, src, is_legacy ?
AVX_256bit : vlen_enc); 1585 1586 switch (elem_bt) { 1587 case T_BYTE: /* nothing to do */ break; 1588 case T_SHORT: vpmovsxbw(dst, dst, vlen_enc); break; 1589 case T_INT: vpmovsxbd(dst, dst, vlen_enc); break; 1590 case T_FLOAT: vpmovsxbd(dst, dst, vlen_enc); break; 1591 case T_LONG: vpmovsxbq(dst, dst, vlen_enc); break; 1592 case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break; 1593 1594 default: assert(false, "%s", type2name(elem_bt)); 1595 } 1596 } 1597 } 1598 1599 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) { 1600 if (novlbwdq) { 1601 vpmovsxbd(xtmp, src, vlen_enc); 1602 evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()), 1603 Assembler::eq, true, vlen_enc, noreg); 1604 } else { 1605 vpxor(xtmp, xtmp, xtmp, vlen_enc); 1606 vpsubb(xtmp, xtmp, src, vlen_enc); 1607 evpmovb2m(dst, xtmp, vlen_enc); 1608 } 1609 } 1610 1611 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, Address src, int vlen_in_bytes) { 1612 if (is_integral_type(bt)) { 1613 switch (vlen_in_bytes) { 1614 case 4: movdl(dst, src); break; 1615 case 8: movq(dst, src); break; 1616 case 16: movdqu(dst, src); break; 1617 case 32: vmovdqu(dst, src); break; 1618 case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break; 1619 default: ShouldNotReachHere(); 1620 } 1621 } else { 1622 switch (vlen_in_bytes) { 1623 case 4: movflt(dst, src); break; 1624 case 8: movdbl(dst, src); break; 1625 case 16: movups(dst, src); break; 1626 case 32: vmovups(dst, src, Assembler::AVX_256bit); break; 1627 case 64: vmovups(dst, src, Assembler::AVX_512bit); break; 1628 default: ShouldNotReachHere(); 1629 } 1630 } 1631 } 1632 1633 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) { 1634 assert(rscratch != noreg || always_reachable(src), "missing"); 1635 1636 if (reachable(src)) { 1637 load_vector(bt, dst, as_Address(src), vlen_in_bytes); 1638 } else { 1639 lea(rscratch, src); 1640 load_vector(bt, dst, Address(rscratch, 0), vlen_in_bytes); 1641 } 1642 } 1643 1644 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) { 1645 int vlen_enc = vector_length_encoding(vlen); 1646 if (VM_Version::supports_avx()) { 1647 if (bt == T_LONG) { 1648 if (VM_Version::supports_avx2()) { 1649 vpbroadcastq(dst, src, vlen_enc); 1650 } else { 1651 vmovddup(dst, src, vlen_enc); 1652 } 1653 } else if (bt == T_DOUBLE) { 1654 if (vlen_enc != Assembler::AVX_128bit) { 1655 vbroadcastsd(dst, src, vlen_enc, noreg); 1656 } else { 1657 vmovddup(dst, src, vlen_enc); 1658 } 1659 } else { 1660 if (VM_Version::supports_avx2() && is_integral_type(bt)) { 1661 vpbroadcastd(dst, src, vlen_enc); 1662 } else { 1663 vbroadcastss(dst, src, vlen_enc); 1664 } 1665 } 1666 } else if (VM_Version::supports_sse3()) { 1667 movddup(dst, src); 1668 } else { 1669 load_vector(bt, dst, src, vlen); 1670 } 1671 } 1672 1673 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) { 1674 // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64. 1675 int offset = exact_log2(type2aelembytes(bt)) << 6; 1676 if (is_floating_point_type(bt)) { 1677 offset += 128; 1678 } 1679 ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset); 1680 load_vector(T_BYTE, dst, addr, vlen_in_bytes); 1681 } 1682 1683 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles. 
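//
// The integral reduce* helpers below all follow the same halving pattern: the
// upper half of the vector is folded into the lower half with the element-wise
// operation until a single lane remains, and the scalar input (src1) is folded
// in last. Roughly, as an illustrative scalar sketch (not generated code;
// apply() stands for the add/mul/min/max/and/or/xor selected by opcode):
//
//   int reduce(int opcode, int acc, int* lane, int n) {
//     for (int width = n; width > 1; width /= 2) {
//       for (int i = 0; i < width / 2; i++) {
//         lane[i] = apply(opcode, lane[i], lane[i + width / 2]);
//       }
//     }
//     return apply(opcode, acc, lane[0]); // fold in the scalar input last
//   }
//
// The AddReductionVI variants use horizontal adds (phaddd/phaddw) instead of
// shuffles, and the ordered floating-point reductions accumulate into dst one
// lane at a time to preserve the required evaluation order.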
1684 1685 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 1686 int vector_len = Assembler::AVX_128bit; 1687 1688 switch (opcode) { 1689 case Op_AndReductionV: pand(dst, src); break; 1690 case Op_OrReductionV: por (dst, src); break; 1691 case Op_XorReductionV: pxor(dst, src); break; 1692 case Op_MinReductionV: 1693 switch (typ) { 1694 case T_BYTE: pminsb(dst, src); break; 1695 case T_SHORT: pminsw(dst, src); break; 1696 case T_INT: pminsd(dst, src); break; 1697 case T_LONG: assert(UseAVX > 2, "required"); 1698 vpminsq(dst, dst, src, Assembler::AVX_128bit); break; 1699 default: assert(false, "wrong type"); 1700 } 1701 break; 1702 case Op_MaxReductionV: 1703 switch (typ) { 1704 case T_BYTE: pmaxsb(dst, src); break; 1705 case T_SHORT: pmaxsw(dst, src); break; 1706 case T_INT: pmaxsd(dst, src); break; 1707 case T_LONG: assert(UseAVX > 2, "required"); 1708 vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break; 1709 default: assert(false, "wrong type"); 1710 } 1711 break; 1712 case Op_AddReductionVF: addss(dst, src); break; 1713 case Op_AddReductionVD: addsd(dst, src); break; 1714 case Op_AddReductionVI: 1715 switch (typ) { 1716 case T_BYTE: paddb(dst, src); break; 1717 case T_SHORT: paddw(dst, src); break; 1718 case T_INT: paddd(dst, src); break; 1719 default: assert(false, "wrong type"); 1720 } 1721 break; 1722 case Op_AddReductionVL: paddq(dst, src); break; 1723 case Op_MulReductionVF: mulss(dst, src); break; 1724 case Op_MulReductionVD: mulsd(dst, src); break; 1725 case Op_MulReductionVI: 1726 switch (typ) { 1727 case T_SHORT: pmullw(dst, src); break; 1728 case T_INT: pmulld(dst, src); break; 1729 default: assert(false, "wrong type"); 1730 } 1731 break; 1732 case Op_MulReductionVL: assert(UseAVX > 2, "required"); 1733 evpmullq(dst, dst, src, vector_len); break; 1734 default: assert(false, "wrong opcode"); 1735 } 1736 } 1737 1738 void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 1739 switch (opcode) { 1740 case Op_AddReductionVF: addps(dst, src); break; 1741 case Op_AddReductionVD: addpd(dst, src); break; 1742 case Op_MulReductionVF: mulps(dst, src); break; 1743 case Op_MulReductionVD: mulpd(dst, src); break; 1744 default: assert(false, "%s", NodeClassNames[opcode]); 1745 } 1746 } 1747 1748 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 1749 int vector_len = Assembler::AVX_256bit; 1750 1751 switch (opcode) { 1752 case Op_AndReductionV: vpand(dst, src1, src2, vector_len); break; 1753 case Op_OrReductionV: vpor (dst, src1, src2, vector_len); break; 1754 case Op_XorReductionV: vpxor(dst, src1, src2, vector_len); break; 1755 case Op_MinReductionV: 1756 switch (typ) { 1757 case T_BYTE: vpminsb(dst, src1, src2, vector_len); break; 1758 case T_SHORT: vpminsw(dst, src1, src2, vector_len); break; 1759 case T_INT: vpminsd(dst, src1, src2, vector_len); break; 1760 case T_LONG: assert(UseAVX > 2, "required"); 1761 vpminsq(dst, src1, src2, vector_len); break; 1762 default: assert(false, "wrong type"); 1763 } 1764 break; 1765 case Op_MaxReductionV: 1766 switch (typ) { 1767 case T_BYTE: vpmaxsb(dst, src1, src2, vector_len); break; 1768 case T_SHORT: vpmaxsw(dst, src1, src2, vector_len); break; 1769 case T_INT: vpmaxsd(dst, src1, src2, vector_len); break; 1770 case T_LONG: assert(UseAVX > 2, "required"); 1771 vpmaxsq(dst, src1, src2, vector_len); break; 1772 default: assert(false, "wrong type"); 1773 } 
1774 break; 1775 case Op_AddReductionVI: 1776 switch (typ) { 1777 case T_BYTE: vpaddb(dst, src1, src2, vector_len); break; 1778 case T_SHORT: vpaddw(dst, src1, src2, vector_len); break; 1779 case T_INT: vpaddd(dst, src1, src2, vector_len); break; 1780 default: assert(false, "wrong type"); 1781 } 1782 break; 1783 case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break; 1784 case Op_MulReductionVI: 1785 switch (typ) { 1786 case T_SHORT: vpmullw(dst, src1, src2, vector_len); break; 1787 case T_INT: vpmulld(dst, src1, src2, vector_len); break; 1788 default: assert(false, "wrong type"); 1789 } 1790 break; 1791 case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break; 1792 default: assert(false, "wrong opcode"); 1793 } 1794 } 1795 1796 void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 1797 int vector_len = Assembler::AVX_256bit; 1798 1799 switch (opcode) { 1800 case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break; 1801 case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break; 1802 case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break; 1803 case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break; 1804 default: assert(false, "%s", NodeClassNames[opcode]); 1805 } 1806 } 1807 1808 void C2_MacroAssembler::reduce_fp(int opcode, int vlen, 1809 XMMRegister dst, XMMRegister src, 1810 XMMRegister vtmp1, XMMRegister vtmp2) { 1811 switch (opcode) { 1812 case Op_AddReductionVF: 1813 case Op_MulReductionVF: 1814 reduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 1815 break; 1816 1817 case Op_AddReductionVD: 1818 case Op_MulReductionVD: 1819 reduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 1820 break; 1821 1822 default: assert(false, "wrong opcode"); 1823 } 1824 } 1825 1826 void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen, 1827 XMMRegister dst, XMMRegister src, 1828 XMMRegister vtmp1, XMMRegister vtmp2) { 1829 switch (opcode) { 1830 case Op_AddReductionVF: 1831 case Op_MulReductionVF: 1832 unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 1833 break; 1834 1835 case Op_AddReductionVD: 1836 case Op_MulReductionVD: 1837 unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 1838 break; 1839 1840 default: assert(false, "%s", NodeClassNames[opcode]); 1841 } 1842 } 1843 1844 void C2_MacroAssembler::reduceB(int opcode, int vlen, 1845 Register dst, Register src1, XMMRegister src2, 1846 XMMRegister vtmp1, XMMRegister vtmp2) { 1847 switch (vlen) { 1848 case 8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1849 case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1850 case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1851 case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1852 1853 default: assert(false, "wrong vector length"); 1854 } 1855 } 1856 1857 void C2_MacroAssembler::mulreduceB(int opcode, int vlen, 1858 Register dst, Register src1, XMMRegister src2, 1859 XMMRegister vtmp1, XMMRegister vtmp2) { 1860 switch (vlen) { 1861 case 8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1862 case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1863 case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1864 case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1865 1866 default: assert(false, "wrong vector length"); 1867 } 1868 } 1869 1870 void C2_MacroAssembler::reduceS(int opcode, int vlen, 1871 Register dst, Register src1, XMMRegister src2, 
1872 XMMRegister vtmp1, XMMRegister vtmp2) { 1873 switch (vlen) { 1874 case 4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1875 case 8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1876 case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1877 case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1878 1879 default: assert(false, "wrong vector length"); 1880 } 1881 } 1882 1883 void C2_MacroAssembler::reduceI(int opcode, int vlen, 1884 Register dst, Register src1, XMMRegister src2, 1885 XMMRegister vtmp1, XMMRegister vtmp2) { 1886 switch (vlen) { 1887 case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1888 case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1889 case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1890 case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1891 1892 default: assert(false, "wrong vector length"); 1893 } 1894 } 1895 1896 void C2_MacroAssembler::reduceL(int opcode, int vlen, 1897 Register dst, Register src1, XMMRegister src2, 1898 XMMRegister vtmp1, XMMRegister vtmp2) { 1899 switch (vlen) { 1900 case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1901 case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1902 case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1903 1904 default: assert(false, "wrong vector length"); 1905 } 1906 } 1907 1908 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 1909 switch (vlen) { 1910 case 2: 1911 assert(vtmp2 == xnoreg, ""); 1912 reduce2F(opcode, dst, src, vtmp1); 1913 break; 1914 case 4: 1915 assert(vtmp2 == xnoreg, ""); 1916 reduce4F(opcode, dst, src, vtmp1); 1917 break; 1918 case 8: 1919 reduce8F(opcode, dst, src, vtmp1, vtmp2); 1920 break; 1921 case 16: 1922 reduce16F(opcode, dst, src, vtmp1, vtmp2); 1923 break; 1924 default: assert(false, "wrong vector length"); 1925 } 1926 } 1927 1928 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 1929 switch (vlen) { 1930 case 2: 1931 assert(vtmp2 == xnoreg, ""); 1932 reduce2D(opcode, dst, src, vtmp1); 1933 break; 1934 case 4: 1935 reduce4D(opcode, dst, src, vtmp1, vtmp2); 1936 break; 1937 case 8: 1938 reduce8D(opcode, dst, src, vtmp1, vtmp2); 1939 break; 1940 default: assert(false, "wrong vector length"); 1941 } 1942 } 1943 1944 void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 1945 switch (vlen) { 1946 case 2: 1947 assert(vtmp1 == xnoreg, ""); 1948 assert(vtmp2 == xnoreg, ""); 1949 unorderedReduce2F(opcode, dst, src); 1950 break; 1951 case 4: 1952 assert(vtmp2 == xnoreg, ""); 1953 unorderedReduce4F(opcode, dst, src, vtmp1); 1954 break; 1955 case 8: 1956 unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2); 1957 break; 1958 case 16: 1959 unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2); 1960 break; 1961 default: assert(false, "wrong vector length"); 1962 } 1963 } 1964 1965 void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 1966 switch (vlen) { 1967 case 2: 1968 assert(vtmp1 == xnoreg, ""); 1969 assert(vtmp2 == xnoreg, ""); 1970 unorderedReduce2D(opcode, dst, src); 1971 break; 1972 case 4: 1973 assert(vtmp2 == xnoreg, ""); 1974 unorderedReduce4D(opcode, dst, src, vtmp1); 1975 break; 1976 case 8: 1977 unorderedReduce8D(opcode, dst, 
src, vtmp1, vtmp2); 1978 break; 1979 default: assert(false, "wrong vector length"); 1980 } 1981 } 1982 1983 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1984 if (opcode == Op_AddReductionVI) { 1985 if (vtmp1 != src2) { 1986 movdqu(vtmp1, src2); 1987 } 1988 phaddd(vtmp1, vtmp1); 1989 } else { 1990 pshufd(vtmp1, src2, 0x1); 1991 reduce_operation_128(T_INT, opcode, vtmp1, src2); 1992 } 1993 movdl(vtmp2, src1); 1994 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 1995 movdl(dst, vtmp1); 1996 } 1997 1998 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1999 if (opcode == Op_AddReductionVI) { 2000 if (vtmp1 != src2) { 2001 movdqu(vtmp1, src2); 2002 } 2003 phaddd(vtmp1, src2); 2004 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2005 } else { 2006 pshufd(vtmp2, src2, 0xE); 2007 reduce_operation_128(T_INT, opcode, vtmp2, src2); 2008 reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2009 } 2010 } 2011 2012 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2013 if (opcode == Op_AddReductionVI) { 2014 vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit); 2015 vextracti128_high(vtmp2, vtmp1); 2016 vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit); 2017 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2018 } else { 2019 vextracti128_high(vtmp1, src2); 2020 reduce_operation_128(T_INT, opcode, vtmp1, src2); 2021 reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2022 } 2023 } 2024 2025 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2026 vextracti64x4_high(vtmp2, src2); 2027 reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2); 2028 reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2029 } 2030 2031 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2032 pshufd(vtmp2, src2, 0x1); 2033 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2034 movdqu(vtmp1, vtmp2); 2035 psrldq(vtmp1, 2); 2036 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2037 movdqu(vtmp2, vtmp1); 2038 psrldq(vtmp2, 1); 2039 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2040 movdl(vtmp2, src1); 2041 pmovsxbd(vtmp1, vtmp1); 2042 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2043 pextrb(dst, vtmp1, 0x0); 2044 movsbl(dst, dst); 2045 } 2046 2047 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2048 pshufd(vtmp1, src2, 0xE); 2049 reduce_operation_128(T_BYTE, opcode, vtmp1, src2); 2050 reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2051 } 2052 2053 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2054 vextracti128_high(vtmp2, src2); 2055 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2056 reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2057 } 2058 2059 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2060 vextracti64x4_high(vtmp1, src2); 2061 reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2); 2062 reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2063 } 2064 2065 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister 
src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2066 pmovsxbw(vtmp2, src2); 2067 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2068 } 2069 2070 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2071 if (UseAVX > 1) { 2072 int vector_len = Assembler::AVX_256bit; 2073 vpmovsxbw(vtmp1, src2, vector_len); 2074 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2075 } else { 2076 pmovsxbw(vtmp2, src2); 2077 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2078 pshufd(vtmp2, src2, 0x1); 2079 pmovsxbw(vtmp2, src2); 2080 reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2081 } 2082 } 2083 2084 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2085 if (UseAVX > 2 && VM_Version::supports_avx512bw()) { 2086 int vector_len = Assembler::AVX_512bit; 2087 vpmovsxbw(vtmp1, src2, vector_len); 2088 reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2089 } else { 2090 assert(UseAVX >= 2,"Should not reach here."); 2091 mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); 2092 vextracti128_high(vtmp2, src2); 2093 mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2094 } 2095 } 2096 2097 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2098 mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); 2099 vextracti64x4_high(vtmp2, src2); 2100 mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2101 } 2102 2103 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2104 if (opcode == Op_AddReductionVI) { 2105 if (vtmp1 != src2) { 2106 movdqu(vtmp1, src2); 2107 } 2108 phaddw(vtmp1, vtmp1); 2109 phaddw(vtmp1, vtmp1); 2110 } else { 2111 pshufd(vtmp2, src2, 0x1); 2112 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2113 movdqu(vtmp1, vtmp2); 2114 psrldq(vtmp1, 2); 2115 reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2); 2116 } 2117 movdl(vtmp2, src1); 2118 pmovsxwd(vtmp1, vtmp1); 2119 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2120 pextrw(dst, vtmp1, 0x0); 2121 movswl(dst, dst); 2122 } 2123 2124 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2125 if (opcode == Op_AddReductionVI) { 2126 if (vtmp1 != src2) { 2127 movdqu(vtmp1, src2); 2128 } 2129 phaddw(vtmp1, src2); 2130 } else { 2131 pshufd(vtmp1, src2, 0xE); 2132 reduce_operation_128(T_SHORT, opcode, vtmp1, src2); 2133 } 2134 reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2135 } 2136 2137 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2138 if (opcode == Op_AddReductionVI) { 2139 int vector_len = Assembler::AVX_256bit; 2140 vphaddw(vtmp2, src2, src2, vector_len); 2141 vpermq(vtmp2, vtmp2, 0xD8, vector_len); 2142 } else { 2143 vextracti128_high(vtmp2, src2); 2144 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2145 } 2146 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2147 } 2148 2149 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2150 int vector_len = Assembler::AVX_256bit; 2151 vextracti64x4_high(vtmp1, src2); 2152 reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2); 2153 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2154 } 2155 2156 void 
C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2157 pshufd(vtmp2, src2, 0xE); 2158 reduce_operation_128(T_LONG, opcode, vtmp2, src2); 2159 movdq(vtmp1, src1); 2160 reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2); 2161 movdq(dst, vtmp1); 2162 } 2163 2164 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2165 vextracti128_high(vtmp1, src2); 2166 reduce_operation_128(T_LONG, opcode, vtmp1, src2); 2167 reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2168 } 2169 2170 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2171 vextracti64x4_high(vtmp2, src2); 2172 reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2); 2173 reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2174 } 2175 2176 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) { 2177 mov64(temp, -1L); 2178 bzhiq(temp, temp, len); 2179 kmovql(dst, temp); 2180 } 2181 2182 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2183 reduce_operation_128(T_FLOAT, opcode, dst, src); 2184 pshufd(vtmp, src, 0x1); 2185 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2186 } 2187 2188 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2189 reduce2F(opcode, dst, src, vtmp); 2190 pshufd(vtmp, src, 0x2); 2191 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2192 pshufd(vtmp, src, 0x3); 2193 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2194 } 2195 2196 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2197 reduce4F(opcode, dst, src, vtmp2); 2198 vextractf128_high(vtmp2, src); 2199 reduce4F(opcode, dst, vtmp2, vtmp1); 2200 } 2201 2202 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2203 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2204 vextracti64x4_high(vtmp1, src); 2205 reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2); 2206 } 2207 2208 void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) { 2209 pshufd(dst, src, 0x1); 2210 reduce_operation_128(T_FLOAT, opcode, dst, src); 2211 } 2212 2213 void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2214 pshufd(vtmp, src, 0xE); 2215 unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src); 2216 unorderedReduce2F(opcode, dst, vtmp); 2217 } 2218 2219 void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2220 vextractf128_high(vtmp1, src); 2221 unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src); 2222 unorderedReduce4F(opcode, dst, vtmp1, vtmp2); 2223 } 2224 2225 void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2226 vextractf64x4_high(vtmp2, src); 2227 unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src); 2228 unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2); 2229 } 2230 2231 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2232 reduce_operation_128(T_DOUBLE, opcode, dst, src); 2233 pshufd(vtmp, src, 0xE); 2234 reduce_operation_128(T_DOUBLE, opcode, dst, vtmp); 2235 } 2236 2237 void 
C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2238 reduce2D(opcode, dst, src, vtmp2); 2239 vextractf128_high(vtmp2, src); 2240 reduce2D(opcode, dst, vtmp2, vtmp1); 2241 } 2242 2243 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2244 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2245 vextracti64x4_high(vtmp1, src); 2246 reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2); 2247 } 2248 2249 void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) { 2250 pshufd(dst, src, 0xE); 2251 reduce_operation_128(T_DOUBLE, opcode, dst, src); 2252 } 2253 2254 void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2255 vextractf128_high(vtmp, src); 2256 unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src); 2257 unorderedReduce2D(opcode, dst, vtmp); 2258 } 2259 2260 void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2261 vextractf64x4_high(vtmp2, src); 2262 unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src); 2263 unorderedReduce4D(opcode, dst, vtmp2, vtmp1); 2264 } 2265 2266 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) { 2267 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2268 } 2269 2270 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) { 2271 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2272 } 2273 2274 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len) { 2275 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2276 } 2277 2278 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask, 2279 int vec_enc) { 2280 switch(elem_bt) { 2281 case T_INT: 2282 case T_FLOAT: 2283 vmaskmovps(dst, src, mask, vec_enc); 2284 break; 2285 case T_LONG: 2286 case T_DOUBLE: 2287 vmaskmovpd(dst, src, mask, vec_enc); 2288 break; 2289 default: 2290 fatal("Unsupported type %s", type2name(elem_bt)); 2291 break; 2292 } 2293 } 2294 2295 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask, 2296 int vec_enc) { 2297 switch(elem_bt) { 2298 case T_INT: 2299 case T_FLOAT: 2300 vmaskmovps(dst, src, mask, vec_enc); 2301 break; 2302 case T_LONG: 2303 case T_DOUBLE: 2304 vmaskmovpd(dst, src, mask, vec_enc); 2305 break; 2306 default: 2307 fatal("Unsupported type %s", type2name(elem_bt)); 2308 break; 2309 } 2310 } 2311 2312 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid, 2313 XMMRegister dst, XMMRegister src, 2314 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2315 XMMRegister xmm_0, XMMRegister xmm_1) { 2316 const int permconst[] = {1, 14}; 2317 XMMRegister wsrc = src; 2318 XMMRegister wdst = xmm_0; 2319 XMMRegister wtmp = (xmm_1 == xnoreg) ? 
xmm_0: xmm_1; 2320 2321 int vlen_enc = Assembler::AVX_128bit; 2322 if (vlen == 16) { 2323 vlen_enc = Assembler::AVX_256bit; 2324 } 2325 2326 for (int i = log2(vlen) - 1; i >=0; i--) { 2327 if (i == 0 && !is_dst_valid) { 2328 wdst = dst; 2329 } 2330 if (i == 3) { 2331 vextracti64x4_high(wtmp, wsrc); 2332 } else if (i == 2) { 2333 vextracti128_high(wtmp, wsrc); 2334 } else { // i = [0,1] 2335 vpermilps(wtmp, wsrc, permconst[i], vlen_enc); 2336 } 2337 2338 if (VM_Version::supports_avx10_2()) { 2339 vminmax_fp(opcode, T_FLOAT, wdst, k0, wtmp, wsrc, vlen_enc); 2340 } else { 2341 vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2342 } 2343 wsrc = wdst; 2344 vlen_enc = Assembler::AVX_128bit; 2345 } 2346 if (is_dst_valid) { 2347 if (VM_Version::supports_avx10_2()) { 2348 vminmax_fp(opcode, T_FLOAT, dst, k0, wdst, dst, Assembler::AVX_128bit); 2349 } else { 2350 vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2351 } 2352 } 2353 } 2354 2355 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src, 2356 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2357 XMMRegister xmm_0, XMMRegister xmm_1) { 2358 XMMRegister wsrc = src; 2359 XMMRegister wdst = xmm_0; 2360 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1; 2361 int vlen_enc = Assembler::AVX_128bit; 2362 if (vlen == 8) { 2363 vlen_enc = Assembler::AVX_256bit; 2364 } 2365 for (int i = log2(vlen) - 1; i >=0; i--) { 2366 if (i == 0 && !is_dst_valid) { 2367 wdst = dst; 2368 } 2369 if (i == 1) { 2370 vextracti128_high(wtmp, wsrc); 2371 } else if (i == 2) { 2372 vextracti64x4_high(wtmp, wsrc); 2373 } else { 2374 assert(i == 0, "%d", i); 2375 vpermilpd(wtmp, wsrc, 1, vlen_enc); 2376 } 2377 2378 if (VM_Version::supports_avx10_2()) { 2379 vminmax_fp(opcode, T_DOUBLE, wdst, k0, wtmp, wsrc, vlen_enc); 2380 } else { 2381 vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2382 } 2383 2384 wsrc = wdst; 2385 vlen_enc = Assembler::AVX_128bit; 2386 } 2387 2388 if (is_dst_valid) { 2389 if (VM_Version::supports_avx10_2()) { 2390 vminmax_fp(opcode, T_DOUBLE, dst, k0, wdst, dst, Assembler::AVX_128bit); 2391 } else { 2392 vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2393 } 2394 } 2395 } 2396 2397 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) { 2398 switch (bt) { 2399 case T_BYTE: pextrb(dst, src, idx); break; 2400 case T_SHORT: pextrw(dst, src, idx); break; 2401 case T_INT: pextrd(dst, src, idx); break; 2402 case T_LONG: pextrq(dst, src, idx); break; 2403 2404 default: 2405 assert(false,"Should not reach here."); 2406 break; 2407 } 2408 } 2409 2410 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) { 2411 int esize = type2aelembytes(typ); 2412 int elem_per_lane = 16/esize; 2413 int lane = elemindex / elem_per_lane; 2414 int eindex = elemindex % elem_per_lane; 2415 2416 if (lane >= 2) { 2417 assert(UseAVX > 2, "required"); 2418 vextractf32x4(dst, src, lane & 3); 2419 return dst; 2420 } else if (lane > 0) { 2421 assert(UseAVX > 0, "required"); 2422 vextractf128(dst, src, lane); 2423 return dst; 2424 } else { 2425 return src; 2426 } 2427 } 2428 2429 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) { 2430 if (typ == T_BYTE) { 2431 movsbl(dst, dst); 2432 } else if (typ == T_SHORT) { 2433 movswl(dst, dst); 2434 } 2435 } 2436 2437 void C2_MacroAssembler::get_elem(BasicType typ, Register 
dst, XMMRegister src, int elemindex) { 2438 int esize = type2aelembytes(typ); 2439 int elem_per_lane = 16/esize; 2440 int eindex = elemindex % elem_per_lane; 2441 assert(is_integral_type(typ),"required"); 2442 2443 if (eindex == 0) { 2444 if (typ == T_LONG) { 2445 movq(dst, src); 2446 } else { 2447 movdl(dst, src); 2448 movsxl(typ, dst); 2449 } 2450 } else { 2451 extract(typ, dst, src, eindex); 2452 movsxl(typ, dst); 2453 } 2454 } 2455 2456 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) { 2457 int esize = type2aelembytes(typ); 2458 int elem_per_lane = 16/esize; 2459 int eindex = elemindex % elem_per_lane; 2460 assert((typ == T_FLOAT || typ == T_DOUBLE),"required"); 2461 2462 if (eindex == 0) { 2463 movq(dst, src); 2464 } else { 2465 if (typ == T_FLOAT) { 2466 if (UseAVX == 0) { 2467 movdqu(dst, src); 2468 shufps(dst, dst, eindex); 2469 } else { 2470 vshufps(dst, src, src, eindex, Assembler::AVX_128bit); 2471 } 2472 } else { 2473 if (UseAVX == 0) { 2474 movdqu(dst, src); 2475 psrldq(dst, eindex*esize); 2476 } else { 2477 vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit); 2478 } 2479 movq(dst, dst); 2480 } 2481 } 2482 // Zero upper bits 2483 if (typ == T_FLOAT) { 2484 if (UseAVX == 0) { 2485 assert(vtmp != xnoreg, "required."); 2486 movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg); 2487 pand(dst, vtmp); 2488 } else { 2489 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg); 2490 } 2491 } 2492 } 2493 2494 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) { 2495 switch(typ) { 2496 case T_BYTE: 2497 case T_BOOLEAN: 2498 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2499 break; 2500 case T_SHORT: 2501 case T_CHAR: 2502 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2503 break; 2504 case T_INT: 2505 case T_FLOAT: 2506 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2507 break; 2508 case T_LONG: 2509 case T_DOUBLE: 2510 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2511 break; 2512 default: 2513 assert(false,"Should not reach here."); 2514 break; 2515 } 2516 } 2517 2518 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) { 2519 assert(rscratch != noreg || always_reachable(src2), "missing"); 2520 2521 switch(typ) { 2522 case T_BOOLEAN: 2523 case T_BYTE: 2524 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2525 break; 2526 case T_CHAR: 2527 case T_SHORT: 2528 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2529 break; 2530 case T_INT: 2531 case T_FLOAT: 2532 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2533 break; 2534 case T_LONG: 2535 case T_DOUBLE: 2536 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2537 break; 2538 default: 2539 assert(false,"Should not reach here."); 2540 break; 2541 } 2542 } 2543 2544 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) { 2545 switch(typ) { 2546 case T_BYTE: 2547 evpblendmb(dst, kmask, src1, src2, merge, vector_len); 
2548 break; 2549 case T_SHORT: 2550 evpblendmw(dst, kmask, src1, src2, merge, vector_len); 2551 break; 2552 case T_INT: 2553 case T_FLOAT: 2554 evpblendmd(dst, kmask, src1, src2, merge, vector_len); 2555 break; 2556 case T_LONG: 2557 case T_DOUBLE: 2558 evpblendmq(dst, kmask, src1, src2, merge, vector_len); 2559 break; 2560 default: 2561 assert(false,"Should not reach here."); 2562 break; 2563 } 2564 } 2565 2566 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) { 2567 assert(vlen_in_bytes <= 32, ""); 2568 int esize = type2aelembytes(bt); 2569 if (vlen_in_bytes == 32) { 2570 assert(vtmp == xnoreg, "required."); 2571 if (esize >= 4) { 2572 vtestps(src1, src2, AVX_256bit); 2573 } else { 2574 vptest(src1, src2, AVX_256bit); 2575 } 2576 return; 2577 } 2578 if (vlen_in_bytes < 16) { 2579 // Duplicate the lower part to fill the whole register, 2580 // Don't need to do so for src2 2581 assert(vtmp != xnoreg, "required"); 2582 int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04; 2583 pshufd(vtmp, src1, shuffle_imm); 2584 } else { 2585 assert(vtmp == xnoreg, "required"); 2586 vtmp = src1; 2587 } 2588 if (esize >= 4 && VM_Version::supports_avx()) { 2589 vtestps(vtmp, src2, AVX_128bit); 2590 } else { 2591 ptest(vtmp, src2); 2592 } 2593 } 2594 2595 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { 2596 #ifdef ASSERT 2597 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2598 bool is_bw_supported = VM_Version::supports_avx512bw(); 2599 if (is_bw && !is_bw_supported) { 2600 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2601 assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16), 2602 "XMM register should be 0-15"); 2603 } 2604 #endif // ASSERT 2605 switch (elem_bt) { 2606 case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return; 2607 case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return; 2608 case T_INT: vpaddd(dst, src1, src2, vlen_enc); return; 2609 case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return; 2610 case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return; 2611 case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return; 2612 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2613 } 2614 } 2615 2616 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) { 2617 assert(UseAVX >= 2, "required"); 2618 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2619 bool is_vl = vlen_enc != Assembler::AVX_512bit; 2620 if ((UseAVX > 2) && 2621 (!is_bw || VM_Version::supports_avx512bw()) && 2622 (!is_vl || VM_Version::supports_avx512vl())) { 2623 switch (elem_bt) { 2624 case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return; 2625 case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return; 2626 case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return; 2627 case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return; 2628 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2629 } 2630 } else { 2631 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2632 assert((dst->encoding() < 16),"XMM register should be 0-15"); 2633 switch (elem_bt) { 2634 case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return; 2635 case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return; 2636 case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return; 2637 case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, 
vlen_enc); return;
    case T_LONG:   movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
    case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
    default: fatal("Unsupported type %s", type2name(elem_bt)); return;
    }
  }
}

void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
  switch (to_elem_bt) {
    case T_SHORT:
      vpmovsxbw(dst, src, vlen_enc);
      break;
    case T_INT:
      vpmovsxbd(dst, src, vlen_enc);
      break;
    case T_FLOAT:
      vpmovsxbd(dst, src, vlen_enc);
      vcvtdq2ps(dst, dst, vlen_enc);
      break;
    case T_LONG:
      vpmovsxbq(dst, src, vlen_enc);
      break;
    case T_DOUBLE: {
      int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
      vpmovsxbd(dst, src, mid_vlen_enc);
      vcvtdq2pd(dst, dst, vlen_enc);
      break;
    }
    default:
      fatal("Unsupported type %s", type2name(to_elem_bt));
      break;
  }
}

//-------------------------------------------------------------------------------------------

// IndexOf for constant substrings with size >= 8 chars
// which don't need to be loaded through the stack.
void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
                                         Register cnt1, Register cnt2,
                                         int int_cnt2, Register result,
                                         XMMRegister vec, Register tmp,
                                         int ae) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");

  // This method uses the pcmpestri instruction with bound registers
  //   inputs:
  //     xmm - substring
  //     rax - substring length (elements count)
  //     mem - scanned string
  //     rdx - string length (elements count)
  //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
  //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
  //   outputs:
  //     rcx - matched index in string
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
  int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
  Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
  Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;

  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
        RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
        MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;
  assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");

  // Load substring.
  if (ae == StrIntrinsicNode::UL) {
    pmovzxbw(vec, Address(str2, 0));
  } else {
    movdqu(vec, Address(str2, 0));
  }
  movl(cnt2, int_cnt2);
  movptr(result, str1); // string addr

  if (int_cnt2 > stride) {
    jmpb(SCAN_TO_SUBSTR);

    // Reload substr for rescan; this code is executed only
    // for large substrings (> 8 chars).
    bind(RELOAD_SUBSTR);
    if (ae == StrIntrinsicNode::UL) {
      pmovzxbw(vec, Address(str2, 0));
    } else {
      movdqu(vec, Address(str2, 0));
    }
    negptr(cnt2); // Jumped here with negative cnt2, convert to positive

    bind(RELOAD_STR);
    // We came here after the beginning of the substring was
    // matched but the rest of it was not, so we need to search
    // again. Start from the next element after the previous match.

    // cnt2 is the number of remaining substring elements and
    // cnt1 is the number of remaining string elements when the compare failed.
    // Restored cnt1 = cnt1 - cnt2 + int_cnt2
    subl(cnt1, cnt2);
    addl(cnt1, int_cnt2);
    movl(cnt2, int_cnt2); // Now restore cnt2

    decrementl(cnt1);     // Shift to next element
    cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND);  // Fewer chars left than in the substring

    addptr(result, (1<<scale1));

  } // (int_cnt2 > 8)

  // Scan string for start of substr in 16-byte vectors
  bind(SCAN_TO_SUBSTR);
  pcmpestri(vec, Address(result, 0), mode);
  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
  subl(cnt1, stride);
  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
  cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Fewer chars left than in the substring
  addptr(result, 16);
  jmpb(SCAN_TO_SUBSTR);

  // Found a potential substr
  bind(FOUND_CANDIDATE);
  // Matched whole vector if first element matched (tmp(rcx) == 0).
  if (int_cnt2 == stride) {
    jccb(Assembler::overflow, RET_FOUND);    // OF == 1
  } else { // int_cnt2 > 8
    jccb(Assembler::overflow, FOUND_SUBSTR);
  }
  // After pcmpestri tmp(rcx) contains matched element index
  // Compute start addr of substr
  lea(result, Address(result, tmp, scale1));

  // Make sure string is still long enough
  subl(cnt1, tmp);
  cmpl(cnt1, cnt2);
  if (int_cnt2 == stride) {
    jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
  } else { // int_cnt2 > 8
    jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
  }
  // Fewer chars left than in the substring.

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmp(EXIT);

  if (int_cnt2 > stride) {
    // This code is optimized for the case when the whole substring
    // is matched if its head is matched.
    bind(MATCH_SUBSTR_HEAD);
    pcmpestri(vec, Address(result, 0), mode);
    // Reload only the string if it does not match
    jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0

    Label CONT_SCAN_SUBSTR;
    // Compare the rest of the substring (> 8 chars).
    bind(FOUND_SUBSTR);
    // First 8 chars are already matched; below, cnt2 becomes a negative count
    // of the elements that remain to be compared.
2801 negptr(cnt2); 2802 addptr(cnt2, stride); 2803 2804 bind(SCAN_SUBSTR); 2805 subl(cnt1, stride); 2806 cmpl(cnt2, -stride); // Do not read beyond substring 2807 jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR); 2808 // Back-up strings to avoid reading beyond substring: 2809 // cnt1 = cnt1 - cnt2 + 8 2810 addl(cnt1, cnt2); // cnt2 is negative 2811 addl(cnt1, stride); 2812 movl(cnt2, stride); negptr(cnt2); 2813 bind(CONT_SCAN_SUBSTR); 2814 if (int_cnt2 < (int)G) { 2815 int tail_off1 = int_cnt2<<scale1; 2816 int tail_off2 = int_cnt2<<scale2; 2817 if (ae == StrIntrinsicNode::UL) { 2818 pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2)); 2819 } else { 2820 movdqu(vec, Address(str2, cnt2, scale2, tail_off2)); 2821 } 2822 pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode); 2823 } else { 2824 // calculate index in register to avoid integer overflow (int_cnt2*2) 2825 movl(tmp, int_cnt2); 2826 addptr(tmp, cnt2); 2827 if (ae == StrIntrinsicNode::UL) { 2828 pmovzxbw(vec, Address(str2, tmp, scale2, 0)); 2829 } else { 2830 movdqu(vec, Address(str2, tmp, scale2, 0)); 2831 } 2832 pcmpestri(vec, Address(result, tmp, scale1, 0), mode); 2833 } 2834 // Need to reload strings pointers if not matched whole vector 2835 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 2836 addptr(cnt2, stride); 2837 jcc(Assembler::negative, SCAN_SUBSTR); 2838 // Fall through if found full substring 2839 2840 } // (int_cnt2 > 8) 2841 2842 bind(RET_FOUND); 2843 // Found result if we matched full small substring. 2844 // Compute substr offset 2845 subptr(result, str1); 2846 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 2847 shrl(result, 1); // index 2848 } 2849 bind(EXIT); 2850 2851 } // string_indexofC8 2852 2853 // Small strings are loaded through stack if they cross page boundary. 2854 void C2_MacroAssembler::string_indexof(Register str1, Register str2, 2855 Register cnt1, Register cnt2, 2856 int int_cnt2, Register result, 2857 XMMRegister vec, Register tmp, 2858 int ae) { 2859 ShortBranchVerifier sbv(this); 2860 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2861 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2862 2863 // 2864 // int_cnt2 is length of small (< 8 chars) constant substring 2865 // or (-1) for non constant substring in which case its length 2866 // is in cnt2 register. 2867 // 2868 // Note, inline_string_indexOf() generates checks: 2869 // if (substr.count > string.count) return -1; 2870 // if (substr.count == 0) return 0; 2871 // 2872 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2873 assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0"); 2874 // This method uses the pcmpestri instruction with bound registers 2875 // inputs: 2876 // xmm - substring 2877 // rax - substring length (elements count) 2878 // mem - scanned string 2879 // rdx - string length (elements count) 2880 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2881 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2882 // outputs: 2883 // rcx - matched index in string 2884 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2885 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2886 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2887 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? 
Address::times_1 : scale1; 2888 2889 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR, 2890 RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR, 2891 FOUND_CANDIDATE; 2892 2893 { //======================================================== 2894 // We don't know where these strings are located 2895 // and we can't read beyond them. Load them through stack. 2896 Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR; 2897 2898 movptr(tmp, rsp); // save old SP 2899 2900 if (int_cnt2 > 0) { // small (< 8 chars) constant substring 2901 if (int_cnt2 == (1>>scale2)) { // One byte 2902 assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding"); 2903 load_unsigned_byte(result, Address(str2, 0)); 2904 movdl(vec, result); // move 32 bits 2905 } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes 2906 // Not enough header space in 32-bit VM: 12+3 = 15. 2907 movl(result, Address(str2, -1)); 2908 shrl(result, 8); 2909 movdl(vec, result); // move 32 bits 2910 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char 2911 load_unsigned_short(result, Address(str2, 0)); 2912 movdl(vec, result); // move 32 bits 2913 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars 2914 movdl(vec, Address(str2, 0)); // move 32 bits 2915 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars 2916 movq(vec, Address(str2, 0)); // move 64 bits 2917 } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7}) 2918 // Array header size is 12 bytes in 32-bit VM 2919 // + 6 bytes for 3 chars == 18 bytes, 2920 // enough space to load vec and shift. 2921 assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity"); 2922 if (ae == StrIntrinsicNode::UL) { 2923 int tail_off = int_cnt2-8; 2924 pmovzxbw(vec, Address(str2, tail_off)); 2925 psrldq(vec, -2*tail_off); 2926 } 2927 else { 2928 int tail_off = int_cnt2*(1<<scale2); 2929 movdqu(vec, Address(str2, tail_off-16)); 2930 psrldq(vec, 16-tail_off); 2931 } 2932 } 2933 } else { // not constant substring 2934 cmpl(cnt2, stride); 2935 jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough 2936 2937 // We can read beyond string if srt+16 does not cross page boundary 2938 // since heaps are aligned and mapped by pages. 2939 assert(os::vm_page_size() < (int)G, "default page should be small"); 2940 movl(result, str2); // We need only low 32 bits 2941 andl(result, ((int)os::vm_page_size()-1)); 2942 cmpl(result, ((int)os::vm_page_size()-16)); 2943 jccb(Assembler::belowEqual, CHECK_STR); 2944 2945 // Move small strings to stack to allow load 16 bytes into vec. 2946 subptr(rsp, 16); 2947 int stk_offset = wordSize-(1<<scale2); 2948 push(cnt2); 2949 2950 bind(COPY_SUBSTR); 2951 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) { 2952 load_unsigned_byte(result, Address(str2, cnt2, scale2, -1)); 2953 movb(Address(rsp, cnt2, scale2, stk_offset), result); 2954 } else if (ae == StrIntrinsicNode::UU) { 2955 load_unsigned_short(result, Address(str2, cnt2, scale2, -2)); 2956 movw(Address(rsp, cnt2, scale2, stk_offset), result); 2957 } 2958 decrement(cnt2); 2959 jccb(Assembler::notZero, COPY_SUBSTR); 2960 2961 pop(cnt2); 2962 movptr(str2, rsp); // New substring address 2963 } // non constant 2964 2965 bind(CHECK_STR); 2966 cmpl(cnt1, stride); 2967 jccb(Assembler::aboveEqual, BIG_STRINGS); 2968 2969 // Check cross page boundary. 
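// Reading 16 bytes starting at address p stays within a single page exactly
// when (p & (page_size - 1)) <= page_size - 16. For example, with 4K pages a
// string whose address ends in 0xff0 can be read directly, while one ending
// in 0xff8 would make the 16-byte load cross into the next (possibly
// unmapped) page, so it is copied to the stack first.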
2970 movl(result, str1); // We need only low 32 bits 2971 andl(result, ((int)os::vm_page_size()-1)); 2972 cmpl(result, ((int)os::vm_page_size()-16)); 2973 jccb(Assembler::belowEqual, BIG_STRINGS); 2974 2975 subptr(rsp, 16); 2976 int stk_offset = -(1<<scale1); 2977 if (int_cnt2 < 0) { // not constant 2978 push(cnt2); 2979 stk_offset += wordSize; 2980 } 2981 movl(cnt2, cnt1); 2982 2983 bind(COPY_STR); 2984 if (ae == StrIntrinsicNode::LL) { 2985 load_unsigned_byte(result, Address(str1, cnt2, scale1, -1)); 2986 movb(Address(rsp, cnt2, scale1, stk_offset), result); 2987 } else { 2988 load_unsigned_short(result, Address(str1, cnt2, scale1, -2)); 2989 movw(Address(rsp, cnt2, scale1, stk_offset), result); 2990 } 2991 decrement(cnt2); 2992 jccb(Assembler::notZero, COPY_STR); 2993 2994 if (int_cnt2 < 0) { // not constant 2995 pop(cnt2); 2996 } 2997 movptr(str1, rsp); // New string address 2998 2999 bind(BIG_STRINGS); 3000 // Load substring. 3001 if (int_cnt2 < 0) { // -1 3002 if (ae == StrIntrinsicNode::UL) { 3003 pmovzxbw(vec, Address(str2, 0)); 3004 } else { 3005 movdqu(vec, Address(str2, 0)); 3006 } 3007 push(cnt2); // substr count 3008 push(str2); // substr addr 3009 push(str1); // string addr 3010 } else { 3011 // Small (< 8 chars) constant substrings are loaded already. 3012 movl(cnt2, int_cnt2); 3013 } 3014 push(tmp); // original SP 3015 3016 } // Finished loading 3017 3018 //======================================================== 3019 // Start search 3020 // 3021 3022 movptr(result, str1); // string addr 3023 3024 if (int_cnt2 < 0) { // Only for non constant substring 3025 jmpb(SCAN_TO_SUBSTR); 3026 3027 // SP saved at sp+0 3028 // String saved at sp+1*wordSize 3029 // Substr saved at sp+2*wordSize 3030 // Substr count saved at sp+3*wordSize 3031 3032 // Reload substr for rescan, this code 3033 // is executed only for large substrings (> 8 chars) 3034 bind(RELOAD_SUBSTR); 3035 movptr(str2, Address(rsp, 2*wordSize)); 3036 movl(cnt2, Address(rsp, 3*wordSize)); 3037 if (ae == StrIntrinsicNode::UL) { 3038 pmovzxbw(vec, Address(str2, 0)); 3039 } else { 3040 movdqu(vec, Address(str2, 0)); 3041 } 3042 // We came here after the beginning of the substring was 3043 // matched but the rest of it was not so we need to search 3044 // again. Start from the next element after the previous match. 3045 subptr(str1, result); // Restore counter 3046 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3047 shrl(str1, 1); 3048 } 3049 addl(cnt1, str1); 3050 decrementl(cnt1); // Shift to next element 3051 cmpl(cnt1, cnt2); 3052 jcc(Assembler::negative, RET_NOT_FOUND); // Left less then substring 3053 3054 addptr(result, (1<<scale1)); 3055 } // non constant 3056 3057 // Scan string for start of substr in 16-byte vectors 3058 bind(SCAN_TO_SUBSTR); 3059 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 3060 pcmpestri(vec, Address(result, 0), mode); 3061 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 3062 subl(cnt1, stride); 3063 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 3064 cmpl(cnt1, cnt2); 3065 jccb(Assembler::negative, RET_NOT_FOUND); // Left less then substring 3066 addptr(result, 16); 3067 3068 bind(ADJUST_STR); 3069 cmpl(cnt1, stride); // Do not read beyond string 3070 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 3071 // Back-up string to avoid reading beyond string. 
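// Fewer than 'stride' chars remain here, so the 16-byte window is moved back
// to end exactly at the end of the string; the chars at the start of the
// window that were already scanned are simply scanned again.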
3072 lea(result, Address(result, cnt1, scale1, -16)); 3073 movl(cnt1, stride); 3074 jmpb(SCAN_TO_SUBSTR); 3075 3076 // Found a potential substr 3077 bind(FOUND_CANDIDATE); 3078 // After pcmpestri tmp(rcx) contains matched element index 3079 3080 // Make sure string is still long enough 3081 subl(cnt1, tmp); 3082 cmpl(cnt1, cnt2); 3083 jccb(Assembler::greaterEqual, FOUND_SUBSTR); 3084 // Left less then substring. 3085 3086 bind(RET_NOT_FOUND); 3087 movl(result, -1); 3088 jmp(CLEANUP); 3089 3090 bind(FOUND_SUBSTR); 3091 // Compute start addr of substr 3092 lea(result, Address(result, tmp, scale1)); 3093 if (int_cnt2 > 0) { // Constant substring 3094 // Repeat search for small substring (< 8 chars) 3095 // from new point without reloading substring. 3096 // Have to check that we don't read beyond string. 3097 cmpl(tmp, stride-int_cnt2); 3098 jccb(Assembler::greater, ADJUST_STR); 3099 // Fall through if matched whole substring. 3100 } else { // non constant 3101 assert(int_cnt2 == -1, "should be != 0"); 3102 3103 addl(tmp, cnt2); 3104 // Found result if we matched whole substring. 3105 cmpl(tmp, stride); 3106 jcc(Assembler::lessEqual, RET_FOUND); 3107 3108 // Repeat search for small substring (<= 8 chars) 3109 // from new point 'str1' without reloading substring. 3110 cmpl(cnt2, stride); 3111 // Have to check that we don't read beyond string. 3112 jccb(Assembler::lessEqual, ADJUST_STR); 3113 3114 Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG; 3115 // Compare the rest of substring (> 8 chars). 3116 movptr(str1, result); 3117 3118 cmpl(tmp, cnt2); 3119 // First 8 chars are already matched. 3120 jccb(Assembler::equal, CHECK_NEXT); 3121 3122 bind(SCAN_SUBSTR); 3123 pcmpestri(vec, Address(str1, 0), mode); 3124 // Need to reload strings pointers if not matched whole vector 3125 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 3126 3127 bind(CHECK_NEXT); 3128 subl(cnt2, stride); 3129 jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring 3130 addptr(str1, 16); 3131 if (ae == StrIntrinsicNode::UL) { 3132 addptr(str2, 8); 3133 } else { 3134 addptr(str2, 16); 3135 } 3136 subl(cnt1, stride); 3137 cmpl(cnt2, stride); // Do not read beyond substring 3138 jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR); 3139 // Back-up strings to avoid reading beyond substring. 
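// Same overlap trick for the substring tail: both pointers are moved back so
// that the final loads end exactly at the end of the remaining data (8 bytes
// of latin1 substring for UL, 16 bytes otherwise), and cnt1/cnt2 are adjusted
// to account for the chars that are compared twice.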
3140 3141 if (ae == StrIntrinsicNode::UL) { 3142 lea(str2, Address(str2, cnt2, scale2, -8)); 3143 lea(str1, Address(str1, cnt2, scale1, -16)); 3144 } else { 3145 lea(str2, Address(str2, cnt2, scale2, -16)); 3146 lea(str1, Address(str1, cnt2, scale1, -16)); 3147 } 3148 subl(cnt1, cnt2); 3149 movl(cnt2, stride); 3150 addl(cnt1, stride); 3151 bind(CONT_SCAN_SUBSTR); 3152 if (ae == StrIntrinsicNode::UL) { 3153 pmovzxbw(vec, Address(str2, 0)); 3154 } else { 3155 movdqu(vec, Address(str2, 0)); 3156 } 3157 jmp(SCAN_SUBSTR); 3158 3159 bind(RET_FOUND_LONG); 3160 movptr(str1, Address(rsp, wordSize)); 3161 } // non constant 3162 3163 bind(RET_FOUND); 3164 // Compute substr offset 3165 subptr(result, str1); 3166 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3167 shrl(result, 1); // index 3168 } 3169 bind(CLEANUP); 3170 pop(rsp); // restore SP 3171 3172 } // string_indexof 3173 3174 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3175 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3176 ShortBranchVerifier sbv(this); 3177 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3178 3179 int stride = 8; 3180 3181 Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP, 3182 SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP, 3183 RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT, 3184 FOUND_SEQ_CHAR, DONE_LABEL; 3185 3186 movptr(result, str1); 3187 if (UseAVX >= 2) { 3188 cmpl(cnt1, stride); 3189 jcc(Assembler::less, SCAN_TO_CHAR); 3190 cmpl(cnt1, 2*stride); 3191 jcc(Assembler::less, SCAN_TO_8_CHAR_INIT); 3192 movdl(vec1, ch); 3193 vpbroadcastw(vec1, vec1, Assembler::AVX_256bit); 3194 vpxor(vec2, vec2); 3195 movl(tmp, cnt1); 3196 andl(tmp, 0xFFFFFFF0); //vector count (in chars) 3197 andl(cnt1,0x0000000F); //tail count (in chars) 3198 3199 bind(SCAN_TO_16_CHAR_LOOP); 3200 vmovdqu(vec3, Address(result, 0)); 3201 vpcmpeqw(vec3, vec3, vec1, 1); 3202 vptest(vec2, vec3); 3203 jcc(Assembler::carryClear, FOUND_CHAR); 3204 addptr(result, 32); 3205 subl(tmp, 2*stride); 3206 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); 3207 jmp(SCAN_TO_8_CHAR); 3208 bind(SCAN_TO_8_CHAR_INIT); 3209 movdl(vec1, ch); 3210 pshuflw(vec1, vec1, 0x00); 3211 pshufd(vec1, vec1, 0); 3212 pxor(vec2, vec2); 3213 } 3214 bind(SCAN_TO_8_CHAR); 3215 cmpl(cnt1, stride); 3216 jcc(Assembler::less, SCAN_TO_CHAR); 3217 if (UseAVX < 2) { 3218 movdl(vec1, ch); 3219 pshuflw(vec1, vec1, 0x00); 3220 pshufd(vec1, vec1, 0); 3221 pxor(vec2, vec2); 3222 } 3223 movl(tmp, cnt1); 3224 andl(tmp, 0xFFFFFFF8); //vector count (in chars) 3225 andl(cnt1,0x00000007); //tail count (in chars) 3226 3227 bind(SCAN_TO_8_CHAR_LOOP); 3228 movdqu(vec3, Address(result, 0)); 3229 pcmpeqw(vec3, vec1); 3230 ptest(vec2, vec3); 3231 jcc(Assembler::carryClear, FOUND_CHAR); 3232 addptr(result, 16); 3233 subl(tmp, stride); 3234 jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP); 3235 bind(SCAN_TO_CHAR); 3236 testl(cnt1, cnt1); 3237 jcc(Assembler::zero, RET_NOT_FOUND); 3238 bind(SCAN_TO_CHAR_LOOP); 3239 load_unsigned_short(tmp, Address(result, 0)); 3240 cmpl(ch, tmp); 3241 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3242 addptr(result, 2); 3243 subl(cnt1, 1); 3244 jccb(Assembler::zero, RET_NOT_FOUND); 3245 jmp(SCAN_TO_CHAR_LOOP); 3246 3247 bind(RET_NOT_FOUND); 3248 movl(result, -1); 3249 jmpb(DONE_LABEL); 3250 3251 bind(FOUND_CHAR); 3252 if (UseAVX >= 2) { 3253 vpmovmskb(tmp, vec3); 3254 } else { 3255 pmovmskb(tmp, vec3); 3256 } 3257 bsfl(ch, tmp); 3258 addptr(result, ch); 3259 3260 bind(FOUND_SEQ_CHAR); 3261 
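// result holds the address of the matching char; the code below converts it to
// a zero-based index, (result - str1) / 2, since each UTF-16 element is two bytes.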
subptr(result, str1); 3262 shrl(result, 1); 3263 3264 bind(DONE_LABEL); 3265 } // string_indexof_char 3266 3267 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3268 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3269 ShortBranchVerifier sbv(this); 3270 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3271 3272 int stride = 16; 3273 3274 Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP, 3275 SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP, 3276 RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT, 3277 FOUND_SEQ_CHAR, DONE_LABEL; 3278 3279 movptr(result, str1); 3280 if (UseAVX >= 2) { 3281 cmpl(cnt1, stride); 3282 jcc(Assembler::less, SCAN_TO_CHAR_INIT); 3283 cmpl(cnt1, stride*2); 3284 jcc(Assembler::less, SCAN_TO_16_CHAR_INIT); 3285 movdl(vec1, ch); 3286 vpbroadcastb(vec1, vec1, Assembler::AVX_256bit); 3287 vpxor(vec2, vec2); 3288 movl(tmp, cnt1); 3289 andl(tmp, 0xFFFFFFE0); //vector count (in chars) 3290 andl(cnt1,0x0000001F); //tail count (in chars) 3291 3292 bind(SCAN_TO_32_CHAR_LOOP); 3293 vmovdqu(vec3, Address(result, 0)); 3294 vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit); 3295 vptest(vec2, vec3); 3296 jcc(Assembler::carryClear, FOUND_CHAR); 3297 addptr(result, 32); 3298 subl(tmp, stride*2); 3299 jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP); 3300 jmp(SCAN_TO_16_CHAR); 3301 3302 bind(SCAN_TO_16_CHAR_INIT); 3303 movdl(vec1, ch); 3304 pxor(vec2, vec2); 3305 pshufb(vec1, vec2); 3306 } 3307 3308 bind(SCAN_TO_16_CHAR); 3309 cmpl(cnt1, stride); 3310 jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left 3311 if (UseAVX < 2) { 3312 movdl(vec1, ch); 3313 pxor(vec2, vec2); 3314 pshufb(vec1, vec2); 3315 } 3316 movl(tmp, cnt1); 3317 andl(tmp, 0xFFFFFFF0); //vector count (in bytes) 3318 andl(cnt1,0x0000000F); //tail count (in bytes) 3319 3320 bind(SCAN_TO_16_CHAR_LOOP); 3321 movdqu(vec3, Address(result, 0)); 3322 pcmpeqb(vec3, vec1); 3323 ptest(vec2, vec3); 3324 jcc(Assembler::carryClear, FOUND_CHAR); 3325 addptr(result, 16); 3326 subl(tmp, stride); 3327 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items... 
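// Fewer than 16 bytes remain at this point (the tail count is left in cnt1);
// fall through to the byte-at-a-time tail scan.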
3328 3329 bind(SCAN_TO_CHAR_INIT); 3330 testl(cnt1, cnt1); 3331 jcc(Assembler::zero, RET_NOT_FOUND); 3332 bind(SCAN_TO_CHAR_LOOP); 3333 load_unsigned_byte(tmp, Address(result, 0)); 3334 cmpl(ch, tmp); 3335 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3336 addptr(result, 1); 3337 subl(cnt1, 1); 3338 jccb(Assembler::zero, RET_NOT_FOUND); 3339 jmp(SCAN_TO_CHAR_LOOP); 3340 3341 bind(RET_NOT_FOUND); 3342 movl(result, -1); 3343 jmpb(DONE_LABEL); 3344 3345 bind(FOUND_CHAR); 3346 if (UseAVX >= 2) { 3347 vpmovmskb(tmp, vec3); 3348 } else { 3349 pmovmskb(tmp, vec3); 3350 } 3351 bsfl(ch, tmp); 3352 addptr(result, ch); 3353 3354 bind(FOUND_SEQ_CHAR); 3355 subptr(result, str1); 3356 3357 bind(DONE_LABEL); 3358 } // stringL_indexof_char 3359 3360 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) { 3361 switch (eltype) { 3362 case T_BOOLEAN: return sizeof(jboolean); 3363 case T_BYTE: return sizeof(jbyte); 3364 case T_SHORT: return sizeof(jshort); 3365 case T_CHAR: return sizeof(jchar); 3366 case T_INT: return sizeof(jint); 3367 default: 3368 ShouldNotReachHere(); 3369 return -1; 3370 } 3371 } 3372 3373 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) { 3374 switch (eltype) { 3375 // T_BOOLEAN used as surrogate for unsigned byte 3376 case T_BOOLEAN: movzbl(dst, src); break; 3377 case T_BYTE: movsbl(dst, src); break; 3378 case T_SHORT: movswl(dst, src); break; 3379 case T_CHAR: movzwl(dst, src); break; 3380 case T_INT: movl(dst, src); break; 3381 default: 3382 ShouldNotReachHere(); 3383 } 3384 } 3385 3386 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) { 3387 load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8); 3388 } 3389 3390 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) { 3391 load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8); 3392 } 3393 3394 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) { 3395 const int vlen = Assembler::AVX_256bit; 3396 switch (eltype) { 3397 case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3398 case T_BYTE: vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3399 case T_SHORT: vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3400 case T_CHAR: vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3401 case T_INT: 3402 // do nothing 3403 break; 3404 default: 3405 ShouldNotReachHere(); 3406 } 3407 } 3408 3409 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result, 3410 Register index, Register tmp2, Register tmp3, XMMRegister vnext, 3411 XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3, 3412 XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3, 3413 XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3, 3414 BasicType eltype) { 3415 ShortBranchVerifier sbv(this); 3416 assert(UseAVX >= 2, "AVX2 intrinsics are required"); 3417 assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3); 3418 assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3); 3419 3420 Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN, 3421 SHORT_UNROLLED_LOOP_EXIT, 3422 UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME, 3423 UNROLLED_VECTOR_LOOP_BEGIN, 3424 END; 3425 switch (eltype) { 3426 case T_BOOLEAN: 
BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break; 3427 case T_CHAR: BLOCK_COMMENT("arrays_hashcode(char) {"); break; 3428 case T_BYTE: BLOCK_COMMENT("arrays_hashcode(byte) {"); break; 3429 case T_SHORT: BLOCK_COMMENT("arrays_hashcode(short) {"); break; 3430 case T_INT: BLOCK_COMMENT("arrays_hashcode(int) {"); break; 3431 default: BLOCK_COMMENT("arrays_hashcode {"); break; 3432 } 3433 3434 // For "renaming" for readibility of the code 3435 const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 }, 3436 vresult[] = { vresult0, vresult1, vresult2, vresult3 }, 3437 vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 }; 3438 3439 const int elsize = arrays_hashcode_elsize(eltype); 3440 3441 /* 3442 if (cnt1 >= 2) { 3443 if (cnt1 >= 32) { 3444 UNROLLED VECTOR LOOP 3445 } 3446 UNROLLED SCALAR LOOP 3447 } 3448 SINGLE SCALAR 3449 */ 3450 3451 cmpl(cnt1, 32); 3452 jcc(Assembler::less, SHORT_UNROLLED_BEGIN); 3453 3454 // cnt1 >= 32 && generate_vectorized_loop 3455 xorl(index, index); 3456 3457 // vresult = IntVector.zero(I256); 3458 for (int idx = 0; idx < 4; idx++) { 3459 vpxor(vresult[idx], vresult[idx]); 3460 } 3461 // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]); 3462 Register bound = tmp2; 3463 Register next = tmp3; 3464 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint)))); 3465 movl(next, Address(tmp2, 0)); 3466 movdl(vnext, next); 3467 vpbroadcastd(vnext, vnext, Assembler::AVX_256bit); 3468 3469 // index = 0; 3470 // bound = cnt1 & ~(32 - 1); 3471 movl(bound, cnt1); 3472 andl(bound, ~(32 - 1)); 3473 // for (; index < bound; index += 32) { 3474 bind(UNROLLED_VECTOR_LOOP_BEGIN); 3475 // result *= next; 3476 imull(result, next); 3477 // loop fission to upfront the cost of fetching from memory, OOO execution 3478 // can then hopefully do a better job of prefetching 3479 for (int idx = 0; idx < 4; idx++) { 3480 arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype); 3481 } 3482 // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7]; 3483 for (int idx = 0; idx < 4; idx++) { 3484 vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit); 3485 arrays_hashcode_elvcast(vtmp[idx], eltype); 3486 vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit); 3487 } 3488 // index += 32; 3489 addl(index, 32); 3490 // index < bound; 3491 cmpl(index, bound); 3492 jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN); 3493 // } 3494 3495 lea(ary1, Address(ary1, bound, Address::times(elsize))); 3496 subl(cnt1, bound); 3497 // release bound 3498 3499 // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1); 3500 for (int idx = 0; idx < 4; idx++) { 3501 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint)))); 3502 arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT); 3503 vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit); 3504 } 3505 // result += vresult.reduceLanes(ADD); 3506 for (int idx = 0; idx < 4; idx++) { 3507 reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]); 3508 } 3509 3510 // } else if (cnt1 < 32) { 3511 3512 bind(SHORT_UNROLLED_BEGIN); 3513 // int i = 1; 3514 movl(index, 1); 3515 cmpl(index, cnt1); 3516 jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT); 3517 3518 // for (; i < cnt1 ; i += 2) { 3519 bind(SHORT_UNROLLED_LOOP_BEGIN); 3520 movl(tmp3, 961); 3521 imull(result, tmp3); 3522 
arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3523 movl(tmp3, tmp2); 3524 shll(tmp3, 5); 3525 subl(tmp3, tmp2); 3526 addl(result, tmp3); 3527 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype); 3528 addl(result, tmp3); 3529 addl(index, 2); 3530 cmpl(index, cnt1); 3531 jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN); 3532 3533 // } 3534 // if (i >= cnt1) { 3535 bind(SHORT_UNROLLED_LOOP_EXIT); 3536 jccb(Assembler::greater, END); 3537 movl(tmp2, result); 3538 shll(result, 5); 3539 subl(result, tmp2); 3540 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3541 addl(result, tmp3); 3542 // } 3543 bind(END); 3544 3545 BLOCK_COMMENT("} // arrays_hashcode"); 3546 3547 } // arrays_hashcode 3548 3549 // helper function for string_compare 3550 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2, 3551 Address::ScaleFactor scale, Address::ScaleFactor scale1, 3552 Address::ScaleFactor scale2, Register index, int ae) { 3553 if (ae == StrIntrinsicNode::LL) { 3554 load_unsigned_byte(elem1, Address(str1, index, scale, 0)); 3555 load_unsigned_byte(elem2, Address(str2, index, scale, 0)); 3556 } else if (ae == StrIntrinsicNode::UU) { 3557 load_unsigned_short(elem1, Address(str1, index, scale, 0)); 3558 load_unsigned_short(elem2, Address(str2, index, scale, 0)); 3559 } else { 3560 load_unsigned_byte(elem1, Address(str1, index, scale1, 0)); 3561 load_unsigned_short(elem2, Address(str2, index, scale2, 0)); 3562 } 3563 } 3564 3565 // Compare strings, used for char[] and byte[]. 3566 void C2_MacroAssembler::string_compare(Register str1, Register str2, 3567 Register cnt1, Register cnt2, Register result, 3568 XMMRegister vec1, int ae, KRegister mask) { 3569 ShortBranchVerifier sbv(this); 3570 Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL; 3571 Label COMPARE_WIDE_VECTORS_LOOP_FAILED; // used only AVX3 3572 int stride, stride2, adr_stride, adr_stride1, adr_stride2; 3573 int stride2x2 = 0x40; 3574 Address::ScaleFactor scale = Address::no_scale; 3575 Address::ScaleFactor scale1 = Address::no_scale; 3576 Address::ScaleFactor scale2 = Address::no_scale; 3577 3578 if (ae != StrIntrinsicNode::LL) { 3579 stride2x2 = 0x20; 3580 } 3581 3582 if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) { 3583 shrl(cnt2, 1); 3584 } 3585 // Compute the minimum of the string lengths and the 3586 // difference of the string lengths (stack). 3587 // Do the conditional move stuff 3588 movl(result, cnt1); 3589 subl(cnt1, cnt2); 3590 push(cnt1); 3591 cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2) 3592 3593 // Is the minimum length zero? 
3594 testl(cnt2, cnt2); 3595 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3596 if (ae == StrIntrinsicNode::LL) { 3597 // Load first bytes 3598 load_unsigned_byte(result, Address(str1, 0)); // result = str1[0] 3599 load_unsigned_byte(cnt1, Address(str2, 0)); // cnt1 = str2[0] 3600 } else if (ae == StrIntrinsicNode::UU) { 3601 // Load first characters 3602 load_unsigned_short(result, Address(str1, 0)); 3603 load_unsigned_short(cnt1, Address(str2, 0)); 3604 } else { 3605 load_unsigned_byte(result, Address(str1, 0)); 3606 load_unsigned_short(cnt1, Address(str2, 0)); 3607 } 3608 subl(result, cnt1); 3609 jcc(Assembler::notZero, POP_LABEL); 3610 3611 if (ae == StrIntrinsicNode::UU) { 3612 // Divide length by 2 to get number of chars 3613 shrl(cnt2, 1); 3614 } 3615 cmpl(cnt2, 1); 3616 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3617 3618 // Check if the strings start at the same location and setup scale and stride 3619 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3620 cmpptr(str1, str2); 3621 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3622 if (ae == StrIntrinsicNode::LL) { 3623 scale = Address::times_1; 3624 stride = 16; 3625 } else { 3626 scale = Address::times_2; 3627 stride = 8; 3628 } 3629 } else { 3630 scale1 = Address::times_1; 3631 scale2 = Address::times_2; 3632 // scale not used 3633 stride = 8; 3634 } 3635 3636 if (UseAVX >= 2 && UseSSE42Intrinsics) { 3637 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR; 3638 Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR; 3639 Label COMPARE_WIDE_VECTORS_LOOP_AVX2; 3640 Label COMPARE_TAIL_LONG; 3641 Label COMPARE_WIDE_VECTORS_LOOP_AVX3; // used only AVX3 3642 3643 int pcmpmask = 0x19; 3644 if (ae == StrIntrinsicNode::LL) { 3645 pcmpmask &= ~0x01; 3646 } 3647 3648 // Setup to compare 16-chars (32-bytes) vectors, 3649 // start from first character again because it has aligned address. 3650 if (ae == StrIntrinsicNode::LL) { 3651 stride2 = 32; 3652 } else { 3653 stride2 = 16; 3654 } 3655 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3656 adr_stride = stride << scale; 3657 } else { 3658 adr_stride1 = 8; //stride << scale1; 3659 adr_stride2 = 16; //stride << scale2; 3660 } 3661 3662 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3663 // rax and rdx are used by pcmpestri as elements counters 3664 movl(result, cnt2); 3665 andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count 3666 jcc(Assembler::zero, COMPARE_TAIL_LONG); 3667 3668 // fast path : compare first 2 8-char vectors. 
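// pcmpestri below runs in "equal each" mode with negated result (see pcmpmask),
// so CF==1 (the 'below' branch) means a mismatching element was found and its
// index is left in rcx (cnt1).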
3669 bind(COMPARE_16_CHARS); 3670 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3671 movdqu(vec1, Address(str1, 0)); 3672 } else { 3673 pmovzxbw(vec1, Address(str1, 0)); 3674 } 3675 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3676 jccb(Assembler::below, COMPARE_INDEX_CHAR); 3677 3678 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3679 movdqu(vec1, Address(str1, adr_stride)); 3680 pcmpestri(vec1, Address(str2, adr_stride), pcmpmask); 3681 } else { 3682 pmovzxbw(vec1, Address(str1, adr_stride1)); 3683 pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask); 3684 } 3685 jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS); 3686 addl(cnt1, stride); 3687 3688 // Compare the characters at index in cnt1 3689 bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character 3690 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3691 subl(result, cnt2); 3692 jmp(POP_LABEL); 3693 3694 // Setup the registers to start vector comparison loop 3695 bind(COMPARE_WIDE_VECTORS); 3696 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3697 lea(str1, Address(str1, result, scale)); 3698 lea(str2, Address(str2, result, scale)); 3699 } else { 3700 lea(str1, Address(str1, result, scale1)); 3701 lea(str2, Address(str2, result, scale2)); 3702 } 3703 subl(result, stride2); 3704 subl(cnt2, stride2); 3705 jcc(Assembler::zero, COMPARE_WIDE_TAIL); 3706 negptr(result); 3707 3708 // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest) 3709 bind(COMPARE_WIDE_VECTORS_LOOP); 3710 3711 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 3712 cmpl(cnt2, stride2x2); 3713 jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2); 3714 testl(cnt2, stride2x2-1); // cnt2 holds the vector count 3715 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // means we cannot subtract by 0x40 3716 3717 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 3718 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3719 evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit); 3720 evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3721 } else { 3722 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit); 3723 evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3724 } 3725 kortestql(mask, mask); 3726 jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare 3727 addptr(result, stride2x2); // update since we already compared at this addr 3728 subl(cnt2, stride2x2); // and sub the size too 3729 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3); 3730 3731 vpxor(vec1, vec1); 3732 jmpb(COMPARE_WIDE_TAIL); 3733 }//if (VM_Version::supports_avx512vlbw()) 3734 3735 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3736 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3737 vmovdqu(vec1, Address(str1, result, scale)); 3738 vpxor(vec1, Address(str2, result, scale)); 3739 } else { 3740 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit); 3741 vpxor(vec1, Address(str2, result, scale2)); 3742 } 3743 vptest(vec1, vec1); 3744 jcc(Assembler::notZero, VECTOR_NOT_EQUAL); 3745 addptr(result, stride2); 3746 subl(cnt2, stride2); 3747 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP); 3748 // clean upper bits of YMM registers 3749 vpxor(vec1, vec1); 3750 3751 // compare 
wide vectors tail 3752 bind(COMPARE_WIDE_TAIL); 3753 testptr(result, result); 3754 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3755 3756 movl(result, stride2); 3757 movl(cnt2, result); 3758 negptr(result); 3759 jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3760 3761 // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors. 3762 bind(VECTOR_NOT_EQUAL); 3763 // clean upper bits of YMM registers 3764 vpxor(vec1, vec1); 3765 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3766 lea(str1, Address(str1, result, scale)); 3767 lea(str2, Address(str2, result, scale)); 3768 } else { 3769 lea(str1, Address(str1, result, scale1)); 3770 lea(str2, Address(str2, result, scale2)); 3771 } 3772 jmp(COMPARE_16_CHARS); 3773 3774 // Compare tail chars, length between 1 and 15 chars 3775 bind(COMPARE_TAIL_LONG); 3776 movl(cnt2, result); 3777 cmpl(cnt2, stride); 3778 jcc(Assembler::less, COMPARE_SMALL_STR); 3779 3780 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3781 movdqu(vec1, Address(str1, 0)); 3782 } else { 3783 pmovzxbw(vec1, Address(str1, 0)); 3784 } 3785 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3786 jcc(Assembler::below, COMPARE_INDEX_CHAR); 3787 subptr(cnt2, stride); 3788 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3789 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3790 lea(str1, Address(str1, result, scale)); 3791 lea(str2, Address(str2, result, scale)); 3792 } else { 3793 lea(str1, Address(str1, result, scale1)); 3794 lea(str2, Address(str2, result, scale2)); 3795 } 3796 negptr(cnt2); 3797 jmpb(WHILE_HEAD_LABEL); 3798 3799 bind(COMPARE_SMALL_STR); 3800 } else if (UseSSE42Intrinsics) { 3801 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL; 3802 int pcmpmask = 0x19; 3803 // Setup to compare 8-char (16-byte) vectors, 3804 // start from first character again because it has aligned address.
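// result (rax) keeps the full element count for the tail handling; cnt2 (rdx)
// is rounded down below to a whole number of vectors (a multiple of stride) and
// used as the vector-loop count.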
3805 movl(result, cnt2); 3806 andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count 3807 if (ae == StrIntrinsicNode::LL) { 3808 pcmpmask &= ~0x01; 3809 } 3810 jcc(Assembler::zero, COMPARE_TAIL); 3811 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3812 lea(str1, Address(str1, result, scale)); 3813 lea(str2, Address(str2, result, scale)); 3814 } else { 3815 lea(str1, Address(str1, result, scale1)); 3816 lea(str2, Address(str2, result, scale2)); 3817 } 3818 negptr(result); 3819 3820 // pcmpestri 3821 // inputs: 3822 // vec1- substring 3823 // rax - negative string length (elements count) 3824 // mem - scanned string 3825 // rdx - string length (elements count) 3826 // pcmpmask - cmp mode: 11000 (string compare with negated result) 3827 // + 00 (unsigned bytes) or + 01 (unsigned shorts) 3828 // outputs: 3829 // rcx - first mismatched element index 3830 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3831 3832 bind(COMPARE_WIDE_VECTORS); 3833 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3834 movdqu(vec1, Address(str1, result, scale)); 3835 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 3836 } else { 3837 pmovzxbw(vec1, Address(str1, result, scale1)); 3838 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 3839 } 3840 // After pcmpestri cnt1(rcx) contains mismatched element index 3841 3842 jccb(Assembler::below, VECTOR_NOT_EQUAL); // CF==1 3843 addptr(result, stride); 3844 subptr(cnt2, stride); 3845 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 3846 3847 // compare wide vectors tail 3848 testptr(result, result); 3849 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3850 3851 movl(cnt2, stride); 3852 movl(result, stride); 3853 negptr(result); 3854 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3855 movdqu(vec1, Address(str1, result, scale)); 3856 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 3857 } else { 3858 pmovzxbw(vec1, Address(str1, result, scale1)); 3859 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 3860 } 3861 jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL); 3862 3863 // Mismatched characters in the vectors 3864 bind(VECTOR_NOT_EQUAL); 3865 addptr(cnt1, result); 3866 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3867 subl(result, cnt2); 3868 jmpb(POP_LABEL); 3869 3870 bind(COMPARE_TAIL); // limit is zero 3871 movl(cnt2, result); 3872 // Fallthru to tail compare 3873 } 3874 // Shift str2 and str1 to the end of the arrays, negate min 3875 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3876 lea(str1, Address(str1, cnt2, scale)); 3877 lea(str2, Address(str2, cnt2, scale)); 3878 } else { 3879 lea(str1, Address(str1, cnt2, scale1)); 3880 lea(str2, Address(str2, cnt2, scale2)); 3881 } 3882 decrementl(cnt2); // first character was compared already 3883 negptr(cnt2); 3884 3885 // Compare the rest of the elements 3886 bind(WHILE_HEAD_LABEL); 3887 load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae); 3888 subl(result, cnt1); 3889 jccb(Assembler::notZero, POP_LABEL); 3890 increment(cnt2); 3891 jccb(Assembler::notZero, WHILE_HEAD_LABEL); 3892 3893 // Strings are equal up to min length. Return the length difference. 
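// The length difference (cnt1 - cnt2) pushed at the start of string_compare is
// popped into result here, rebalancing the stack; for UU it is a byte difference
// and is shifted right below to give a char difference.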
3894 bind(LENGTH_DIFF_LABEL); 3895 pop(result); 3896 if (ae == StrIntrinsicNode::UU) { 3897 // Divide diff by 2 to get number of chars 3898 sarl(result, 1); 3899 } 3900 jmpb(DONE_LABEL); 3901 3902 if (VM_Version::supports_avx512vlbw()) { 3903 3904 bind(COMPARE_WIDE_VECTORS_LOOP_FAILED); 3905 3906 kmovql(cnt1, mask); 3907 notq(cnt1); 3908 bsfq(cnt2, cnt1); 3909 if (ae != StrIntrinsicNode::LL) { 3910 // Divide diff by 2 to get number of chars 3911 sarl(cnt2, 1); 3912 } 3913 addq(result, cnt2); 3914 if (ae == StrIntrinsicNode::LL) { 3915 load_unsigned_byte(cnt1, Address(str2, result)); 3916 load_unsigned_byte(result, Address(str1, result)); 3917 } else if (ae == StrIntrinsicNode::UU) { 3918 load_unsigned_short(cnt1, Address(str2, result, scale)); 3919 load_unsigned_short(result, Address(str1, result, scale)); 3920 } else { 3921 load_unsigned_short(cnt1, Address(str2, result, scale2)); 3922 load_unsigned_byte(result, Address(str1, result, scale1)); 3923 } 3924 subl(result, cnt1); 3925 jmpb(POP_LABEL); 3926 }//if (VM_Version::supports_avx512vlbw()) 3927 3928 // Discard the stored length difference 3929 bind(POP_LABEL); 3930 pop(cnt1); 3931 3932 // That's it 3933 bind(DONE_LABEL); 3934 if(ae == StrIntrinsicNode::UL) { 3935 negl(result); 3936 } 3937 3938 } 3939 3940 // Search for Non-ASCII character (Negative byte value) in a byte array, 3941 // return the index of the first such character, otherwise the length 3942 // of the array segment searched. 3943 // ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java 3944 // @IntrinsicCandidate 3945 // public static int countPositives(byte[] ba, int off, int len) { 3946 // for (int i = off; i < off + len; i++) { 3947 // if (ba[i] < 0) { 3948 // return i - off; 3949 // } 3950 // } 3951 // return len; 3952 // } 3953 void C2_MacroAssembler::count_positives(Register ary1, Register len, 3954 Register result, Register tmp1, 3955 XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) { 3956 // rsi: byte array 3957 // rcx: len 3958 // rax: result 3959 ShortBranchVerifier sbv(this); 3960 assert_different_registers(ary1, len, result, tmp1); 3961 assert_different_registers(vec1, vec2); 3962 Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE; 3963 3964 movl(result, len); // copy 3965 // len == 0 3966 testl(len, len); 3967 jcc(Assembler::zero, DONE); 3968 3969 if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512 3970 VM_Version::supports_avx512vlbw() && 3971 VM_Version::supports_bmi2()) { 3972 3973 Label test_64_loop, test_tail, BREAK_LOOP; 3974 movl(tmp1, len); 3975 vpxor(vec2, vec2, vec2, Assembler::AVX_512bit); 3976 3977 andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F 3978 andl(len, 0xffffffc0); // vector count (in chars) 3979 jccb(Assembler::zero, test_tail); 3980 3981 lea(ary1, Address(ary1, len, Address::times_1)); 3982 negptr(len); 3983 3984 bind(test_64_loop); 3985 // Check whether our 64 elements of size byte contain negatives 3986 evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit); 3987 kortestql(mask1, mask1); 3988 jcc(Assembler::notZero, BREAK_LOOP); 3989 3990 addptr(len, 64); 3991 jccb(Assembler::notZero, test_64_loop); 3992 3993 bind(test_tail); 3994 // bail out when there is nothing to be done 3995 testl(tmp1, -1); 3996 jcc(Assembler::zero, DONE); 3997 3998 3999 // check the tail for absense of negatives 4000 // ~(~0 << len) applied up to two times (for 32-bit scenario) 4001 { 4002 Register tmp3_aliased = len; 4003 mov64(tmp3_aliased, 
0xFFFFFFFFFFFFFFFF); 4004 shlxq(tmp3_aliased, tmp3_aliased, tmp1); 4005 notq(tmp3_aliased); 4006 kmovql(mask2, tmp3_aliased); 4007 } 4008 4009 evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit); 4010 ktestq(mask1, mask2); 4011 jcc(Assembler::zero, DONE); 4012 4013 // do a full check for negative registers in the tail 4014 movl(len, tmp1); // tmp1 holds low 6-bit from original len; 4015 // ary1 already pointing to the right place 4016 jmpb(TAIL_START); 4017 4018 bind(BREAK_LOOP); 4019 // At least one byte in the last 64 byte block was negative. 4020 // Set up to look at the last 64 bytes as if they were a tail 4021 lea(ary1, Address(ary1, len, Address::times_1)); 4022 addptr(result, len); 4023 // Ignore the very last byte: if all others are positive, 4024 // it must be negative, so we can skip right to the 2+1 byte 4025 // end comparison at this point 4026 orl(result, 63); 4027 movl(len, 63); 4028 // Fallthru to tail compare 4029 } else { 4030 4031 if (UseAVX >= 2) { 4032 // With AVX2, use 32-byte vector compare 4033 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 4034 4035 // Compare 32-byte vectors 4036 testl(len, 0xffffffe0); // vector count (in bytes) 4037 jccb(Assembler::zero, TAIL_START); 4038 4039 andl(len, 0xffffffe0); 4040 lea(ary1, Address(ary1, len, Address::times_1)); 4041 negptr(len); 4042 4043 movl(tmp1, 0x80808080); // create mask to test for Unicode chars in vector 4044 movdl(vec2, tmp1); 4045 vpbroadcastd(vec2, vec2, Assembler::AVX_256bit); 4046 4047 bind(COMPARE_WIDE_VECTORS); 4048 vmovdqu(vec1, Address(ary1, len, Address::times_1)); 4049 vptest(vec1, vec2); 4050 jccb(Assembler::notZero, BREAK_LOOP); 4051 addptr(len, 32); 4052 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4053 4054 testl(result, 0x0000001f); // any bytes remaining? 4055 jcc(Assembler::zero, DONE); 4056 4057 // Quick test using the already prepared vector mask 4058 movl(len, result); 4059 andl(len, 0x0000001f); 4060 vmovdqu(vec1, Address(ary1, len, Address::times_1, -32)); 4061 vptest(vec1, vec2); 4062 jcc(Assembler::zero, DONE); 4063 // There are zeros, jump to the tail to determine exactly where 4064 jmpb(TAIL_START); 4065 4066 bind(BREAK_LOOP); 4067 // At least one byte in the last 32-byte vector is negative. 4068 // Set up to look at the last 32 bytes as if they were a tail 4069 lea(ary1, Address(ary1, len, Address::times_1)); 4070 addptr(result, len); 4071 // Ignore the very last byte: if all others are positive, 4072 // it must be negative, so we can skip right to the 2+1 byte 4073 // end comparison at this point 4074 orl(result, 31); 4075 movl(len, 31); 4076 // Fallthru to tail compare 4077 } else if (UseSSE42Intrinsics) { 4078 // With SSE4.2, use double quad vector compare 4079 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 4080 4081 // Compare 16-byte vectors 4082 testl(len, 0xfffffff0); // vector count (in bytes) 4083 jcc(Assembler::zero, TAIL_START); 4084 4085 andl(len, 0xfffffff0); 4086 lea(ary1, Address(ary1, len, Address::times_1)); 4087 negptr(len); 4088 4089 movl(tmp1, 0x80808080); 4090 movdl(vec2, tmp1); 4091 pshufd(vec2, vec2, 0); 4092 4093 bind(COMPARE_WIDE_VECTORS); 4094 movdqu(vec1, Address(ary1, len, Address::times_1)); 4095 ptest(vec1, vec2); 4096 jccb(Assembler::notZero, BREAK_LOOP); 4097 addptr(len, 16); 4098 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4099 4100 testl(result, 0x0000000f); // len is zero, any bytes remaining? 
4101 jcc(Assembler::zero, DONE); 4102 4103 // Quick test using the already prepared vector mask 4104 movl(len, result); 4105 andl(len, 0x0000000f); // tail count (in bytes) 4106 movdqu(vec1, Address(ary1, len, Address::times_1, -16)); 4107 ptest(vec1, vec2); 4108 jcc(Assembler::zero, DONE); 4109 jmpb(TAIL_START); 4110 4111 bind(BREAK_LOOP); 4112 // At least one byte in the last 16-byte vector is negative. 4113 // Set up and look at the last 16 bytes as if they were a tail 4114 lea(ary1, Address(ary1, len, Address::times_1)); 4115 addptr(result, len); 4116 // Ignore the very last byte: if all others are positive, 4117 // it must be negative, so we can skip right to the 2+1 byte 4118 // end comparison at this point 4119 orl(result, 15); 4120 movl(len, 15); 4121 // Fallthru to tail compare 4122 } 4123 } 4124 4125 bind(TAIL_START); 4126 // Compare 4-byte vectors 4127 andl(len, 0xfffffffc); // vector count (in bytes) 4128 jccb(Assembler::zero, COMPARE_CHAR); 4129 4130 lea(ary1, Address(ary1, len, Address::times_1)); 4131 negptr(len); 4132 4133 bind(COMPARE_VECTORS); 4134 movl(tmp1, Address(ary1, len, Address::times_1)); 4135 andl(tmp1, 0x80808080); 4136 jccb(Assembler::notZero, TAIL_ADJUST); 4137 addptr(len, 4); 4138 jccb(Assembler::notZero, COMPARE_VECTORS); 4139 4140 // Compare trailing char (final 2-3 bytes), if any 4141 bind(COMPARE_CHAR); 4142 4143 testl(result, 0x2); // tail char 4144 jccb(Assembler::zero, COMPARE_BYTE); 4145 load_unsigned_short(tmp1, Address(ary1, 0)); 4146 andl(tmp1, 0x00008080); 4147 jccb(Assembler::notZero, CHAR_ADJUST); 4148 lea(ary1, Address(ary1, 2)); 4149 4150 bind(COMPARE_BYTE); 4151 testl(result, 0x1); // tail byte 4152 jccb(Assembler::zero, DONE); 4153 load_unsigned_byte(tmp1, Address(ary1, 0)); 4154 testl(tmp1, 0x00000080); 4155 jccb(Assembler::zero, DONE); 4156 subptr(result, 1); 4157 jmpb(DONE); 4158 4159 bind(TAIL_ADJUST); 4160 // there are negative bits in the last 4 byte block. 4161 // Adjust result and check the next three bytes 4162 addptr(result, len); 4163 orl(result, 3); 4164 lea(ary1, Address(ary1, len, Address::times_1)); 4165 jmpb(COMPARE_CHAR); 4166 4167 bind(CHAR_ADJUST); 4168 // We are looking at a char + optional byte tail, and found that one 4169 // of the bytes in the char is negative. Adjust the result, check the 4170 // first byte and readjust if needed. 4171 andl(result, 0xfffffffc); 4172 testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first 4173 jccb(Assembler::notZero, DONE); 4174 addptr(result, 1); 4175 4176 // That's it 4177 bind(DONE); 4178 if (UseAVX >= 2) { 4179 // clean upper bits of YMM registers 4180 vpxor(vec1, vec1); 4181 vpxor(vec2, vec2); 4182 } 4183 } 4184 4185 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings. 4186 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2, 4187 Register limit, Register result, Register chr, 4188 XMMRegister vec1, XMMRegister vec2, bool is_char, 4189 KRegister mask, bool expand_ary2) { 4190 // for expand_ary2, limit is the (smaller) size of the second array. 4191 ShortBranchVerifier sbv(this); 4192 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE; 4193 4194 assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)), 4195 "Expansion only implemented for AVX2"); 4196 4197 int length_offset = arrayOopDesc::length_offset_in_bytes(); 4198 int base_offset = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE); 4199 4200 Address::ScaleFactor scaleFactor = expand_ary2 ? 
Address::times_2 : Address::times_1; 4201 int scaleIncr = expand_ary2 ? 8 : 16; 4202 4203 if (is_array_equ) { 4204 // Check the input args 4205 cmpoop(ary1, ary2); 4206 jcc(Assembler::equal, TRUE_LABEL); 4207 4208 // Need additional checks for arrays_equals. 4209 testptr(ary1, ary1); 4210 jcc(Assembler::zero, FALSE_LABEL); 4211 testptr(ary2, ary2); 4212 jcc(Assembler::zero, FALSE_LABEL); 4213 4214 // Check the lengths 4215 movl(limit, Address(ary1, length_offset)); 4216 cmpl(limit, Address(ary2, length_offset)); 4217 jcc(Assembler::notEqual, FALSE_LABEL); 4218 } 4219 4220 // count == 0 4221 testl(limit, limit); 4222 jcc(Assembler::zero, TRUE_LABEL); 4223 4224 if (is_array_equ) { 4225 // Load array address 4226 lea(ary1, Address(ary1, base_offset)); 4227 lea(ary2, Address(ary2, base_offset)); 4228 } 4229 4230 if (is_array_equ && is_char) { 4231 // arrays_equals when used for char[]. 4232 shll(limit, 1); // byte count != 0 4233 } 4234 movl(result, limit); // copy 4235 4236 if (UseAVX >= 2) { 4237 // With AVX2, use 32-byte vector compare 4238 Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16; 4239 4240 // Compare 32-byte vectors 4241 if (expand_ary2) { 4242 andl(result, 0x0000000f); // tail count (in bytes) 4243 andl(limit, 0xfffffff0); // vector count (in bytes) 4244 jcc(Assembler::zero, COMPARE_TAIL); 4245 } else { 4246 andl(result, 0x0000001f); // tail count (in bytes) 4247 andl(limit, 0xffffffe0); // vector count (in bytes) 4248 jcc(Assembler::zero, COMPARE_TAIL_16); 4249 } 4250 4251 lea(ary1, Address(ary1, limit, scaleFactor)); 4252 lea(ary2, Address(ary2, limit, Address::times_1)); 4253 negptr(limit); 4254 4255 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 4256 Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3; 4257 4258 cmpl(limit, -64); 4259 jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2); 4260 4261 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 4262 4263 evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit); 4264 evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit); 4265 kortestql(mask, mask); 4266 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4267 addptr(limit, 64); // update since we already compared at this addr 4268 cmpl(limit, -64); 4269 jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3); 4270 4271 // At this point we may still need to compare -limit+result bytes. 4272 // We could execute the next two instruction and just continue via non-wide path: 4273 // cmpl(limit, 0); 4274 // jcc(Assembler::equal, COMPARE_TAIL); // true 4275 // But since we stopped at the points ary{1,2}+limit which are 4276 // not farther than 64 bytes from the ends of arrays ary{1,2}+result 4277 // (|limit| <= 32 and result < 32), 4278 // we may just compare the last 64 bytes. 
4279 // 4280 addptr(result, -64); // it is safe, bc we just came from this area 4281 evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit); 4282 evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit); 4283 kortestql(mask, mask); 4284 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4285 4286 jmp(TRUE_LABEL); 4287 4288 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 4289 4290 }//if (VM_Version::supports_avx512vlbw()) 4291 4292 bind(COMPARE_WIDE_VECTORS); 4293 vmovdqu(vec1, Address(ary1, limit, scaleFactor)); 4294 if (expand_ary2) { 4295 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit); 4296 } else { 4297 vmovdqu(vec2, Address(ary2, limit, Address::times_1)); 4298 } 4299 vpxor(vec1, vec2); 4300 4301 vptest(vec1, vec1); 4302 jcc(Assembler::notZero, FALSE_LABEL); 4303 addptr(limit, scaleIncr * 2); 4304 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4305 4306 testl(result, result); 4307 jcc(Assembler::zero, TRUE_LABEL); 4308 4309 vmovdqu(vec1, Address(ary1, result, scaleFactor, -32)); 4310 if (expand_ary2) { 4311 vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit); 4312 } else { 4313 vmovdqu(vec2, Address(ary2, result, Address::times_1, -32)); 4314 } 4315 vpxor(vec1, vec2); 4316 4317 vptest(vec1, vec1); 4318 jcc(Assembler::notZero, FALSE_LABEL); 4319 jmp(TRUE_LABEL); 4320 4321 bind(COMPARE_TAIL_16); // limit is zero 4322 movl(limit, result); 4323 4324 // Compare 16-byte chunks 4325 andl(result, 0x0000000f); // tail count (in bytes) 4326 andl(limit, 0xfffffff0); // vector count (in bytes) 4327 jcc(Assembler::zero, COMPARE_TAIL); 4328 4329 lea(ary1, Address(ary1, limit, scaleFactor)); 4330 lea(ary2, Address(ary2, limit, Address::times_1)); 4331 negptr(limit); 4332 4333 bind(COMPARE_WIDE_VECTORS_16); 4334 movdqu(vec1, Address(ary1, limit, scaleFactor)); 4335 if (expand_ary2) { 4336 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit); 4337 } else { 4338 movdqu(vec2, Address(ary2, limit, Address::times_1)); 4339 } 4340 pxor(vec1, vec2); 4341 4342 ptest(vec1, vec1); 4343 jcc(Assembler::notZero, FALSE_LABEL); 4344 addptr(limit, scaleIncr); 4345 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16); 4346 4347 bind(COMPARE_TAIL); // limit is zero 4348 movl(limit, result); 4349 // Fallthru to tail compare 4350 } else if (UseSSE42Intrinsics) { 4351 // With SSE4.2, use double quad vector compare 4352 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 4353 4354 // Compare 16-byte vectors 4355 andl(result, 0x0000000f); // tail count (in bytes) 4356 andl(limit, 0xfffffff0); // vector count (in bytes) 4357 jcc(Assembler::zero, COMPARE_TAIL); 4358 4359 lea(ary1, Address(ary1, limit, Address::times_1)); 4360 lea(ary2, Address(ary2, limit, Address::times_1)); 4361 negptr(limit); 4362 4363 bind(COMPARE_WIDE_VECTORS); 4364 movdqu(vec1, Address(ary1, limit, Address::times_1)); 4365 movdqu(vec2, Address(ary2, limit, Address::times_1)); 4366 pxor(vec1, vec2); 4367 4368 ptest(vec1, vec1); 4369 jcc(Assembler::notZero, FALSE_LABEL); 4370 addptr(limit, 16); 4371 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4372 4373 testl(result, result); 4374 jcc(Assembler::zero, TRUE_LABEL); 4375 4376 movdqu(vec1, Address(ary1, result, Address::times_1, -16)); 4377 movdqu(vec2, Address(ary2, result, Address::times_1, -16)); 4378 pxor(vec1, vec2); 4379 4380 ptest(vec1, vec1); 4381 jccb(Assembler::notZero, FALSE_LABEL); 4382 jmpb(TRUE_LABEL); 4383 4384 bind(COMPARE_TAIL); // limit is zero 4385 movl(limit, 
result); 4386 // Fallthru to tail compare 4387 } 4388 4389 // Compare 4-byte vectors 4390 if (expand_ary2) { 4391 testl(result, result); 4392 jccb(Assembler::zero, TRUE_LABEL); 4393 } else { 4394 andl(limit, 0xfffffffc); // vector count (in bytes) 4395 jccb(Assembler::zero, COMPARE_CHAR); 4396 } 4397 4398 lea(ary1, Address(ary1, limit, scaleFactor)); 4399 lea(ary2, Address(ary2, limit, Address::times_1)); 4400 negptr(limit); 4401 4402 bind(COMPARE_VECTORS); 4403 if (expand_ary2) { 4404 // There are no "vector" operations for bytes to shorts 4405 movzbl(chr, Address(ary2, limit, Address::times_1)); 4406 cmpw(Address(ary1, limit, Address::times_2), chr); 4407 jccb(Assembler::notEqual, FALSE_LABEL); 4408 addptr(limit, 1); 4409 jcc(Assembler::notZero, COMPARE_VECTORS); 4410 jmp(TRUE_LABEL); 4411 } else { 4412 movl(chr, Address(ary1, limit, Address::times_1)); 4413 cmpl(chr, Address(ary2, limit, Address::times_1)); 4414 jccb(Assembler::notEqual, FALSE_LABEL); 4415 addptr(limit, 4); 4416 jcc(Assembler::notZero, COMPARE_VECTORS); 4417 } 4418 4419 // Compare trailing char (final 2 bytes), if any 4420 bind(COMPARE_CHAR); 4421 testl(result, 0x2); // tail char 4422 jccb(Assembler::zero, COMPARE_BYTE); 4423 load_unsigned_short(chr, Address(ary1, 0)); 4424 load_unsigned_short(limit, Address(ary2, 0)); 4425 cmpl(chr, limit); 4426 jccb(Assembler::notEqual, FALSE_LABEL); 4427 4428 if (is_array_equ && is_char) { 4429 bind(COMPARE_BYTE); 4430 } else { 4431 lea(ary1, Address(ary1, 2)); 4432 lea(ary2, Address(ary2, 2)); 4433 4434 bind(COMPARE_BYTE); 4435 testl(result, 0x1); // tail byte 4436 jccb(Assembler::zero, TRUE_LABEL); 4437 load_unsigned_byte(chr, Address(ary1, 0)); 4438 load_unsigned_byte(limit, Address(ary2, 0)); 4439 cmpl(chr, limit); 4440 jccb(Assembler::notEqual, FALSE_LABEL); 4441 } 4442 bind(TRUE_LABEL); 4443 movl(result, 1); // return true 4444 jmpb(DONE); 4445 4446 bind(FALSE_LABEL); 4447 xorl(result, result); // return false 4448 4449 // That's it 4450 bind(DONE); 4451 if (UseAVX >= 2) { 4452 // clean upper bits of YMM registers 4453 vpxor(vec1, vec1); 4454 vpxor(vec2, vec2); 4455 } 4456 } 4457 4458 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) { 4459 #define __ masm. 4460 Register dst = stub.data<0>(); 4461 XMMRegister src = stub.data<1>(); 4462 address target = stub.data<2>(); 4463 __ bind(stub.entry()); 4464 __ subptr(rsp, 8); 4465 __ movdbl(Address(rsp), src); 4466 __ call(RuntimeAddress(target)); 4467 // APX REX2 encoding for pop(dst) increases the stub size by 1 byte. 
4468 __ pop(dst); 4469 __ jmp(stub.continuation()); 4470 #undef __ 4471 } 4472 4473 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) { 4474 assert(dst_bt == T_INT || dst_bt == T_LONG, ""); 4475 assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, ""); 4476 4477 address slowpath_target; 4478 if (dst_bt == T_INT) { 4479 if (src_bt == T_FLOAT) { 4480 cvttss2sil(dst, src); 4481 cmpl(dst, 0x80000000); 4482 slowpath_target = StubRoutines::x86::f2i_fixup(); 4483 } else { 4484 cvttsd2sil(dst, src); 4485 cmpl(dst, 0x80000000); 4486 slowpath_target = StubRoutines::x86::d2i_fixup(); 4487 } 4488 } else { 4489 if (src_bt == T_FLOAT) { 4490 cvttss2siq(dst, src); 4491 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4492 slowpath_target = StubRoutines::x86::f2l_fixup(); 4493 } else { 4494 cvttsd2siq(dst, src); 4495 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4496 slowpath_target = StubRoutines::x86::d2l_fixup(); 4497 } 4498 } 4499 4500 // Using the APX extended general purpose registers increases the instruction encoding size by 1 byte. 4501 int max_size = 23 + (UseAPX ? 1 : 0); 4502 auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, max_size, convertF2I_slowpath); 4503 jcc(Assembler::equal, stub->entry()); 4504 bind(stub->continuation()); 4505 } 4506 4507 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4508 XMMRegister src1, int imm8, bool merge, int vlen_enc) { 4509 switch(ideal_opc) { 4510 case Op_LShiftVS: 4511 Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break; 4512 case Op_LShiftVI: 4513 Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break; 4514 case Op_LShiftVL: 4515 Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break; 4516 case Op_RShiftVS: 4517 Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break; 4518 case Op_RShiftVI: 4519 Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break; 4520 case Op_RShiftVL: 4521 Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break; 4522 case Op_URShiftVS: 4523 Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break; 4524 case Op_URShiftVI: 4525 Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break; 4526 case Op_URShiftVL: 4527 Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break; 4528 case Op_RotateRightV: 4529 evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4530 case Op_RotateLeftV: 4531 evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4532 default: 4533 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]); 4534 break; 4535 } 4536 } 4537 4538 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1, 4539 XMMRegister src2, bool is_unsigned, bool merge, int vlen_enc) { 4540 if (is_unsigned) { 4541 evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4542 } else { 4543 evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4544 } 4545 } 4546 4547 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4548 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) { 4549 switch (elem_bt) { 4550 case T_BYTE: 4551 if (ideal_opc == Op_SaturatingAddV) { 4552 evpaddsb(dst, mask, src1, src2, merge, vlen_enc); 4553 } else { 4554 
assert(ideal_opc == Op_SaturatingSubV, ""); 4555 evpsubsb(dst, mask, src1, src2, merge, vlen_enc); 4556 } 4557 break; 4558 case T_SHORT: 4559 if (ideal_opc == Op_SaturatingAddV) { 4560 evpaddsw(dst, mask, src1, src2, merge, vlen_enc); 4561 } else { 4562 assert(ideal_opc == Op_SaturatingSubV, ""); 4563 evpsubsw(dst, mask, src1, src2, merge, vlen_enc); 4564 } 4565 break; 4566 default: 4567 fatal("Unsupported type %s", type2name(elem_bt)); 4568 break; 4569 } 4570 } 4571 4572 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4573 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) { 4574 switch (elem_bt) { 4575 case T_BYTE: 4576 if (ideal_opc == Op_SaturatingAddV) { 4577 evpaddusb(dst, mask, src1, src2, merge, vlen_enc); 4578 } else { 4579 assert(ideal_opc == Op_SaturatingSubV, ""); 4580 evpsubusb(dst, mask, src1, src2, merge, vlen_enc); 4581 } 4582 break; 4583 case T_SHORT: 4584 if (ideal_opc == Op_SaturatingAddV) { 4585 evpaddusw(dst, mask, src1, src2, merge, vlen_enc); 4586 } else { 4587 assert(ideal_opc == Op_SaturatingSubV, ""); 4588 evpsubusw(dst, mask, src1, src2, merge, vlen_enc); 4589 } 4590 break; 4591 default: 4592 fatal("Unsupported type %s", type2name(elem_bt)); 4593 break; 4594 } 4595 } 4596 4597 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1, 4598 Address src2, bool is_unsigned, bool merge, int vlen_enc) { 4599 if (is_unsigned) { 4600 evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4601 } else { 4602 evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4603 } 4604 } 4605 4606 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4607 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4608 switch (elem_bt) { 4609 case T_BYTE: 4610 if (ideal_opc == Op_SaturatingAddV) { 4611 evpaddsb(dst, mask, src1, src2, merge, vlen_enc); 4612 } else { 4613 assert(ideal_opc == Op_SaturatingSubV, ""); 4614 evpsubsb(dst, mask, src1, src2, merge, vlen_enc); 4615 } 4616 break; 4617 case T_SHORT: 4618 if (ideal_opc == Op_SaturatingAddV) { 4619 evpaddsw(dst, mask, src1, src2, merge, vlen_enc); 4620 } else { 4621 assert(ideal_opc == Op_SaturatingSubV, ""); 4622 evpsubsw(dst, mask, src1, src2, merge, vlen_enc); 4623 } 4624 break; 4625 default: 4626 fatal("Unsupported type %s", type2name(elem_bt)); 4627 break; 4628 } 4629 } 4630 4631 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4632 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4633 switch (elem_bt) { 4634 case T_BYTE: 4635 if (ideal_opc == Op_SaturatingAddV) { 4636 evpaddusb(dst, mask, src1, src2, merge, vlen_enc); 4637 } else { 4638 assert(ideal_opc == Op_SaturatingSubV, ""); 4639 evpsubusb(dst, mask, src1, src2, merge, vlen_enc); 4640 } 4641 break; 4642 case T_SHORT: 4643 if (ideal_opc == Op_SaturatingAddV) { 4644 evpaddusw(dst, mask, src1, src2, merge, vlen_enc); 4645 } else { 4646 assert(ideal_opc == Op_SaturatingSubV, ""); 4647 evpsubusw(dst, mask, src1, src2, merge, vlen_enc); 4648 } 4649 break; 4650 default: 4651 fatal("Unsupported type %s", type2name(elem_bt)); 4652 break; 4653 } 4654 } 4655 4656 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4657 XMMRegister src1, XMMRegister src2, 
bool merge, int vlen_enc, 4658 bool is_varshift) { 4659 switch (ideal_opc) { 4660 case Op_AddVB: 4661 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4662 case Op_AddVS: 4663 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4664 case Op_AddVI: 4665 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4666 case Op_AddVL: 4667 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4668 case Op_AddVF: 4669 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4670 case Op_AddVD: 4671 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4672 case Op_SubVB: 4673 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4674 case Op_SubVS: 4675 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4676 case Op_SubVI: 4677 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4678 case Op_SubVL: 4679 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4680 case Op_SubVF: 4681 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4682 case Op_SubVD: 4683 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4684 case Op_MulVS: 4685 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4686 case Op_MulVI: 4687 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4688 case Op_MulVL: 4689 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4690 case Op_MulVF: 4691 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4692 case Op_MulVD: 4693 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4694 case Op_DivVF: 4695 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4696 case Op_DivVD: 4697 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4698 case Op_SqrtVF: 4699 evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break; 4700 case Op_SqrtVD: 4701 evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break; 4702 case Op_AbsVB: 4703 evpabsb(dst, mask, src2, merge, vlen_enc); break; 4704 case Op_AbsVS: 4705 evpabsw(dst, mask, src2, merge, vlen_enc); break; 4706 case Op_AbsVI: 4707 evpabsd(dst, mask, src2, merge, vlen_enc); break; 4708 case Op_AbsVL: 4709 evpabsq(dst, mask, src2, merge, vlen_enc); break; 4710 case Op_FmaVF: 4711 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4712 case Op_FmaVD: 4713 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4714 case Op_VectorRearrange: 4715 evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break; 4716 case Op_LShiftVS: 4717 evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4718 case Op_LShiftVI: 4719 evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4720 case Op_LShiftVL: 4721 evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4722 case Op_RShiftVS: 4723 evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4724 case Op_RShiftVI: 4725 evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4726 case Op_RShiftVL: 4727 evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4728 case Op_URShiftVS: 4729 evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4730 case Op_URShiftVI: 4731 evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4732 case Op_URShiftVL: 4733 evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4734 case Op_RotateLeftV: 4735 evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4736 case Op_RotateRightV: 4737 evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4738 case Op_MaxV: 4739 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4740 case Op_MinV: 4741 evpmins(eType, dst, mask, 
src1, src2, merge, vlen_enc); break; 4742 case Op_UMinV: 4743 evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4744 case Op_UMaxV: 4745 evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4746 case Op_XorV: 4747 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4748 case Op_OrV: 4749 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4750 case Op_AndV: 4751 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4752 default: 4753 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]); 4754 break; 4755 } 4756 } 4757 4758 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4759 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4760 switch (ideal_opc) { 4761 case Op_AddVB: 4762 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4763 case Op_AddVS: 4764 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4765 case Op_AddVI: 4766 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4767 case Op_AddVL: 4768 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4769 case Op_AddVF: 4770 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4771 case Op_AddVD: 4772 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4773 case Op_SubVB: 4774 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4775 case Op_SubVS: 4776 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4777 case Op_SubVI: 4778 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4779 case Op_SubVL: 4780 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4781 case Op_SubVF: 4782 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4783 case Op_SubVD: 4784 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4785 case Op_MulVS: 4786 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4787 case Op_MulVI: 4788 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4789 case Op_MulVL: 4790 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4791 case Op_MulVF: 4792 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4793 case Op_MulVD: 4794 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4795 case Op_DivVF: 4796 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4797 case Op_DivVD: 4798 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4799 case Op_FmaVF: 4800 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4801 case Op_FmaVD: 4802 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4803 case Op_MaxV: 4804 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4805 case Op_MinV: 4806 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4807 case Op_UMaxV: 4808 evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4809 case Op_UMinV: 4810 evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4811 case Op_XorV: 4812 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4813 case Op_OrV: 4814 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4815 case Op_AndV: 4816 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4817 default: 4818 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]); 4819 break; 4820 } 4821 } 4822 4823 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst, 4824 KRegister src1, KRegister src2) { 4825 BasicType etype = T_ILLEGAL; 4826 switch(mask_len) { 4827 case 2: 4828 case 4: 4829 case 8: etype = T_BYTE; break; 4830 case 16: etype = T_SHORT; break; 4831 case 32: etype = T_INT; break; 4832 case 64: etype = T_LONG; break; 
4833 default: fatal("Unsupported type"); break; 4834 } 4835 assert(etype != T_ILLEGAL, ""); 4836 switch(ideal_opc) { 4837 case Op_AndVMask: 4838 kand(etype, dst, src1, src2); break; 4839 case Op_OrVMask: 4840 kor(etype, dst, src1, src2); break; 4841 case Op_XorVMask: 4842 kxor(etype, dst, src1, src2); break; 4843 default: 4844 fatal("Unsupported masked operation"); break; 4845 } 4846 } 4847 4848 /* 4849 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 4850 * If src is NaN, the result is 0. 4851 * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE, 4852 * the result is equal to the value of Integer.MIN_VALUE. 4853 * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE, 4854 * the result is equal to the value of Integer.MAX_VALUE. 4855 */ 4856 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4857 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 4858 Register rscratch, AddressLiteral float_sign_flip, 4859 int vec_enc) { 4860 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4861 Label done; 4862 vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch); 4863 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 4864 vptest(xtmp2, xtmp2, vec_enc); 4865 jccb(Assembler::equal, done); 4866 4867 vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc); 4868 vpxor(xtmp1, xtmp1, xtmp4, vec_enc); 4869 4870 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 4871 vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc); 4872 vblendvps(dst, dst, xtmp4, xtmp3, vec_enc); 4873 4874 // Recompute the mask for remaining special value. 4875 vpxor(xtmp2, xtmp2, xtmp3, vec_enc); 4876 // Extract SRC values corresponding to TRUE mask lanes. 4877 vpand(xtmp4, xtmp2, src, vec_enc); 4878 // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special 4879 // values are set. 
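  // (vblendvps chooses between its two value operands using only the sign (MSB) of each
  //  mask lane, which is why only that bit needs to be staged here.)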
4880 vpxor(xtmp3, xtmp2, xtmp4, vec_enc); 4881 4882 vblendvps(dst, dst, xtmp1, xtmp3, vec_enc); 4883 bind(done); 4884 } 4885 4886 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4887 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4888 Register rscratch, AddressLiteral float_sign_flip, 4889 int vec_enc) { 4890 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4891 Label done; 4892 evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 4893 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 4894 kortestwl(ktmp1, ktmp1); 4895 jccb(Assembler::equal, done); 4896 4897 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4898 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4899 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 4900 4901 kxorwl(ktmp1, ktmp1, ktmp2); 4902 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4903 vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4904 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 4905 bind(done); 4906 } 4907 4908 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4909 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4910 Register rscratch, AddressLiteral double_sign_flip, 4911 int vec_enc) { 4912 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 4913 4914 Label done; 4915 evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 4916 Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc); 4917 kortestwl(ktmp1, ktmp1); 4918 jccb(Assembler::equal, done); 4919 4920 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4921 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4922 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 4923 4924 kxorwl(ktmp1, ktmp1, ktmp2); 4925 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4926 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4927 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); 4928 bind(done); 4929 } 4930 4931 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4932 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4933 Register rscratch, AddressLiteral float_sign_flip, 4934 int vec_enc) { 4935 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4936 Label done; 4937 evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 4938 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 4939 kortestwl(ktmp1, ktmp1); 4940 jccb(Assembler::equal, done); 4941 4942 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4943 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4944 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 4945 4946 kxorwl(ktmp1, ktmp1, ktmp2); 4947 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4948 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4949 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 4950 bind(done); 4951 } 4952 4953 /* 4954 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 4955 * If src is NaN, the result is 0. 4956 * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE, 4957 * the result is equal to the value of Long.MIN_VALUE. 4958 * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE, 4959 * the result is equal to the value of Long.MAX_VALUE. 
 */
void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                      Register rscratch, AddressLiteral double_sign_flip,
                                                                      int vec_enc) {
  assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");

  Label done;
  evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
  evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);

  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);

  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}

void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
                                                             XMMRegister xtmp, int index, int vec_enc) {
  assert(vec_enc < Assembler::AVX_512bit, "");
  if (vec_enc == Assembler::AVX_256bit) {
    vextractf128_high(xtmp, src);
    vshufps(dst, src, xtmp, index, vec_enc);
  } else {
    vshufps(dst, src, zero, index, vec_enc);
  }
}

void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                                                    XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
                                                                    AddressLiteral float_sign_flip, int src_vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");

  Label done;
  // Compare the destination lanes with the float_sign_flip
  // value to get a mask for all special values.
  movdqu(xtmp1, float_sign_flip, rscratch);
  vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
  ptest(xtmp2, xtmp2);
  jccb(Assembler::equal, done);

  // Flip float_sign_flip to get the max integer value.
  vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
  pxor(xtmp1, xtmp4);

  // Set destination lanes corresponding to unordered source lanes to zero.
  vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
  vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);

  // Shuffle the mask vector and pack the lower doubleword from each quadword lane.
  vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
  vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);

  // Recompute the mask for the remaining special values.
  pxor(xtmp2, xtmp3);
  // Extract the mask corresponding to non-negative source lanes.
  vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);

  // Shuffle the mask vector and pack the lower doubleword from each quadword lane.
  vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
  pand(xtmp3, xtmp2);

  // Replace destination lanes holding the special value (0x80000000) with max int
  // if the corresponding source lane holds a +ve value.
  vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
  bind(done);
}


void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
                                                   XMMRegister xtmp, Register rscratch, int vec_enc) {
  switch(to_elem_bt) {
    case T_SHORT:
      assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
      vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
      vpackusdw(dst, dst, zero, vec_enc);
      if (vec_enc == Assembler::AVX_256bit) {
        vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
      }
      break;
    case T_BYTE:
      assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
      vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
      vpackusdw(dst, dst, zero, vec_enc);
      if (vec_enc == Assembler::AVX_256bit) {
        vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
      }
      vpackuswb(dst, dst, zero, vec_enc);
      break;
    default: assert(false, "Unexpected basic type for target of vector cast int to subword: %s", type2name(to_elem_bt));
  }
}

/*
 * Algorithm for vector D2L and F2I conversions (when AVX 10.2 is not supported):
 * a) Perform the vector D2L/F2I cast.
 * b) Take the fast path if no lane of the result vector contains the value 0x80000000,
 *    since that value signifies that the source was one of the special floating point
 *    values (NaN, -Inf, Inf, Max, -Min).
 * c) Set the destination lane to zero if the source lane is NaN.
 * d) Replace 0x80000000 with MaxInt if the source lane contains a +ve value.
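 *
 * Illustrative scalar equivalent of steps b)-d) for F2I (a sketch only; the routines
 * below do the same thing lane-wise with vector instructions):
 *   int raw = (int) f;                     // a) cvttss2si yields 0x80000000 for every special input
 *   if (raw == 0x80000000) {               // b) slow path only if a special value is present
 *     if (f != f)        raw = 0;          // c) NaN         -> 0
 *     else if (f > 0.0f) raw = 0x7FFFFFFF; // d) +ve special -> MaxInt
 *   }                                      //    -ve special values keep MinInt (0x80000000)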
5068 */ 5069 5070 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5071 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 5072 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 5073 int to_elem_sz = type2aelembytes(to_elem_bt); 5074 assert(to_elem_sz <= 4, ""); 5075 vcvttps2dq(dst, src, vec_enc); 5076 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc); 5077 if (to_elem_sz < 4) { 5078 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 5079 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc); 5080 } 5081 } 5082 5083 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5084 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip, 5085 Register rscratch, int vec_enc) { 5086 int to_elem_sz = type2aelembytes(to_elem_bt); 5087 assert(to_elem_sz <= 4, ""); 5088 vcvttps2dq(dst, src, vec_enc); 5089 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc); 5090 switch(to_elem_bt) { 5091 case T_INT: 5092 break; 5093 case T_SHORT: 5094 evpmovdw(dst, dst, vec_enc); 5095 break; 5096 case T_BYTE: 5097 evpmovdb(dst, dst, vec_enc); 5098 break; 5099 default: assert(false, "Unexpected basic type for target of vector castF2X EVEX: %s", type2name(to_elem_bt)); 5100 } 5101 } 5102 5103 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 5104 KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip, 5105 Register rscratch, int vec_enc) { 5106 evcvttps2qq(dst, src, vec_enc); 5107 vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc); 5108 } 5109 5110 // Handling for downcasting from double to integer or sub-word types on AVX2. 5111 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5112 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, 5113 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 5114 int to_elem_sz = type2aelembytes(to_elem_bt); 5115 assert(to_elem_sz < 8, ""); 5116 vcvttpd2dq(dst, src, vec_enc); 5117 vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch, 5118 float_sign_flip, vec_enc); 5119 if (to_elem_sz < 4) { 5120 // xtmp4 holds all zero lanes. 
5121 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit); 5122 } 5123 } 5124 5125 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, 5126 XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, 5127 KRegister ktmp2, AddressLiteral sign_flip, 5128 Register rscratch, int vec_enc) { 5129 if (VM_Version::supports_avx512dq()) { 5130 evcvttpd2qq(dst, src, vec_enc); 5131 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 5132 switch(to_elem_bt) { 5133 case T_LONG: 5134 break; 5135 case T_INT: 5136 evpmovsqd(dst, dst, vec_enc); 5137 break; 5138 case T_SHORT: 5139 evpmovsqd(dst, dst, vec_enc); 5140 evpmovdw(dst, dst, vec_enc); 5141 break; 5142 case T_BYTE: 5143 evpmovsqd(dst, dst, vec_enc); 5144 evpmovdb(dst, dst, vec_enc); 5145 break; 5146 default: assert(false, "Unexpected basic type for target of vector castD2X AVX512DQ EVEX: %s", type2name(to_elem_bt)); 5147 } 5148 } else { 5149 assert(type2aelembytes(to_elem_bt) <= 4, ""); 5150 vcvttpd2dq(dst, src, vec_enc); 5151 vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 5152 switch(to_elem_bt) { 5153 case T_INT: 5154 break; 5155 case T_SHORT: 5156 evpmovdw(dst, dst, vec_enc); 5157 break; 5158 case T_BYTE: 5159 evpmovdb(dst, dst, vec_enc); 5160 break; 5161 default: assert(false, "Unexpected basic type for target of vector castD2X EVEX: %s", type2name(to_elem_bt)); 5162 } 5163 } 5164 } 5165 5166 void C2_MacroAssembler::vector_castF2X_avx10(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vec_enc) { 5167 switch(to_elem_bt) { 5168 case T_LONG: 5169 evcvttps2qqs(dst, src, vec_enc); 5170 break; 5171 case T_INT: 5172 evcvttps2dqs(dst, src, vec_enc); 5173 break; 5174 case T_SHORT: 5175 evcvttps2dqs(dst, src, vec_enc); 5176 evpmovdw(dst, dst, vec_enc); 5177 break; 5178 case T_BYTE: 5179 evcvttps2dqs(dst, src, vec_enc); 5180 evpmovdb(dst, dst, vec_enc); 5181 break; 5182 default: assert(false, "Unexpected basic type for target of vector castF2X AVX10 (reg src): %s", type2name(to_elem_bt)); 5183 } 5184 } 5185 5186 void C2_MacroAssembler::vector_castF2X_avx10(BasicType to_elem_bt, XMMRegister dst, Address src, int vec_enc) { 5187 switch(to_elem_bt) { 5188 case T_LONG: 5189 evcvttps2qqs(dst, src, vec_enc); 5190 break; 5191 case T_INT: 5192 evcvttps2dqs(dst, src, vec_enc); 5193 break; 5194 case T_SHORT: 5195 evcvttps2dqs(dst, src, vec_enc); 5196 evpmovdw(dst, dst, vec_enc); 5197 break; 5198 case T_BYTE: 5199 evcvttps2dqs(dst, src, vec_enc); 5200 evpmovdb(dst, dst, vec_enc); 5201 break; 5202 default: assert(false, "Unexpected basic type for target of vector castF2X AVX10 (mem src): %s", type2name(to_elem_bt)); 5203 } 5204 } 5205 5206 void C2_MacroAssembler::vector_castD2X_avx10(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vec_enc) { 5207 switch(to_elem_bt) { 5208 case T_LONG: 5209 evcvttpd2qqs(dst, src, vec_enc); 5210 break; 5211 case T_INT: 5212 evcvttpd2dqs(dst, src, vec_enc); 5213 break; 5214 case T_SHORT: 5215 evcvttpd2dqs(dst, src, vec_enc); 5216 evpmovdw(dst, dst, vec_enc); 5217 break; 5218 case T_BYTE: 5219 evcvttpd2dqs(dst, src, vec_enc); 5220 evpmovdb(dst, dst, vec_enc); 5221 break; 5222 default: assert(false, "Unexpected basic type for target of vector castD2X AVX10 (reg src): %s", type2name(to_elem_bt)); 5223 } 5224 } 5225 5226 void C2_MacroAssembler::vector_castD2X_avx10(BasicType to_elem_bt, XMMRegister dst, Address 
src, int vec_enc) { 5227 switch(to_elem_bt) { 5228 case T_LONG: 5229 evcvttpd2qqs(dst, src, vec_enc); 5230 break; 5231 case T_INT: 5232 evcvttpd2dqs(dst, src, vec_enc); 5233 break; 5234 case T_SHORT: 5235 evcvttpd2dqs(dst, src, vec_enc); 5236 evpmovdw(dst, dst, vec_enc); 5237 break; 5238 case T_BYTE: 5239 evcvttpd2dqs(dst, src, vec_enc); 5240 evpmovdb(dst, dst, vec_enc); 5241 break; 5242 default: assert(false, "Unexpected basic type for target of vector castD2X AVX10 (mem src): %s", type2name(to_elem_bt)); 5243 } 5244 } 5245 5246 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src, 5247 AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5248 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 5249 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5250 // and re-instantiate original MXCSR.RC mode after that. 5251 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5252 5253 mov64(tmp, julong_cast(0.5L)); 5254 evpbroadcastq(xtmp1, tmp, vec_enc); 5255 vaddpd(xtmp1, src , xtmp1, vec_enc); 5256 evcvtpd2qq(dst, xtmp1, vec_enc); 5257 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 5258 double_sign_flip, vec_enc);; 5259 5260 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5261 } 5262 5263 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src, 5264 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5265 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 5266 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5267 // and re-instantiate original MXCSR.RC mode after that. 5268 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5269 5270 movl(tmp, jint_cast(0.5)); 5271 movq(xtmp1, tmp); 5272 vbroadcastss(xtmp1, xtmp1, vec_enc); 5273 vaddps(xtmp1, src , xtmp1, vec_enc); 5274 vcvtps2dq(dst, xtmp1, vec_enc); 5275 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 5276 float_sign_flip, vec_enc); 5277 5278 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5279 } 5280 5281 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src, 5282 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5283 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) { 5284 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5285 // and re-instantiate original MXCSR.RC mode after that. 
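  // (Scalar shape of this rounding, for reference: result = (int) floor(f + 0.5f); the RC
  //  override installed below makes the packed float->int conversion behave like the floor.)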
5286 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5287 5288 movl(tmp, jint_cast(0.5)); 5289 movq(xtmp1, tmp); 5290 vbroadcastss(xtmp1, xtmp1, vec_enc); 5291 vaddps(xtmp1, src , xtmp1, vec_enc); 5292 vcvtps2dq(dst, xtmp1, vec_enc); 5293 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc); 5294 5295 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5296 } 5297 5298 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 5299 BasicType from_elem_bt, BasicType to_elem_bt) { 5300 switch (from_elem_bt) { 5301 case T_BYTE: 5302 switch (to_elem_bt) { 5303 case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break; 5304 case T_INT: vpmovzxbd(dst, src, vlen_enc); break; 5305 case T_LONG: vpmovzxbq(dst, src, vlen_enc); break; 5306 default: ShouldNotReachHere(); 5307 } 5308 break; 5309 case T_SHORT: 5310 switch (to_elem_bt) { 5311 case T_INT: vpmovzxwd(dst, src, vlen_enc); break; 5312 case T_LONG: vpmovzxwq(dst, src, vlen_enc); break; 5313 default: ShouldNotReachHere(); 5314 } 5315 break; 5316 case T_INT: 5317 assert(to_elem_bt == T_LONG, ""); 5318 vpmovzxdq(dst, src, vlen_enc); 5319 break; 5320 default: 5321 ShouldNotReachHere(); 5322 } 5323 } 5324 5325 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 5326 BasicType from_elem_bt, BasicType to_elem_bt) { 5327 switch (from_elem_bt) { 5328 case T_BYTE: 5329 switch (to_elem_bt) { 5330 case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break; 5331 case T_INT: vpmovsxbd(dst, src, vlen_enc); break; 5332 case T_LONG: vpmovsxbq(dst, src, vlen_enc); break; 5333 default: ShouldNotReachHere(); 5334 } 5335 break; 5336 case T_SHORT: 5337 switch (to_elem_bt) { 5338 case T_INT: vpmovsxwd(dst, src, vlen_enc); break; 5339 case T_LONG: vpmovsxwq(dst, src, vlen_enc); break; 5340 default: ShouldNotReachHere(); 5341 } 5342 break; 5343 case T_INT: 5344 assert(to_elem_bt == T_LONG, ""); 5345 vpmovsxdq(dst, src, vlen_enc); 5346 break; 5347 default: 5348 ShouldNotReachHere(); 5349 } 5350 } 5351 5352 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src, 5353 BasicType dst_bt, BasicType src_bt, int vlen) { 5354 int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen); 5355 assert(vlen_enc != AVX_512bit, ""); 5356 5357 int dst_bt_size = type2aelembytes(dst_bt); 5358 int src_bt_size = type2aelembytes(src_bt); 5359 if (dst_bt_size > src_bt_size) { 5360 switch (dst_bt_size / src_bt_size) { 5361 case 2: vpmovsxbw(dst, src, vlen_enc); break; 5362 case 4: vpmovsxbd(dst, src, vlen_enc); break; 5363 case 8: vpmovsxbq(dst, src, vlen_enc); break; 5364 default: ShouldNotReachHere(); 5365 } 5366 } else { 5367 assert(dst_bt_size < src_bt_size, ""); 5368 switch (src_bt_size / dst_bt_size) { 5369 case 2: { 5370 if (vlen_enc == AVX_128bit) { 5371 vpacksswb(dst, src, src, vlen_enc); 5372 } else { 5373 vpacksswb(dst, src, src, vlen_enc); 5374 vpermq(dst, dst, 0x08, vlen_enc); 5375 } 5376 break; 5377 } 5378 case 4: { 5379 if (vlen_enc == AVX_128bit) { 5380 vpackssdw(dst, src, src, vlen_enc); 5381 vpacksswb(dst, dst, dst, vlen_enc); 5382 } else { 5383 vpackssdw(dst, src, src, vlen_enc); 5384 vpermq(dst, dst, 0x08, vlen_enc); 5385 vpacksswb(dst, dst, dst, AVX_128bit); 5386 } 5387 break; 5388 } 5389 case 8: { 5390 if (vlen_enc == AVX_128bit) { 5391 vpshufd(dst, src, 0x08, vlen_enc); 5392 vpackssdw(dst, dst, dst, vlen_enc); 5393 vpacksswb(dst, dst, dst, vlen_enc); 5394 } else { 
        vpshufd(dst, src, 0x08, vlen_enc);
        vpermq(dst, dst, 0x08, vlen_enc);
        vpackssdw(dst, dst, dst, AVX_128bit);
        vpacksswb(dst, dst, dst, AVX_128bit);
      }
      break;
    }
    default: ShouldNotReachHere();
    }
  }
}

void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
                                   bool merge, BasicType bt, int vlen_enc) {
  if (bt == T_INT) {
    evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
  } else {
    assert(bt == T_LONG, "");
    evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
  }
}

void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
                                   bool merge, BasicType bt, int vlen_enc) {
  if (bt == T_INT) {
    evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
  } else {
    assert(bt == T_LONG, "");
    evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
  }
}

void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
                                               Register rtmp2, XMMRegister xtmp, int mask_len,
                                               int vec_enc) {
  int index = 0;
  int vindex = 0;
  mov64(rtmp1, 0x0101010101010101L);
  pdepq(rtmp1, src, rtmp1);
  if (mask_len > 8) {
    movq(rtmp2, src);
    vpxor(xtmp, xtmp, xtmp, vec_enc);
    movq(xtmp, rtmp1);
  }
  movq(dst, rtmp1);

  mask_len -= 8;
  while (mask_len > 0) {
    assert((mask_len & 0x7) == 0, "mask must be multiple of 8");
    index++;
    if ((index % 2) == 0) {
      pxor(xtmp, xtmp);
    }
    mov64(rtmp1, 0x0101010101010101L);
    shrq(rtmp2, 8);
    pdepq(rtmp1, rtmp2, rtmp1);
    pinsrq(xtmp, rtmp1, index % 2);
    vindex = index / 2;
    if (vindex) {
      // Write the entire 16 byte vector when both 64 bit
      // lanes are updated, to save redundant instructions.
5456 if (index % 2) { 5457 vinsertf128(dst, dst, xtmp, vindex); 5458 } 5459 } else { 5460 vmovdqu(dst, xtmp); 5461 } 5462 mask_len -= 8; 5463 } 5464 } 5465 5466 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) { 5467 switch(opc) { 5468 case Op_VectorMaskTrueCount: 5469 popcntq(dst, tmp); 5470 break; 5471 case Op_VectorMaskLastTrue: 5472 if (VM_Version::supports_lzcnt()) { 5473 lzcntq(tmp, tmp); 5474 movl(dst, 63); 5475 subl(dst, tmp); 5476 } else { 5477 movl(dst, -1); 5478 bsrq(tmp, tmp); 5479 cmov32(Assembler::notZero, dst, tmp); 5480 } 5481 break; 5482 case Op_VectorMaskFirstTrue: 5483 if (VM_Version::supports_bmi1()) { 5484 if (masklen < 32) { 5485 orl(tmp, 1 << masklen); 5486 tzcntl(dst, tmp); 5487 } else if (masklen == 32) { 5488 tzcntl(dst, tmp); 5489 } else { 5490 assert(masklen == 64, ""); 5491 tzcntq(dst, tmp); 5492 } 5493 } else { 5494 if (masklen < 32) { 5495 orl(tmp, 1 << masklen); 5496 bsfl(dst, tmp); 5497 } else { 5498 assert(masklen == 32 || masklen == 64, ""); 5499 movl(dst, masklen); 5500 if (masklen == 32) { 5501 bsfl(tmp, tmp); 5502 } else { 5503 bsfq(tmp, tmp); 5504 } 5505 cmov32(Assembler::notZero, dst, tmp); 5506 } 5507 } 5508 break; 5509 case Op_VectorMaskToLong: 5510 assert(dst == tmp, "Dst and tmp should be the same for toLong operations"); 5511 break; 5512 default: assert(false, "Unhandled mask operation"); 5513 } 5514 } 5515 5516 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp, 5517 int masklen, int masksize, int vec_enc) { 5518 assert(VM_Version::supports_popcnt(), ""); 5519 5520 if(VM_Version::supports_avx512bw()) { 5521 kmovql(tmp, mask); 5522 } else { 5523 assert(masklen <= 16, ""); 5524 kmovwl(tmp, mask); 5525 } 5526 5527 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5528 // operations needs to be clipped. 5529 if (masksize < 16 && opc != Op_VectorMaskFirstTrue) { 5530 andq(tmp, (1 << masklen) - 1); 5531 } 5532 5533 vector_mask_operation_helper(opc, dst, tmp, masklen); 5534 } 5535 5536 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp, 5537 Register tmp, int masklen, BasicType bt, int vec_enc) { 5538 assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) || 5539 (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), ""); 5540 assert(VM_Version::supports_popcnt(), ""); 5541 5542 bool need_clip = false; 5543 switch(bt) { 5544 case T_BOOLEAN: 5545 // While masks of other types contain 0, -1; boolean masks contain lane values of 0, 1 5546 vpxor(xtmp, xtmp, xtmp, vec_enc); 5547 vpsubb(xtmp, xtmp, mask, vec_enc); 5548 vpmovmskb(tmp, xtmp, vec_enc); 5549 need_clip = masklen < 16; 5550 break; 5551 case T_BYTE: 5552 vpmovmskb(tmp, mask, vec_enc); 5553 need_clip = masklen < 16; 5554 break; 5555 case T_SHORT: 5556 vpacksswb(xtmp, mask, mask, vec_enc); 5557 if (masklen >= 16) { 5558 vpermpd(xtmp, xtmp, 8, vec_enc); 5559 } 5560 vpmovmskb(tmp, xtmp, Assembler::AVX_128bit); 5561 need_clip = masklen < 16; 5562 break; 5563 case T_INT: 5564 case T_FLOAT: 5565 vmovmskps(tmp, mask, vec_enc); 5566 need_clip = masklen < 4; 5567 break; 5568 case T_LONG: 5569 case T_DOUBLE: 5570 vmovmskpd(tmp, mask, vec_enc); 5571 need_clip = masklen < 2; 5572 break; 5573 default: assert(false, "Unhandled type, %s", type2name(bt)); 5574 } 5575 5576 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5577 // operations needs to be clipped. 
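  // (Example: for masklen == 8 the movmsk above may also set bits 8..15 from lanes beyond
  //  the logical mask length, so the clip below keeps only tmp & 0xFF.)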
5578 if (need_clip && opc != Op_VectorMaskFirstTrue) { 5579 // need_clip implies masklen < 32 5580 andq(tmp, (1 << masklen) - 1); 5581 } 5582 5583 vector_mask_operation_helper(opc, dst, tmp, masklen); 5584 } 5585 5586 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1, 5587 Register rtmp2, int mask_len) { 5588 kmov(rtmp1, src); 5589 andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len))); 5590 mov64(rtmp2, -1L); 5591 pextq(rtmp2, rtmp2, rtmp1); 5592 kmov(dst, rtmp2); 5593 } 5594 5595 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src, 5596 XMMRegister mask, Register rtmp, Register rscratch, 5597 XMMRegister permv, XMMRegister xtmp, BasicType bt, 5598 int vec_enc) { 5599 assert(type2aelembytes(bt) >= 4, ""); 5600 assert(opcode == Op_CompressV || opcode == Op_ExpandV, ""); 5601 address compress_perm_table = nullptr; 5602 address expand_perm_table = nullptr; 5603 if (type2aelembytes(bt) == 8) { 5604 compress_perm_table = StubRoutines::x86::compress_perm_table64(); 5605 expand_perm_table = StubRoutines::x86::expand_perm_table64(); 5606 vmovmskpd(rtmp, mask, vec_enc); 5607 } else { 5608 compress_perm_table = StubRoutines::x86::compress_perm_table32(); 5609 expand_perm_table = StubRoutines::x86::expand_perm_table32(); 5610 vmovmskps(rtmp, mask, vec_enc); 5611 } 5612 shlq(rtmp, 5); // for 32 byte permute row. 5613 if (opcode == Op_CompressV) { 5614 lea(rscratch, ExternalAddress(compress_perm_table)); 5615 } else { 5616 lea(rscratch, ExternalAddress(expand_perm_table)); 5617 } 5618 addptr(rtmp, rscratch); 5619 vmovdqu(permv, Address(rtmp)); 5620 vpermps(dst, permv, src, Assembler::AVX_256bit); 5621 vpxor(xtmp, xtmp, xtmp, vec_enc); 5622 // Blend the result with zero vector using permute mask, each column entry 5623 // in a permute table row contains either a valid permute index or a -1 (default) 5624 // value, this can potentially be used as a blending mask after 5625 // compressing/expanding the source vector lanes. 
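  // (Hypothetical example of one 8-entry row for 32-bit lanes and mask bits 0b00000101,
  //  illustrating the layout described above rather than the literal table contents:
  //    compress row: {0, 2, -1, -1, -1, -1, -1, -1}   (active lanes packed to the front)
  //    expand row:   {0, -1, 1, -1, -1, -1, -1, -1}   (packed lanes scattered back out)
  //  The -1 entries have their sign bit set, so the blend below zeroes exactly those lanes.)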
5626 vblendvps(dst, dst, xtmp, permv, vec_enc, true, permv); 5627 } 5628 5629 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask, 5630 bool merge, BasicType bt, int vec_enc) { 5631 if (opcode == Op_CompressV) { 5632 switch(bt) { 5633 case T_BYTE: 5634 evpcompressb(dst, mask, src, merge, vec_enc); 5635 break; 5636 case T_CHAR: 5637 case T_SHORT: 5638 evpcompressw(dst, mask, src, merge, vec_enc); 5639 break; 5640 case T_INT: 5641 evpcompressd(dst, mask, src, merge, vec_enc); 5642 break; 5643 case T_FLOAT: 5644 evcompressps(dst, mask, src, merge, vec_enc); 5645 break; 5646 case T_LONG: 5647 evpcompressq(dst, mask, src, merge, vec_enc); 5648 break; 5649 case T_DOUBLE: 5650 evcompresspd(dst, mask, src, merge, vec_enc); 5651 break; 5652 default: 5653 fatal("Unsupported type %s", type2name(bt)); 5654 break; 5655 } 5656 } else { 5657 assert(opcode == Op_ExpandV, ""); 5658 switch(bt) { 5659 case T_BYTE: 5660 evpexpandb(dst, mask, src, merge, vec_enc); 5661 break; 5662 case T_CHAR: 5663 case T_SHORT: 5664 evpexpandw(dst, mask, src, merge, vec_enc); 5665 break; 5666 case T_INT: 5667 evpexpandd(dst, mask, src, merge, vec_enc); 5668 break; 5669 case T_FLOAT: 5670 evexpandps(dst, mask, src, merge, vec_enc); 5671 break; 5672 case T_LONG: 5673 evpexpandq(dst, mask, src, merge, vec_enc); 5674 break; 5675 case T_DOUBLE: 5676 evexpandpd(dst, mask, src, merge, vec_enc); 5677 break; 5678 default: 5679 fatal("Unsupported type %s", type2name(bt)); 5680 break; 5681 } 5682 } 5683 } 5684 5685 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5686 KRegister ktmp1, int vec_enc) { 5687 if (opcode == Op_SignumVD) { 5688 vsubpd(dst, zero, one, vec_enc); 5689 // if src < 0 ? -1 : 1 5690 evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5691 evblendmpd(dst, ktmp1, one, dst, true, vec_enc); 5692 // if src == NaN, -0.0 or 0.0 return src. 5693 evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5694 evblendmpd(dst, ktmp1, dst, src, true, vec_enc); 5695 } else { 5696 assert(opcode == Op_SignumVF, ""); 5697 vsubps(dst, zero, one, vec_enc); 5698 // if src < 0 ? -1 : 1 5699 evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5700 evblendmps(dst, ktmp1, one, dst, true, vec_enc); 5701 // if src == NaN, -0.0 or 0.0 return src. 5702 evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5703 evblendmps(dst, ktmp1, dst, src, true, vec_enc); 5704 } 5705 } 5706 5707 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5708 XMMRegister xtmp1, int vec_enc) { 5709 if (opcode == Op_SignumVD) { 5710 vsubpd(dst, zero, one, vec_enc); 5711 // if src < 0 ? -1 : 1 5712 vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1); 5713 // if src == NaN, -0.0 or 0.0 return src. 5714 vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc); 5715 vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1); 5716 } else { 5717 assert(opcode == Op_SignumVF, ""); 5718 vsubps(dst, zero, one, vec_enc); 5719 // if src < 0 ? -1 : 1 5720 vblendvps(dst, one, dst, src, vec_enc, true, xtmp1); 5721 // if src == NaN, -0.0 or 0.0 return src. 
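    // (Net effect, matching scalar Math.signum: +1.0 for positive inputs, -1.0 for negative
    //  inputs, and the input itself for NaN, -0.0 and +0.0.)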
    vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
    vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
  }
}

void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
  if (VM_Version::supports_avx512bw()) {
    if (mask_len > 32) {
      kmovql(dst, src);
    } else {
      kmovdl(dst, src);
      if (mask_len != 32) {
        kshiftrdl(dst, dst, 32 - mask_len);
      }
    }
  } else {
    assert(mask_len <= 16, "");
    kmovwl(dst, src);
    if (mask_len != 16) {
      kshiftrwl(dst, dst, 16 - mask_len);
    }
  }
}

void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
  int lane_size = type2aelembytes(bt);
  if ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
      (is_subword_type(bt) && VM_Version::supports_avx512vlbw())) {
    movptr(rtmp, imm32);
    switch(lane_size) {
      case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
      case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
      case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
      case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
      default : fatal("Unsupported lane size %d", lane_size);
                break;
    }
  } else {
    movptr(rtmp, imm32);
    movq(dst, rtmp);
    switch(lane_size) {
      case 1 : vpbroadcastb(dst, dst, vec_enc); break;
      case 2 : vpbroadcastw(dst, dst, vec_enc); break;
      case 4 : vpbroadcastd(dst, dst, vec_enc); break;
      case 8 : vpbroadcastq(dst, dst, vec_enc); break;
      default : fatal("Unsupported lane size %d", lane_size);
                break;
    }
  }
}

//
// The following is a lookup-table-based popcount computation algorithm:
// Index   Bit set count
// [ 0000 -> 0,
//   0001 -> 1,
//   0010 -> 1,
//   0011 -> 2,
//   0100 -> 1,
//   0101 -> 2,
//   0110 -> 2,
//   0111 -> 3,
//   1000 -> 1,
//   1001 -> 2,
//   1010 -> 2,
//   1011 -> 3,
//   1100 -> 2,
//   1101 -> 3,
//   1110 -> 3,
//   1111 -> 4 ]
// a. Count the number of 1s in the 4 LSB bits of each byte. These bits are used as
//    shuffle indices for lookup table access.
// b. Right shift each byte of the vector lane by 4 positions.
// c. Count the number of 1s in the 4 MSB bits of each byte. These bits are used as
//    shuffle indices for lookup table access.
// d. Add the bitset counts of the upper and lower 4 bits of each byte.
// e. Unpack double words to quad words and compute the sum of absolute differences of the
//    bitset counts of all the bytes of a quadword.
// f. Perform step e. for the upper 128 bit vector lane.
// g. Pack the bitset counts of the quadwords back to double words.
// h. The unpacking and packing operations are not needed for 64 bit vector lanes.
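//
// Illustrative scalar sketch of steps a.-d. for a single byte (not code that is emitted;
// the routines below perform the same two lookups on every byte lane using vpshufb,
// with LUT contents assumed to match the table above):
//   static const uint8_t POPC_LUT[16] = {0,1,1,2, 1,2,2,3, 1,2,2,3, 2,3,3,4};
//   uint8_t popcount_byte(uint8_t b) {
//     return POPC_LUT[b & 0x0F] + POPC_LUT[b >> 4];
//   }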
5802 5803 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5804 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5805 assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), ""); 5806 vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc); 5807 vpsrlw(dst, src, 4, vec_enc); 5808 vpand(dst, dst, xtmp1, vec_enc); 5809 vpand(xtmp1, src, xtmp1, vec_enc); 5810 vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg); 5811 vpshufb(xtmp1, xtmp2, xtmp1, vec_enc); 5812 vpshufb(dst, xtmp2, dst, vec_enc); 5813 vpaddb(dst, dst, xtmp1, vec_enc); 5814 } 5815 5816 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5817 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5818 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5819 // Following code is as per steps e,f,g and h of above algorithm. 5820 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5821 vpunpckhdq(dst, xtmp1, xtmp2, vec_enc); 5822 vpsadbw(dst, dst, xtmp2, vec_enc); 5823 vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc); 5824 vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc); 5825 vpackuswb(dst, xtmp1, dst, vec_enc); 5826 } 5827 5828 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5829 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5830 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5831 // Add the popcount of upper and lower bytes of word. 5832 vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc); 5833 vpsrlw(dst, xtmp1, 8, vec_enc); 5834 vpand(xtmp1, xtmp1, xtmp2, vec_enc); 5835 vpaddw(dst, dst, xtmp1, vec_enc); 5836 } 5837 5838 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5839 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5840 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5841 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5842 vpsadbw(dst, xtmp1, xtmp2, vec_enc); 5843 } 5844 5845 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5846 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5847 switch(bt) { 5848 case T_LONG: 5849 vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5850 break; 5851 case T_INT: 5852 vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5853 break; 5854 case T_CHAR: 5855 case T_SHORT: 5856 vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5857 break; 5858 case T_BYTE: 5859 case T_BOOLEAN: 5860 vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5861 break; 5862 default: 5863 fatal("Unsupported type %s", type2name(bt)); 5864 break; 5865 } 5866 } 5867 5868 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src, 5869 KRegister mask, bool merge, int vec_enc) { 5870 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 5871 switch(bt) { 5872 case T_LONG: 5873 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5874 evpopcntq(dst, mask, src, merge, vec_enc); 5875 break; 5876 case T_INT: 5877 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5878 evpopcntd(dst, mask, src, merge, vec_enc); 5879 break; 5880 case T_CHAR: 5881 case T_SHORT: 5882 assert(VM_Version::supports_avx512_bitalg(), ""); 5883 evpopcntw(dst, mask, src, merge, vec_enc); 5884 break; 5885 case T_BYTE: 5886 case T_BOOLEAN: 5887 assert(VM_Version::supports_avx512_bitalg(), ""); 5888 evpopcntb(dst, mask, 
src, merge, vec_enc); 5889 break; 5890 default: 5891 fatal("Unsupported type %s", type2name(bt)); 5892 break; 5893 } 5894 } 5895 5896 // Bit reversal algorithm first reverses the bits of each byte followed by 5897 // a byte level reversal for multi-byte primitive types (short/int/long). 5898 // Algorithm performs a lookup table access to get reverse bit sequence 5899 // corresponding to a 4 bit value. Thus a reverse bit sequence for a byte 5900 // is obtained by swapping the reverse bit sequences of upper and lower 5901 // nibble of a byte. 5902 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5903 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5904 if (VM_Version::supports_avx512vlbw()) { 5905 5906 // Get the reverse bit sequence of lower nibble of each byte. 5907 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg); 5908 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 5909 evpandq(dst, xtmp2, src, vec_enc); 5910 vpshufb(dst, xtmp1, dst, vec_enc); 5911 vpsllq(dst, dst, 4, vec_enc); 5912 5913 // Get the reverse bit sequence of upper nibble of each byte. 5914 vpandn(xtmp2, xtmp2, src, vec_enc); 5915 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 5916 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 5917 5918 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 5919 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 5920 evporq(xtmp2, dst, xtmp2, vec_enc); 5921 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 5922 5923 } else if(vec_enc == Assembler::AVX_512bit) { 5924 // Shift based bit reversal. 5925 assert(bt == T_LONG || bt == T_INT, ""); 5926 5927 // Swap lower and upper nibble of each byte. 5928 vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc); 5929 5930 // Swap two least and most significant bits of each nibble. 5931 vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc); 5932 5933 // Swap adjacent pair of bits. 5934 evmovdqul(xtmp1, k0, dst, true, vec_enc); 5935 vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc); 5936 5937 evmovdqul(xtmp1, k0, dst, true, vec_enc); 5938 vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc); 5939 } else { 5940 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp); 5941 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 5942 5943 // Get the reverse bit sequence of lower nibble of each byte. 5944 vpand(dst, xtmp2, src, vec_enc); 5945 vpshufb(dst, xtmp1, dst, vec_enc); 5946 vpsllq(dst, dst, 4, vec_enc); 5947 5948 // Get the reverse bit sequence of upper nibble of each byte. 5949 vpandn(xtmp2, xtmp2, src, vec_enc); 5950 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 5951 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 5952 5953 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 5954 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 5955 vpor(xtmp2, dst, xtmp2, vec_enc); 5956 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 5957 } 5958 } 5959 5960 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc, 5961 XMMRegister xtmp, Register rscratch) { 5962 assert(VM_Version::supports_gfni(), ""); 5963 assert(rscratch != noreg || always_reachable(mask), "missing"); 5964 5965 // Galois field instruction based bit reversal based on following algorithm. 
5966 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 5967 vpbroadcastq(xtmp, mask, vec_enc, rscratch); 5968 vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc); 5969 vector_reverse_byte(bt, dst, xtmp, vec_enc); 5970 } 5971 5972 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src, 5973 XMMRegister xtmp1, Register rtmp, int vec_enc) { 5974 vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc); 5975 evpandq(dst, xtmp1, src, vec_enc); 5976 vpsllq(dst, dst, nbits, vec_enc); 5977 vpandn(xtmp1, xtmp1, src, vec_enc); 5978 vpsrlq(xtmp1, xtmp1, nbits, vec_enc); 5979 evporq(dst, dst, xtmp1, vec_enc); 5980 } 5981 5982 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5983 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5984 // Shift based bit reversal. 5985 assert(VM_Version::supports_evex(), ""); 5986 switch(bt) { 5987 case T_LONG: 5988 // Swap upper and lower double word of each quad word. 5989 evprorq(xtmp1, k0, src, 32, true, vec_enc); 5990 evprord(xtmp1, k0, xtmp1, 16, true, vec_enc); 5991 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 5992 break; 5993 case T_INT: 5994 // Swap upper and lower word of each double word. 5995 evprord(xtmp1, k0, src, 16, true, vec_enc); 5996 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 5997 break; 5998 case T_CHAR: 5999 case T_SHORT: 6000 // Swap upper and lower byte of each word. 6001 vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc); 6002 break; 6003 case T_BYTE: 6004 evmovdquq(dst, k0, src, true, vec_enc); 6005 break; 6006 default: 6007 fatal("Unsupported type %s", type2name(bt)); 6008 break; 6009 } 6010 } 6011 6012 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) { 6013 if (bt == T_BYTE) { 6014 if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) { 6015 evmovdquq(dst, k0, src, true, vec_enc); 6016 } else { 6017 vmovdqu(dst, src); 6018 } 6019 return; 6020 } 6021 // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using 6022 // pre-computed shuffle indices. 
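  // (For illustration, the T_INT permutation would hold, per 128-bit lane, the byte indices
  //  {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12} so that vpshufb reverses the four bytes of
  //  every 32-bit element; the literal stub contents are not reproduced here.)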
6023 switch(bt) { 6024 case T_LONG: 6025 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg); 6026 break; 6027 case T_INT: 6028 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg); 6029 break; 6030 case T_CHAR: 6031 case T_SHORT: 6032 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg); 6033 break; 6034 default: 6035 fatal("Unsupported type %s", type2name(bt)); 6036 break; 6037 } 6038 vpshufb(dst, src, dst, vec_enc); 6039 } 6040 6041 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src, 6042 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 6043 KRegister ktmp, Register rtmp, bool merge, int vec_enc) { 6044 assert(is_integral_type(bt), ""); 6045 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 6046 assert(VM_Version::supports_avx512cd(), ""); 6047 switch(bt) { 6048 case T_LONG: 6049 evplzcntq(dst, ktmp, src, merge, vec_enc); 6050 break; 6051 case T_INT: 6052 evplzcntd(dst, ktmp, src, merge, vec_enc); 6053 break; 6054 case T_SHORT: 6055 vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc); 6056 vpunpcklwd(xtmp2, xtmp1, src, vec_enc); 6057 evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc); 6058 vpunpckhwd(dst, xtmp1, src, vec_enc); 6059 evplzcntd(dst, ktmp, dst, merge, vec_enc); 6060 vpackusdw(dst, xtmp2, dst, vec_enc); 6061 break; 6062 case T_BYTE: 6063 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 6064 // accessing the lookup table. 6065 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 6066 // accessing the lookup table. 6067 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 6068 assert(VM_Version::supports_avx512bw(), ""); 6069 evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp); 6070 vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc); 6071 vpand(xtmp2, dst, src, vec_enc); 6072 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 6073 vpsrlw(xtmp3, src, 4, vec_enc); 6074 vpand(xtmp3, dst, xtmp3, vec_enc); 6075 vpshufb(dst, xtmp1, xtmp3, vec_enc); 6076 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 6077 evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc); 6078 evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc); 6079 break; 6080 default: 6081 fatal("Unsupported type %s", type2name(bt)); 6082 break; 6083 } 6084 } 6085 6086 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6087 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6088 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp); 6089 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 6090 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 6091 // accessing the lookup table. 6092 vpand(dst, xtmp2, src, vec_enc); 6093 vpshufb(dst, xtmp1, dst, vec_enc); 6094 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 6095 // accessing the lookup table. 6096 vpsrlw(xtmp3, src, 4, vec_enc); 6097 vpand(xtmp3, xtmp2, xtmp3, vec_enc); 6098 vpshufb(xtmp2, xtmp1, xtmp3, vec_enc); 6099 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 
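  // Scalar sketch of the T1/T2 combination (illustration only; the LUT values are assumed here,
  // each mapping a nibble to its leading zero count within 4 bits):
  //   static const uint8_t CLZ4_LUT[16] = {4,3,2,2, 1,1,1,1, 0,0,0,0, 0,0,0,0};
  //   uint8_t clz8(uint8_t b) {
  //     uint8_t hi = b >> 4, lo = b & 0x0F;
  //     return (hi == 0) ? CLZ4_LUT[hi] + CLZ4_LUT[lo]   /* == 4 + clz4(lo) */
  //                      : CLZ4_LUT[hi];
  //   }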
  vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
  vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
  vpaddb(dst, dst, xtmp2, vec_enc);
  vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
}

void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
  // Add the zero counts of the lower byte and upper byte of a word if the
  // upper byte holds a zero value.
  vpsrlw(xtmp3, src, 8, vec_enc);
  // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
  vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
  vpsllw(xtmp2, dst, 8, vec_enc);
  vpaddw(xtmp2, xtmp2, dst, vec_enc);
  vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
  vpsrlw(dst, dst, 8, vec_enc);
}

void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                           XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
  // Since the IEEE 754 floating point format represents the mantissa in 1.x form, the
  // biased exponent can be used to compute the leading zero count as per the
  // following formula:
  // LZCNT = 31 - (biased_exp - 127)
  // Special handling has been introduced for Zero, Max_Int and -ve source values.

  // Broadcast 0xFF
  vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);
  vpsrld(xtmp1, xtmp1, 24, vec_enc);

  // Remove the bit to the right of the highest set bit, ensuring that the conversion to float cannot round up to a higher
  // power of 2, which has a higher exponent than the input. This transformation is valid as only the highest set bit
  // contributes to the leading number of zeros.
  vpsrld(xtmp2, src, 1, vec_enc);
  vpandn(xtmp3, xtmp2, src, vec_enc);

  // Extract the biased exponent.
  vcvtdq2ps(dst, xtmp3, vec_enc);
  vpsrld(dst, dst, 23, vec_enc);
  vpand(dst, dst, xtmp1, vec_enc);

  // Broadcast 127.
  vpsrld(xtmp1, xtmp1, 1, vec_enc);
  // Exponent = biased_exp - 127
  vpsubd(dst, dst, xtmp1, vec_enc);

  // Exponent_plus_one = Exponent + 1
  vpsrld(xtmp3, xtmp1, 6, vec_enc);
  vpaddd(dst, dst, xtmp3, vec_enc);

  // Replace a -ve exponent with zero; the exponent is -ve when the src
  // lane contains a zero value.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  vblendvps(dst, dst, xtmp2, dst, vec_enc);

  // Rematerialize broadcast 32.
  vpslld(xtmp1, xtmp3, 5, vec_enc);
  // The exponent is 32 if the corresponding source lane contains the max_int value.
  vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
  // LZCNT = 32 - exponent_plus_one
  vpsubd(dst, xtmp1, dst, vec_enc);

  // Replace LZCNT with the value 1 if the corresponding source lane
  // contains the max_int value.
  vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc);

  // Replace biased_exp with 0 if the source lane value is less than zero.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  vblendvps(dst, dst, xtmp2, src, vec_enc);
}

void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                            XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
  // Add the zero counts of the lower word and upper word of a double word if the
  // upper word holds a zero value.
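  // (Scalar shape of this merge step, for illustration only:
  //    clz32(x) = ((x >> 16) == 0) ? 16 + clz16(x & 0xFFFF) : clz16(x >> 16); )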
6178 vpsrld(xtmp3, src, 16, vec_enc); 6179 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 6180 vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc); 6181 vpslld(xtmp2, dst, 16, vec_enc); 6182 vpaddd(xtmp2, xtmp2, dst, vec_enc); 6183 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6184 vpsrld(dst, dst, 16, vec_enc); 6185 // Add zero counts of lower doubleword and upper doubleword of a 6186 // quadword if upper doubleword holds a zero value. 6187 vpsrlq(xtmp3, src, 32, vec_enc); 6188 vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc); 6189 vpsllq(xtmp2, dst, 32, vec_enc); 6190 vpaddq(xtmp2, xtmp2, dst, vec_enc); 6191 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6192 vpsrlq(dst, dst, 32, vec_enc); 6193 } 6194 6195 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, 6196 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 6197 Register rtmp, int vec_enc) { 6198 assert(is_integral_type(bt), "unexpected type"); 6199 assert(vec_enc < Assembler::AVX_512bit, ""); 6200 switch(bt) { 6201 case T_LONG: 6202 vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6203 break; 6204 case T_INT: 6205 vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc); 6206 break; 6207 case T_SHORT: 6208 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6209 break; 6210 case T_BYTE: 6211 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6212 break; 6213 default: 6214 fatal("Unsupported type %s", type2name(bt)); 6215 break; 6216 } 6217 } 6218 6219 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) { 6220 switch(bt) { 6221 case T_BYTE: 6222 vpsubb(dst, src1, src2, vec_enc); 6223 break; 6224 case T_SHORT: 6225 vpsubw(dst, src1, src2, vec_enc); 6226 break; 6227 case T_INT: 6228 vpsubd(dst, src1, src2, vec_enc); 6229 break; 6230 case T_LONG: 6231 vpsubq(dst, src1, src2, vec_enc); 6232 break; 6233 default: 6234 fatal("Unsupported type %s", type2name(bt)); 6235 break; 6236 } 6237 } 6238 6239 // Trailing zero count computation is based on leading zero count operation as per 6240 // following equation. All AVX3 targets support AVX512CD feature which offers 6241 // direct vector instruction to compute leading zero count. 
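// The identity below can be sanity-checked on a scalar example: for x = 8 (0b1000),
// (x - 1) & ~x = 0b0111, whose 32 bit CLZ is 29, and 32 - 29 = 3 == CTZ(8).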
6242 // CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
6243 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6244 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6245 XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
6246 assert(is_integral_type(bt), "");
6247 // xtmp4 = -1
6248 vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
6249 // xtmp4 = xtmp4 + src
6250 vpadd(bt, xtmp4, xtmp4, src, vec_enc);
6251 // xtmp4 = xtmp4 & ~src
6252 vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
6253 vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
6254 vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
6255 vpsub(bt, dst, xtmp4, dst, vec_enc);
6256 }
6257
6258 // Trailing zero count computation for AVX2 targets is based on popcount operation as per following equation
6259 // CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
6260 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6261 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6262 assert(is_integral_type(bt), "");
6263 // xtmp3 = 0
6264 vpxor(xtmp3, xtmp3, xtmp3, vec_enc);
6265 // xtmp3 = 0 - src
6266 vpsub(bt, xtmp3, xtmp3, src, vec_enc);
6267 // xtmp3 = xtmp3 | src
6268 vpor(xtmp3, xtmp3, src, vec_enc);
6269 vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
6270 vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
6271 vpsub(bt, dst, xtmp1, dst, vec_enc);
6272 }
6273
6274 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
6275 Label done;
6276 Label neg_divisor_fastpath;
6277 cmpl(divisor, 0);
6278 jccb(Assembler::less, neg_divisor_fastpath);
6279 xorl(rdx, rdx);
6280 divl(divisor);
6281 jmpb(done);
6282 bind(neg_divisor_fastpath);
6283 // Fastpath for divisor < 0:
6284 // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6285 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
6286 movl(rdx, rax);
6287 subl(rdx, divisor);
6288 if (VM_Version::supports_bmi1()) {
6289 andnl(rax, rdx, rax);
6290 } else {
6291 notl(rdx);
6292 andl(rax, rdx);
6293 }
6294 shrl(rax, 31);
6295 bind(done);
6296 }
6297
6298 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
6299 Label done;
6300 Label neg_divisor_fastpath;
6301 cmpl(divisor, 0);
6302 jccb(Assembler::less, neg_divisor_fastpath);
6303 xorl(rdx, rdx);
6304 divl(divisor);
6305 jmpb(done);
6306 bind(neg_divisor_fastpath);
6307 // Fastpath when divisor < 0:
6308 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6309 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6310 movl(rdx, rax);
6311 subl(rax, divisor);
6312 if (VM_Version::supports_bmi1()) {
6313 andnl(rax, rax, rdx);
6314 } else {
6315 notl(rax);
6316 andl(rax, rdx);
6317 }
6318 sarl(rax, 31);
6319 andl(rax, divisor);
6320 subl(rdx, rax);
6321 bind(done);
6322 }
6323
6324 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
6325 Label done;
6326 Label neg_divisor_fastpath;
6327
6328 cmpl(divisor, 0);
6329 jccb(Assembler::less, neg_divisor_fastpath);
6330 xorl(rdx, rdx);
6331 divl(divisor);
6332 jmpb(done);
6333 bind(neg_divisor_fastpath);
6334 // Fastpath for divisor < 0:
6335 // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6336 //
remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor) 6337 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in 6338 // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned() 6339 movl(rdx, rax); 6340 subl(rax, divisor); 6341 if (VM_Version::supports_bmi1()) { 6342 andnl(rax, rax, rdx); 6343 } else { 6344 notl(rax); 6345 andl(rax, rdx); 6346 } 6347 movl(tmp, rax); 6348 shrl(rax, 31); // quotient 6349 sarl(tmp, 31); 6350 andl(tmp, divisor); 6351 subl(rdx, tmp); // remainder 6352 bind(done); 6353 } 6354 6355 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1, 6356 XMMRegister xtmp2, Register rtmp) { 6357 if(VM_Version::supports_gfni()) { 6358 // Galois field instruction based bit reversal based on following algorithm. 6359 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 6360 mov64(rtmp, 0x8040201008040201L); 6361 movq(xtmp1, src); 6362 movq(xtmp2, rtmp); 6363 gf2p8affineqb(xtmp1, xtmp2, 0); 6364 movq(dst, xtmp1); 6365 } else { 6366 // Swap even and odd numbered bits. 6367 movl(rtmp, src); 6368 andl(rtmp, 0x55555555); 6369 shll(rtmp, 1); 6370 movl(dst, src); 6371 andl(dst, 0xAAAAAAAA); 6372 shrl(dst, 1); 6373 orl(dst, rtmp); 6374 6375 // Swap LSB and MSB 2 bits of each nibble. 6376 movl(rtmp, dst); 6377 andl(rtmp, 0x33333333); 6378 shll(rtmp, 2); 6379 andl(dst, 0xCCCCCCCC); 6380 shrl(dst, 2); 6381 orl(dst, rtmp); 6382 6383 // Swap LSB and MSB 4 bits of each byte. 6384 movl(rtmp, dst); 6385 andl(rtmp, 0x0F0F0F0F); 6386 shll(rtmp, 4); 6387 andl(dst, 0xF0F0F0F0); 6388 shrl(dst, 4); 6389 orl(dst, rtmp); 6390 } 6391 bswapl(dst); 6392 } 6393 6394 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1, 6395 XMMRegister xtmp2, Register rtmp1, Register rtmp2) { 6396 if(VM_Version::supports_gfni()) { 6397 // Galois field instruction based bit reversal based on following algorithm. 6398 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 6399 mov64(rtmp1, 0x8040201008040201L); 6400 movq(xtmp1, src); 6401 movq(xtmp2, rtmp1); 6402 gf2p8affineqb(xtmp1, xtmp2, 0); 6403 movq(dst, xtmp1); 6404 } else { 6405 // Swap even and odd numbered bits. 6406 movq(rtmp1, src); 6407 mov64(rtmp2, 0x5555555555555555L); 6408 andq(rtmp1, rtmp2); 6409 shlq(rtmp1, 1); 6410 movq(dst, src); 6411 notq(rtmp2); 6412 andq(dst, rtmp2); 6413 shrq(dst, 1); 6414 orq(dst, rtmp1); 6415 6416 // Swap LSB and MSB 2 bits of each nibble. 6417 movq(rtmp1, dst); 6418 mov64(rtmp2, 0x3333333333333333L); 6419 andq(rtmp1, rtmp2); 6420 shlq(rtmp1, 2); 6421 notq(rtmp2); 6422 andq(dst, rtmp2); 6423 shrq(dst, 2); 6424 orq(dst, rtmp1); 6425 6426 // Swap LSB and MSB 4 bits of each byte. 
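// (Illustration: this last in-byte step turns a byte such as 0b10110001 into 0b00011011; together with
// the 1-bit and 2-bit swaps above and the final bswapq below, every bit of the 64-bit value is mirrored.)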
6427 movq(rtmp1, dst); 6428 mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL); 6429 andq(rtmp1, rtmp2); 6430 shlq(rtmp1, 4); 6431 notq(rtmp2); 6432 andq(dst, rtmp2); 6433 shrq(dst, 4); 6434 orq(dst, rtmp1); 6435 } 6436 bswapq(dst); 6437 } 6438 6439 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) { 6440 Label done; 6441 Label neg_divisor_fastpath; 6442 cmpq(divisor, 0); 6443 jccb(Assembler::less, neg_divisor_fastpath); 6444 xorl(rdx, rdx); 6445 divq(divisor); 6446 jmpb(done); 6447 bind(neg_divisor_fastpath); 6448 // Fastpath for divisor < 0: 6449 // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1) 6450 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned() 6451 movq(rdx, rax); 6452 subq(rdx, divisor); 6453 if (VM_Version::supports_bmi1()) { 6454 andnq(rax, rdx, rax); 6455 } else { 6456 notq(rdx); 6457 andq(rax, rdx); 6458 } 6459 shrq(rax, 63); 6460 bind(done); 6461 } 6462 6463 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) { 6464 Label done; 6465 Label neg_divisor_fastpath; 6466 cmpq(divisor, 0); 6467 jccb(Assembler::less, neg_divisor_fastpath); 6468 xorq(rdx, rdx); 6469 divq(divisor); 6470 jmp(done); 6471 bind(neg_divisor_fastpath); 6472 // Fastpath when divisor < 0: 6473 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor) 6474 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned() 6475 movq(rdx, rax); 6476 subq(rax, divisor); 6477 if (VM_Version::supports_bmi1()) { 6478 andnq(rax, rax, rdx); 6479 } else { 6480 notq(rax); 6481 andq(rax, rdx); 6482 } 6483 sarq(rax, 63); 6484 andq(rax, divisor); 6485 subq(rdx, rax); 6486 bind(done); 6487 } 6488 6489 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) { 6490 Label done; 6491 Label neg_divisor_fastpath; 6492 cmpq(divisor, 0); 6493 jccb(Assembler::less, neg_divisor_fastpath); 6494 xorq(rdx, rdx); 6495 divq(divisor); 6496 jmp(done); 6497 bind(neg_divisor_fastpath); 6498 // Fastpath for divisor < 0: 6499 // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1) 6500 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor) 6501 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in 6502 // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned() 6503 movq(rdx, rax); 6504 subq(rax, divisor); 6505 if (VM_Version::supports_bmi1()) { 6506 andnq(rax, rax, rdx); 6507 } else { 6508 notq(rax); 6509 andq(rax, rdx); 6510 } 6511 movq(tmp, rax); 6512 shrq(rax, 63); // quotient 6513 sarq(tmp, 63); 6514 andq(tmp, divisor); 6515 subq(rdx, tmp); // remainder 6516 bind(done); 6517 } 6518 6519 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1, 6520 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp, 6521 int vlen_enc) { 6522 assert(VM_Version::supports_avx512bw(), ""); 6523 // Byte shuffles are inlane operations and indices are determined using 6524 // lower 4 bit of each shuffle lane, thus all shuffle indices are 6525 // normalized to index range 0-15. This makes sure that all the multiples 6526 // of an index value are placed at same relative position in 128 bit 6527 // lane i.e. elements corresponding to shuffle indices 16, 32 and 64 6528 // will be 16th element in their respective 128 bit lanes. 
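// The code below therefore runs four passes: pass i (i = 0..3) broadcasts the i-th 128-bit source lane
// across the whole vector and merges its in-lane shuffle result into dst for shuffle indices in the
// range [16*i, 16*(i+1)). For example, shuffle index 37 selects byte 5 of the third 128-bit lane.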
6529 movl(rtmp, 16); 6530 evpbroadcastb(xtmp1, rtmp, vlen_enc); 6531 6532 // Compute a mask for shuffle vector by comparing indices with expression INDEX < 16, 6533 // Broadcast first 128 bit lane across entire vector, shuffle the vector lanes using 6534 // original shuffle indices and move the shuffled lanes corresponding to true 6535 // mask to destination vector. 6536 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc); 6537 evshufi64x2(xtmp2, src, src, 0x0, vlen_enc); 6538 evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc); 6539 6540 // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32 6541 // and broadcasting second 128 bit lane. 6542 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc); 6543 vpsllq(xtmp2, xtmp1, 0x1, vlen_enc); 6544 evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc); 6545 evshufi64x2(xtmp3, src, src, 0x55, vlen_enc); 6546 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc); 6547 6548 // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48 6549 // and broadcasting third 128 bit lane. 6550 evpcmpb(ktmp, k0, shuffle, xtmp2, Assembler::nlt, true, vlen_enc); 6551 vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc); 6552 evpcmpb(ktmp, ktmp, shuffle, xtmp1, Assembler::lt, true, vlen_enc); 6553 evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc); 6554 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc); 6555 6556 // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64 6557 // and broadcasting third 128 bit lane. 6558 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc); 6559 vpsllq(xtmp2, xtmp2, 0x1, vlen_enc); 6560 evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc); 6561 evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc); 6562 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc); 6563 } 6564 6565 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst, 6566 XMMRegister shuffle, XMMRegister src, int vlen_enc) { 6567 if (vlen_enc == AVX_128bit) { 6568 vpermilps(dst, src, shuffle, vlen_enc); 6569 } else if (bt == T_INT) { 6570 vpermd(dst, shuffle, src, vlen_enc); 6571 } else { 6572 assert(bt == T_FLOAT, ""); 6573 vpermps(dst, shuffle, src, vlen_enc); 6574 } 6575 } 6576 6577 void C2_MacroAssembler::efp16sh(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 6578 switch(opcode) { 6579 case Op_AddHF: vaddsh(dst, src1, src2); break; 6580 case Op_SubHF: vsubsh(dst, src1, src2); break; 6581 case Op_MulHF: vmulsh(dst, src1, src2); break; 6582 case Op_DivHF: vdivsh(dst, src1, src2); break; 6583 default: assert(false, "%s", NodeClassNames[opcode]); break; 6584 } 6585 } 6586 6587 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { 6588 switch(elem_bt) { 6589 case T_BYTE: 6590 if (ideal_opc == Op_SaturatingAddV) { 6591 vpaddsb(dst, src1, src2, vlen_enc); 6592 } else { 6593 assert(ideal_opc == Op_SaturatingSubV, ""); 6594 vpsubsb(dst, src1, src2, vlen_enc); 6595 } 6596 break; 6597 case T_SHORT: 6598 if (ideal_opc == Op_SaturatingAddV) { 6599 vpaddsw(dst, src1, src2, vlen_enc); 6600 } else { 6601 assert(ideal_opc == Op_SaturatingSubV, ""); 6602 vpsubsw(dst, src1, src2, vlen_enc); 6603 } 6604 break; 6605 default: 6606 fatal("Unsupported type %s", type2name(elem_bt)); 6607 break; 6608 } 6609 } 6610 6611 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, 
XMMRegister src2, int vlen_enc) {
6612 switch(elem_bt) {
6613 case T_BYTE:
6614 if (ideal_opc == Op_SaturatingAddV) {
6615 vpaddusb(dst, src1, src2, vlen_enc);
6616 } else {
6617 assert(ideal_opc == Op_SaturatingSubV, "");
6618 vpsubusb(dst, src1, src2, vlen_enc);
6619 }
6620 break;
6621 case T_SHORT:
6622 if (ideal_opc == Op_SaturatingAddV) {
6623 vpaddusw(dst, src1, src2, vlen_enc);
6624 } else {
6625 assert(ideal_opc == Op_SaturatingSubV, "");
6626 vpsubusw(dst, src1, src2, vlen_enc);
6627 }
6628 break;
6629 default:
6630 fatal("Unsupported type %s", type2name(elem_bt));
6631 break;
6632 }
6633 }
6634
6635 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6636 XMMRegister src2, KRegister ktmp, int vlen_enc) {
6637 // For unsigned subtraction, overflow happens when the magnitude of the second input is greater than that of the first input.
6638 // overflow_mask = Inp1 <u Inp2
6639 evpcmpu(elem_bt, ktmp, src2, src1, Assembler::lt, vlen_enc);
6640 // Res = overflow_mask ? Zero : INP1 - INP2 (non-commutative and non-associative)
6641 evmasked_op(elem_bt == T_INT ? Op_SubVI : Op_SubVL, elem_bt, ktmp, dst, src1, src2, false, vlen_enc, false);
6642 }
6643
6644 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6645 XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
6646 // Emulate unsigned comparison using signed comparison
6647 // Mask = Inp1 <u Inp2 => Inp1 + MIN_VALUE < Inp2 + MIN_VALUE
6648 vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc, true);
6649 vpadd(elem_bt, xtmp2, src1, xtmp1, vlen_enc);
6650 vpadd(elem_bt, xtmp1, src2, xtmp1, vlen_enc);
6651
6652 vpcmpgt(elem_bt, xtmp2, xtmp1, xtmp2, vlen_enc);
6653
6654 // Res = INP1 - INP2 (non-commutative and non-associative)
6655 vpsub(elem_bt, dst, src1, src2, vlen_enc);
6656 // Res = Mask ? Zero : Res
6657 vpxor(xtmp1, xtmp1, xtmp1, vlen_enc);
6658 vpblendvb(dst, dst, xtmp1, xtmp2, vlen_enc);
6659 }
6660
6661 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6662 XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp, int vlen_enc) {
6663 // Unsigned value ranges comprise only +ve numbers, thus only an upper bound saturation exists.
6664 // overflow_mask = (SRC1 + SRC2) <u (SRC1 | SRC2)
6665 // Res = Signed Add INP1, INP2
6666 vpadd(elem_bt, dst, src1, src2, vlen_enc);
6667 // T1 = SRC1 | SRC2
6668 vpor(xtmp1, src1, src2, vlen_enc);
6669 // Max_Unsigned = -1
6670 vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6671 // Unsigned compare: Mask = Res <u T1
6672 evpcmpu(elem_bt, ktmp, dst, xtmp1, Assembler::lt, vlen_enc);
6673 // res = Mask ? Max_Unsigned : Res
6674 evpblend(elem_bt, dst, ktmp, dst, xtmp2, true, vlen_enc);
6675 }
6676
6677 //
6678 // Section 2-13 of Hacker's Delight lists the following overflow detection check for a saturating
6679 // unsigned addition operation.
6680 // overflow_mask = ((a & b) | ((a | b) & ~(a + b))) >>> 31 == 1
6681 //
6682 // We empirically determined its semantic equivalence to the following reduced expression
6683 // overflow_mask = (a + b) <u (a | b)
6684 //
6685 // and also verified it through the Alive2 solver.
6686 // (https://alive2.llvm.org/ce/z/XDQ7dY) 6687 // 6688 6689 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, 6690 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, int vlen_enc) { 6691 // Res = Signed Add INP1, INP2 6692 vpadd(elem_bt, dst, src1, src2, vlen_enc); 6693 // Compute T1 = INP1 | INP2 6694 vpor(xtmp3, src1, src2, vlen_enc); 6695 // T1 = Minimum signed value. 6696 vpgenmin_value(elem_bt, xtmp2, xtmp1, vlen_enc, true); 6697 // Convert T1 to signed value, T1 = T1 + MIN_VALUE 6698 vpadd(elem_bt, xtmp3, xtmp3, xtmp2, vlen_enc); 6699 // Convert Res to signed value, Res<s> = Res + MIN_VALUE 6700 vpadd(elem_bt, xtmp2, xtmp2, dst, vlen_enc); 6701 // Compute overflow detection mask = Res<1> <s T1 6702 if (elem_bt == T_INT) { 6703 vpcmpgtd(xtmp3, xtmp3, xtmp2, vlen_enc); 6704 } else { 6705 assert(elem_bt == T_LONG, ""); 6706 vpcmpgtq(xtmp3, xtmp3, xtmp2, vlen_enc); 6707 } 6708 vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc); 6709 } 6710 6711 void C2_MacroAssembler::evpmovq2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 6712 int vlen_enc, bool xtmp2_hold_M1) { 6713 if (VM_Version::supports_avx512dq()) { 6714 evpmovq2m(ktmp, src, vlen_enc); 6715 } else { 6716 assert(VM_Version::supports_evex(), ""); 6717 if (!xtmp2_hold_M1) { 6718 vpternlogq(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc); 6719 } 6720 evpsraq(xtmp1, src, 63, vlen_enc); 6721 evpcmpeqq(ktmp, k0, xtmp1, xtmp2, vlen_enc); 6722 } 6723 } 6724 6725 void C2_MacroAssembler::evpmovd2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 6726 int vlen_enc, bool xtmp2_hold_M1) { 6727 if (VM_Version::supports_avx512dq()) { 6728 evpmovd2m(ktmp, src, vlen_enc); 6729 } else { 6730 assert(VM_Version::supports_evex(), ""); 6731 if (!xtmp2_hold_M1) { 6732 vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc); 6733 } 6734 vpsrad(xtmp1, src, 31, vlen_enc); 6735 Assembler::evpcmpeqd(ktmp, k0, xtmp1, xtmp2, vlen_enc); 6736 } 6737 } 6738 6739 6740 void C2_MacroAssembler::vpsign_extend_dq(BasicType elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) { 6741 if (elem_bt == T_LONG) { 6742 if (VM_Version::supports_evex()) { 6743 evpsraq(dst, src, 63, vlen_enc); 6744 } else { 6745 vpsrad(dst, src, 31, vlen_enc); 6746 vpshufd(dst, dst, 0xF5, vlen_enc); 6747 } 6748 } else { 6749 assert(elem_bt == T_INT, ""); 6750 vpsrad(dst, src, 31, vlen_enc); 6751 } 6752 } 6753 6754 void C2_MacroAssembler::vpgenmax_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) { 6755 if (compute_allones) { 6756 if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) { 6757 vpternlogd(allones, 0xff, allones, allones, vlen_enc); 6758 } else { 6759 vpcmpeqq(allones, allones, allones, vlen_enc); 6760 } 6761 } 6762 if (elem_bt == T_LONG) { 6763 vpsrlq(dst, allones, 1, vlen_enc); 6764 } else { 6765 assert(elem_bt == T_INT, ""); 6766 vpsrld(dst, allones, 1, vlen_enc); 6767 } 6768 } 6769 6770 void C2_MacroAssembler::vpgenmin_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) { 6771 if (compute_allones) { 6772 if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) { 6773 vpternlogd(allones, 0xff, allones, allones, vlen_enc); 6774 } else { 6775 vpcmpeqq(allones, allones, allones, vlen_enc); 6776 } 6777 } 6778 if (elem_bt == T_LONG) { 6779 vpsllq(dst, allones, 63, vlen_enc); 6780 } else { 6781 assert(elem_bt == T_INT, ""); 6782 
vpslld(dst, allones, 31, vlen_enc);
6783 }
6784 }
6785
6786 void C2_MacroAssembler::evpcmpu(BasicType elem_bt, KRegister kmask, XMMRegister src1, XMMRegister src2,
6787 Assembler::ComparisonPredicate cond, int vlen_enc) {
6788 switch(elem_bt) {
6789 case T_LONG: evpcmpuq(kmask, src1, src2, cond, vlen_enc); break;
6790 case T_INT: evpcmpud(kmask, src1, src2, cond, vlen_enc); break;
6791 case T_SHORT: evpcmpuw(kmask, src1, src2, cond, vlen_enc); break;
6792 case T_BYTE: evpcmpub(kmask, src1, src2, cond, vlen_enc); break;
6793 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6794 }
6795 }
6796
6797 void C2_MacroAssembler::vpcmpgt(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6798 switch(elem_bt) {
6799 case T_LONG: vpcmpgtq(dst, src1, src2, vlen_enc); break;
6800 case T_INT: vpcmpgtd(dst, src1, src2, vlen_enc); break;
6801 case T_SHORT: vpcmpgtw(dst, src1, src2, vlen_enc); break;
6802 case T_BYTE: vpcmpgtb(dst, src1, src2, vlen_enc); break;
6803 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6804 }
6805 }
6806
6807 void C2_MacroAssembler::evpmov_vec_to_mask(BasicType elem_bt, KRegister ktmp, XMMRegister src, XMMRegister xtmp1,
6808 XMMRegister xtmp2, int vlen_enc, bool xtmp2_hold_M1) {
6809 if (elem_bt == T_LONG) {
6810 evpmovq2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6811 } else {
6812 assert(elem_bt == T_INT, "");
6813 evpmovd2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6814 }
6815 }
6816
6817 void C2_MacroAssembler::vector_addsub_dq_saturating_evex(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6818 XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
6819 KRegister ktmp1, KRegister ktmp2, int vlen_enc) {
6820 assert(elem_bt == T_INT || elem_bt == T_LONG, "");
6821 // Addition/Subtraction happens over two's complement representation of numbers and is agnostic to signedness.
6822 // Overflow detection is based on Hacker's Delight section 2-13.
6823 if (ideal_opc == Op_SaturatingAddV) {
6824 // res = src1 + src2
6825 vpadd(elem_bt, dst, src1, src2, vlen_enc);
6826 // Overflow occurs if the result polarity differs from that of the (same polarity) inputs.
6827 // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
6828 vpxor(xtmp1, dst, src1, vlen_enc);
6829 vpxor(xtmp2, dst, src2, vlen_enc);
6830 vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6831 } else {
6832 assert(ideal_opc == Op_SaturatingSubV, "");
6833 // res = src1 - src2
6834 vpsub(elem_bt, dst, src1, src2, vlen_enc);
6835 // Overflow occurs when the inputs have opposite polarity and the
6836 // result polarity differs from the first input polarity.
6837 // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
6838 vpxor(xtmp1, src1, src2, vlen_enc);
6839 vpxor(xtmp2, dst, src1, vlen_enc);
6840 vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6841 }
6842
6843 // Compute overflow detection mask.
6844 evpmov_vec_to_mask(elem_bt, ktmp1, xtmp2, xtmp2, xtmp1, vlen_enc);
6845 // Note: xtmp1 holds -1 in all its lanes after the above call.
6846
6847 // Compute mask based on first input polarity.
6848 evpmov_vec_to_mask(elem_bt, ktmp2, src1, xtmp2, xtmp1, vlen_enc, true);
6849
6850 vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
6851 vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);
6852
6853 // Compose a vector of saturating (MAX/MIN) values, where lanes corresponding to
6854 // set bits in the first input polarity mask hold the min value.
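// Rationale: on overflow the saturated result follows the sign of the first input, e.g. for T_INT it is
// MIN_VALUE (0x80000000) when src1 is negative and MAX_VALUE (0x7FFFFFFF) otherwise.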
6855 evpblend(elem_bt, xtmp2, ktmp2, xtmp2, xtmp1, true, vlen_enc);
6856 // Blend destination lanes with saturated values using overflow detection mask.
6857 evpblend(elem_bt, dst, ktmp1, dst, xtmp2, true, vlen_enc);
6858 }
6859
6860
6861 void C2_MacroAssembler::vector_addsub_dq_saturating_avx(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6862 XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
6863 XMMRegister xtmp3, XMMRegister xtmp4, int vlen_enc) {
6864 assert(elem_bt == T_INT || elem_bt == T_LONG, "");
6865 // Addition/Subtraction happens over two's complement representation of numbers and is agnostic to signedness.
6866 // Overflow detection is based on Hacker's Delight section 2-13.
6867 if (ideal_opc == Op_SaturatingAddV) {
6868 // res = src1 + src2
6869 vpadd(elem_bt, dst, src1, src2, vlen_enc);
6870 // Overflow occurs if the result polarity differs from that of the (same polarity) inputs.
6871 // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
6872 vpxor(xtmp1, dst, src1, vlen_enc);
6873 vpxor(xtmp2, dst, src2, vlen_enc);
6874 vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6875 } else {
6876 assert(ideal_opc == Op_SaturatingSubV, "");
6877 // res = src1 - src2
6878 vpsub(elem_bt, dst, src1, src2, vlen_enc);
6879 // Overflow occurs when the inputs have opposite polarity and the
6880 // result polarity differs from the first input polarity.
6881 // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
6882 vpxor(xtmp1, src1, src2, vlen_enc);
6883 vpxor(xtmp2, dst, src1, vlen_enc);
6884 vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6885 }
6886
6887 // Sign-extend to compute overflow detection mask.
6888 vpsign_extend_dq(elem_bt, xtmp3, xtmp2, vlen_enc);
6889
6890 vpcmpeqd(xtmp1, xtmp1, xtmp1, vlen_enc);
6891 vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc);
6892 vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);
6893
6894 // Compose saturating min/max vector using the first input polarity mask.
6895 vpsign_extend_dq(elem_bt, xtmp4, src1, vlen_enc);
6896 vpblendvb(xtmp1, xtmp2, xtmp1, xtmp4, vlen_enc);
6897
6898 // Blend result with saturating vector using overflow detection mask.
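// vpblendvb selects each byte based on the most significant bit of the corresponding mask byte; since
// xtmp3 holds the sign-extended overflow indicator (all-ones or all-zeros per lane), an overflowed lane
// is replaced wholesale by its saturated value while other lanes keep the plain add/subtract result.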
6899 vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc); 6900 } 6901 6902 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) { 6903 switch(elem_bt) { 6904 case T_BYTE: 6905 if (ideal_opc == Op_SaturatingAddV) { 6906 vpaddsb(dst, src1, src2, vlen_enc); 6907 } else { 6908 assert(ideal_opc == Op_SaturatingSubV, ""); 6909 vpsubsb(dst, src1, src2, vlen_enc); 6910 } 6911 break; 6912 case T_SHORT: 6913 if (ideal_opc == Op_SaturatingAddV) { 6914 vpaddsw(dst, src1, src2, vlen_enc); 6915 } else { 6916 assert(ideal_opc == Op_SaturatingSubV, ""); 6917 vpsubsw(dst, src1, src2, vlen_enc); 6918 } 6919 break; 6920 default: 6921 fatal("Unsupported type %s", type2name(elem_bt)); 6922 break; 6923 } 6924 } 6925 6926 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) { 6927 switch(elem_bt) { 6928 case T_BYTE: 6929 if (ideal_opc == Op_SaturatingAddV) { 6930 vpaddusb(dst, src1, src2, vlen_enc); 6931 } else { 6932 assert(ideal_opc == Op_SaturatingSubV, ""); 6933 vpsubusb(dst, src1, src2, vlen_enc); 6934 } 6935 break; 6936 case T_SHORT: 6937 if (ideal_opc == Op_SaturatingAddV) { 6938 vpaddusw(dst, src1, src2, vlen_enc); 6939 } else { 6940 assert(ideal_opc == Op_SaturatingSubV, ""); 6941 vpsubusw(dst, src1, src2, vlen_enc); 6942 } 6943 break; 6944 default: 6945 fatal("Unsupported type %s", type2name(elem_bt)); 6946 break; 6947 } 6948 } 6949 6950 void C2_MacroAssembler::select_from_two_vectors_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, 6951 XMMRegister src2, int vlen_enc) { 6952 switch(elem_bt) { 6953 case T_BYTE: 6954 evpermi2b(dst, src1, src2, vlen_enc); 6955 break; 6956 case T_SHORT: 6957 evpermi2w(dst, src1, src2, vlen_enc); 6958 break; 6959 case T_INT: 6960 evpermi2d(dst, src1, src2, vlen_enc); 6961 break; 6962 case T_LONG: 6963 evpermi2q(dst, src1, src2, vlen_enc); 6964 break; 6965 case T_FLOAT: 6966 evpermi2ps(dst, src1, src2, vlen_enc); 6967 break; 6968 case T_DOUBLE: 6969 evpermi2pd(dst, src1, src2, vlen_enc); 6970 break; 6971 default: 6972 fatal("Unsupported type %s", type2name(elem_bt)); 6973 break; 6974 } 6975 } 6976 6977 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, bool is_unsigned, int vlen_enc) { 6978 if (is_unsigned) { 6979 vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc); 6980 } else { 6981 vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc); 6982 } 6983 } 6984 6985 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, bool is_unsigned, int vlen_enc) { 6986 if (is_unsigned) { 6987 vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc); 6988 } else { 6989 vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc); 6990 } 6991 } 6992 6993 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { 6994 switch(opcode) { 6995 case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break; 6996 case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break; 6997 case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break; 6998 case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break; 6999 default: assert(false, "%s", NodeClassNames[opcode]); break; 7000 } 7001 } 7002 7003 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, 
XMMRegister src1, Address src2, int vlen_enc) {
7004 switch(opcode) {
7005 case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
7006 case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
7007 case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
7008 case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
7009 default: assert(false, "%s", NodeClassNames[opcode]); break;
7010 }
7011 }
7012
7013 void C2_MacroAssembler::scalar_max_min_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
7014 KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2) {
7015 vector_max_min_fp16(opcode, dst, src1, src2, ktmp, xtmp1, xtmp2, Assembler::AVX_128bit);
7016 }
7017
7018 void C2_MacroAssembler::vector_max_min_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
7019 KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
7020 if (opcode == Op_MaxVHF || opcode == Op_MaxHF) {
7021 // Move sign bits of src2 to mask register.
7022 evpmovw2m(ktmp, src2, vlen_enc);
7023 // xtmp1 = src2 < 0 ? src2 : src1
7024 evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
7025 // xtmp2 = src2 < 0 ? src1 : src2
7026 evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
7027 // The idea behind the above swapping is to make the second source operand a +ve value.
7028 // As per the instruction semantics, if the values being compared are both 0.0s (of either sign), the value in
7029 // the second source operand is returned. If only one value is a NaN (SNaN or QNaN) for this instruction,
7030 // the second source operand, either a NaN or a valid floating-point value, is returned.
7031 // dst = max(xtmp1, xtmp2)
7032 evmaxph(dst, xtmp1, xtmp2, vlen_enc);
7033 // isNaN = is_unordered_quiet(xtmp1)
7034 evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
7035 // The final result is the same as the first source if it is a NaN value;
7036 // in case the second operand holds a NaN value then, as per the above semantics,
7037 // the result is the same as the second operand.
7038 Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
7039 } else {
7040 assert(opcode == Op_MinVHF || opcode == Op_MinHF, "");
7041 // Move sign bits of src1 to mask register.
7042 evpmovw2m(ktmp, src1, vlen_enc);
7043 // xtmp1 = src1 < 0 ? src2 : src1
7044 evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
7045 // xtmp2 = src1 < 0 ? src1 : src2
7046 evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
7047 // The idea behind the above swapping is to make the second source operand a -ve value.
7048 // As per the instruction semantics, if the values being compared are both 0.0s (of either sign), the value in
7049 // the second source operand is returned.
7050 // If only one value is a NaN (SNaN or QNaN) for this instruction, the second source operand, either a NaN
7051 // or a valid floating-point value, is written to the result.
7052 // dst = min(xtmp1, xtmp2)
7053 evminph(dst, xtmp1, xtmp2, vlen_enc);
7054 // isNaN = is_unordered_quiet(xtmp1)
7055 evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
7056 // The final result is the same as the first source if it is a NaN value;
7057 // in case the second operand holds a NaN value then, as per the above semantics,
7058 // the result is the same as the second operand.
7059 Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
7060 }
7061 }
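// Note: the vector_max_min_fp16 sequence above yields max/min semantics matching Java's floating point
// rules for the half-precision lanes, i.e. a NaN in either input produces NaN and +0.0 compares greater
// than -0.0; the operand swap handles the signed-zero case and the final masked move handles a NaN in
// the first source, as described in the in-line comments.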