/*
 * Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "oops/methodData.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/output.hpp"
#include "opto/opcodes.hpp"
#include "opto/subnode.hpp"
#include "runtime/globals.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/checkedCast.hpp"
#include "utilities/globalDefinitions.hpp"
#include "utilities/powerOfTwo.hpp"
#include "utilities/sizes.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

// C2 compiled method's prolog code.
void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {

  // WARNING: Initial instruction MUST be 5 bytes or longer so that
  // NativeJump::patch_verified_entry will be able to patch out the entry
  // code safely. The push to verify stack depth is ok at 5 bytes,
  // the frame allocation can be either 3 or 6 bytes. So if we don't do
  // stack bang then we must use the 6 byte frame allocation even if
  // we have no frame. :-(
  assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");

  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  // Remove word for return addr
  framesize -= wordSize;
  stack_bang_size -= wordSize;

  // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers must bang for them. But be careful, because
  // some VM calls (such as call site linkage) can use several kilobytes of
  // stack. But the stack safety zone should account for that.
  // See bugs 4446381, 4468289, 4497237.
  if (stack_bang_size > 0) {
    generate_stack_overflow_check(stack_bang_size);

    // We always push rbp so that on return to the interpreter rbp will be
    // restored correctly and we can correct the stack.
    push(rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      mov(rbp, rsp);
    }
    // Remove word for ebp
    framesize -= wordSize;

    // Create frame
    if (framesize) {
      subptr(rsp, framesize);
    }
  } else {
    // Create frame (force generation of a 4 byte immediate value)
    subptr_imm32(rsp, framesize);

    // Save RBP register now.
    framesize -= wordSize;
    movptr(Address(rsp, framesize), rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      movptr(rbp, rsp);
      if (framesize > 0) {
        addptr(rbp, framesize);
      }
    }
  }

  if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
    framesize -= wordSize;
    movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
  }

#ifdef ASSERT
  if (VerifyStackAtCalls) {
    Label L;
    push(rax);
    mov(rax, rsp);
    andptr(rax, StackAlignmentInBytes-1);
    cmpptr(rax, StackAlignmentInBytes-wordSize);
    pop(rax);
    jcc(Assembler::equal, L);
    STOP("Stack is not properly aligned!");
    bind(L);
  }
#endif

  if (!is_stub) {
    BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
    // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
    Label dummy_slow_path;
    Label dummy_continuation;
    Label* slow_path = &dummy_slow_path;
    Label* continuation = &dummy_continuation;
    if (!Compile::current()->output()->in_scratch_emit_size()) {
      // Use real labels from actual stub when not emitting code for the purpose of measuring its size
      C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
      Compile::current()->output()->add_stub(stub);
      slow_path = &stub->entry();
      continuation = &stub->continuation();
    }
    bs->nmethod_entry_barrier(this, slow_path, continuation);
  }
}

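// Shape of the frame established above (a sketch only; exact offsets depend on
// framesize and on which branch of verified_entry was taken):
//
//   higher addresses
//     return address                (pushed by the caller's call)
//     saved rbp                     (pushed, or stored at the top of the new frame)
//     0xbadb100d cookie             (only if VerifyStackAtCalls)
//     spill slots / outgoing args   (remaining framesize bytes)
//   lower addresses                 <- rsp after the prolog (alignment checked
//                                      under VerifyStackAtCalls above)
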
inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
  switch (vlen_in_bytes) {
    case  4: // fall-through
    case  8: // fall-through
    case 16: return Assembler::AVX_128bit;
    case 32: return Assembler::AVX_256bit;
    case 64: return Assembler::AVX_512bit;

    default: {
      ShouldNotReachHere();
      return Assembler::AVX_NoVec;
    }
  }
}

// fast_lock and fast_unlock used by C2

// Because the transitions from emitted code to the runtime
// monitorenter/exit helper stubs are so slow it's critical that
// we inline both the stack-locking fast path and the inflated fast path.
//
// See also: cmpFastLock and cmpFastUnlock.
//
// What follows is a specialized inline transliteration of the code
// in enter() and exit(). If we're concerned about I$ bloat another
// option would be to emit TrySlowEnter and TrySlowExit methods
// at startup-time. These methods would accept arguments as
// (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
// indications in the icc.ZFlag. fast_lock and fast_unlock would simply
// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
// In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
//
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods. That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads. We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) issue explicit barriers or fence operations.
//
// TODO:
//
// *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
//    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
//    Given TLAB allocation, Self is usually manifested in a register, so passing it into
//    the lock operators would typically be faster than reifying Self.
//
// *  Ideally I'd define the primitives as:
//       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
//       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
//    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with the rather awkward and brittle register assignments below.
//    Furthermore the register assignments are overconstrained, possibly resulting in
//    sub-optimal code near the synchronization site.
//
// *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
//    Alternately, use a better sp-proximity test.
//
// *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
//    Either one is sufficient to uniquely identify a thread.
//    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
//
// *  Intrinsify notify() and notifyAll() for the common cases where the
//    object is locked by the calling thread but the waitlist is empty.
//    Avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
//
// *  Use jccb and jmpb instead of jcc and jmp to improve code density.
//    But beware of excessive branch density on AMD Opterons.
//
// *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
//    or failure of the fast path. If the fast path fails then we pass
//    control to the slow path, typically in C. In fast_lock and
//    fast_unlock we often branch to DONE_LABEL, just to find that C2
//    will emit a conditional branch immediately after the node.
//    So we have branches to branches and lots of ICC.ZF games.
//    Instead, it might be better to have C2 pass a "FailureLabel"
//    into fast_lock and fast_unlock. In the case of success, control
//    will drop through the node. ICC.ZF is undefined at exit.
//    In the case of failure, the node will branch directly to the
//    FailureLabel.

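// Rough shape of what C2 emits around these nodes (a sketch, not the exact
// generated code): the cmpFastLock/cmpFastUnlock node runs the fast path
// below and the immediately following conditional branch consumes ZF:
//
//   fast_lock(obj, box, rax, scr, ...)   // leaves ZF == 1 on success
//   jne     slow_path                    // ZF == 0 -> runtime monitorenter
//   ...                                  // ZF == 1 -> lock held, fall through
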
// obj: object to lock
// box: on-stack box address (displaced header location) - KILLED
// rax: tmp -- KILLED
// scr: tmp -- KILLED
void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
                                  Register scrReg, Register cx1Reg, Register cx2Reg, Register thread,
                                  Metadata* method_data) {
  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
  // Ensure the register assignments are disjoint
  assert(tmpReg == rax, "");
  assert(cx1Reg == noreg, "");
  assert(cx2Reg == noreg, "");
  assert_different_registers(objReg, boxReg, tmpReg, scrReg);

  // Possible cases that we'll encounter in fast_lock
  // ------------------------------------------------
  // * Inflated
  //    -- unlocked
  //    -- Locked
  //       = by self
  //       = by other
  // * neutral
  // * stack-locked
  //    -- by self
  //       = sp-proximity test hits
  //       = sp-proximity test generates false-negative
  //    -- by other
  //

  Label IsInflated, DONE_LABEL, NO_COUNT, COUNT;

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(tmpReg, objReg, scrReg);
    testb(Address(tmpReg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
    jcc(Assembler::notZero, DONE_LABEL);
  }

  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));  // [FETCH]
  testptr(tmpReg, markWord::monitor_value);                          // inflated vs stack-locked|neutral
  jcc(Assembler::notZero, IsInflated);

  if (LockingMode == LM_MONITOR) {
    // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
    testptr(objReg, objReg);
  } else {
    assert(LockingMode == LM_LEGACY, "must be");
    // Attempt stack-locking ...
    orptr (tmpReg, markWord::unlocked_value);
    movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
    lock();
    cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Updates tmpReg
    jcc(Assembler::equal, COUNT);                // Success

    // Recursive locking.
    // The object is stack-locked: the markword contains a stack pointer to a BasicLock.
    // Locked by the current thread if the difference with the current SP is less than one page.
    subptr(tmpReg, rsp);
    // The next instruction sets ZFlag == 1 (Success) if the difference is less than one page.
    andptr(tmpReg, (int32_t) (7 - (int)os::vm_page_size()) );
    movptr(Address(boxReg, 0), tmpReg);
  }
  jmp(DONE_LABEL);

  bind(IsInflated);
  // The object is inflated. tmpReg contains the ObjectMonitor* + markWord::monitor_value

  // Unconditionally set box->_displaced_header = markWord::unused_mark().
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));

  // It's inflated and we use scrReg for ObjectMonitor* in this section.
  movptr(boxReg, Address(r15_thread, JavaThread::monitor_owner_id_offset()));
  movq(scrReg, tmpReg);
  xorq(tmpReg, tmpReg);
  lock();
  cmpxchgptr(boxReg, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));

  // Propagate ICC.ZF from CAS above into DONE_LABEL.
  jccb(Assembler::equal, COUNT);          // CAS above succeeded; propagate ZF = 1 (success)

  cmpptr(boxReg, rax);                    // Check if we are already the owner (recursive lock)
  jccb(Assembler::notEqual, NO_COUNT);    // If not recursive, ZF = 0 at this point (fail)
  incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
  bind(DONE_LABEL);

  // ZFlag == 1 count in fast path
  // ZFlag == 0 count in slow path
  jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0

  bind(COUNT);
  if (LockingMode == LM_LEGACY) {
    // Count monitors in fast path
    increment(Address(thread, JavaThread::held_monitor_count_offset()));
  }
  xorl(tmpReg, tmpReg); // Set ZF == 1

  bind(NO_COUNT);

  // At NO_COUNT the icc ZFlag is set as follows ...
  // fast_unlock uses the same protocol.
  // ZFlag == 1 -> Success
  // ZFlag == 0 -> Failure - force control through the slow path
}

// obj: object to unlock
// box: box address (displaced header location), killed. Must be EAX.
// tmp: killed, cannot be obj nor box.
//
// Some commentary on balanced locking:
//
// fast_lock and fast_unlock are emitted only for provably balanced lock sites.
// Methods that don't have provably balanced locking are forced to run in the
// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
// The interpreter provides two properties:
// I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired by the current activation (frame). Recall that the
//      interpreter maintains an on-stack list of locks currently held by
//      a frame.
// I2:  If a method attempts to unlock an object that is not held by the
//      frame the interpreter throws IMSX.
//
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
// B() doesn't have provably balanced locking so it runs in the interpreter.
// Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
// is still locked by A().
//
// The only other source of unbalanced locking would be JNI. The "Java Native Interface:
// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
// should not be unlocked by "normal" java-level locking and vice-versa. The specification
// doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably, given that the spec legislates the JNI case as undefined, our implementation
// could reasonably *avoid* checking owner in fast_unlock().
// In the interest of performance we elide the m->Owner == Self check in unlock.
// A perfectly viable alternative is to elide the owner check except when
// Xcheck:jni is enabled.
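
// A concrete (pseudo-Java) sketch of the balanced-locking property relied
// on above:
//
//   void a() {
//     synchronized (o) {   // provably balanced -> compiled, uses fast_lock
//       b();               // not provably balanced -> runs interpreted; any
//     }                    // locks b() acquires are released again by the
//   }                      // interpreter before it returns (I1/I2), so o is
//                          // still locked by a() when fast_unlock runs here.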

void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg) {
  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
  assert(boxReg == rax, "");
  assert_different_registers(objReg, boxReg, tmpReg);

  Label DONE_LABEL, Stacked, COUNT, NO_COUNT;

  if (LockingMode == LM_LEGACY) {
    cmpptr(Address(boxReg, 0), NULL_WORD);                            // Examine the displaced header
    jcc   (Assembler::zero, COUNT);                                   // 0 indicates recursive stack-lock
  }
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // Examine the object's markword
  if (LockingMode != LM_MONITOR) {
    testptr(tmpReg, markWord::monitor_value);                         // Inflated?
    jcc(Assembler::zero, Stacked);
  }

  // It's inflated.

  // Despite the balanced locking property we still check that m->_owner == Self,
  // as Java routines or native JNI code called by this thread might
  // have released the lock.
  //
  // If there's no contention try a 1-0 exit. That is, exit without
  // a costly MEMBAR or CAS. See synchronizer.cpp for details on how
  // we detect and recover from the race that the 1-0 exit admits.
  //
  // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
  // before it STs null into _owner, releasing the lock. Updates
  // to data protected by the critical section must be visible before
  // we drop the lock (and thus before any other thread could acquire
  // the lock and observe the fields protected by the lock).
  // IA32's memory-model is SPO, so STs are ordered with respect to
  // each other and there's no need for an explicit barrier (fence).
  // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
  Label LSuccess, LNotRecursive;

  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
  jccb(Assembler::equal, LNotRecursive);

  // Recursive inflated unlock
  decrement(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jmpb(LSuccess);

  bind(LNotRecursive);

  // Set owner to null.
  // Release to satisfy the JMM
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
  // We need a full fence after clearing owner to avoid stranding.
  // StoreLoad achieves this.
  membar(StoreLoad);

  // Check if the entry_list is empty.
  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(entry_list)), NULL_WORD);
  jccb(Assembler::zero, LSuccess);    // If so we are done.

  // Check if there is a successor.
  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
  jccb(Assembler::notZero, LSuccess); // If so we are done.

  // Save the monitor pointer in the current thread, so we can try to
  // reacquire the lock in SharedRuntime::monitor_exit_helper().
  andptr(tmpReg, ~(int32_t)markWord::monitor_value);
  movptr(Address(r15_thread, JavaThread::unlocked_inflated_monitor_offset()), tmpReg);

  orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
  jmpb  (DONE_LABEL);

  bind  (LSuccess);
  testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
  jmpb  (DONE_LABEL);

  if (LockingMode == LM_LEGACY) {
    bind  (Stacked);
    movptr(tmpReg, Address (boxReg, 0));  // re-fetch
    lock();
    cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
    // Intentional fall-thru into DONE_LABEL
  }

  bind(DONE_LABEL);

  // ZFlag == 1 count in fast path
  // ZFlag == 0 count in slow path
  jccb(Assembler::notZero, NO_COUNT);

  bind(COUNT);

  if (LockingMode == LM_LEGACY) {
    // Count monitors in fast path
    decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
  }

  xorl(tmpReg, tmpReg); // Set ZF == 1

  bind(NO_COUNT);
}

void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register rax_reg,
                                              Register t, Register thread) {
  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert(rax_reg == rax, "Used for CAS");
  assert_different_registers(obj, box, rax_reg, t, thread);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. ZF value is irrelevant.
  Label locked;
  // Finish fast lock unsuccessfully. MUST jump with ZF == 0
  Label slow_path;

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds.
    movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0);
  }

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(rax_reg, obj, t);
    testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
    jcc(Assembler::notZero, slow_path);
  }

  const Register mark = t;

  { // Lightweight Lock

    Label push;

    const Register top = UseObjectMonitorTable ? rax_reg : box;

    // Load the mark.
    movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Prefetch top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    // Check for monitor (0b10).
    testptr(mark, markWord::monitor_value);
    jcc(Assembler::notZero, inflated);

    // Check if lock-stack is full.
    cmpl(top, LockStack::end_offset() - 1);
    jcc(Assembler::greater, slow_path);

    // Check if recursive.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    jccb(Assembler::equal, push);

    // Try to lock. Transition lock bits 0b01 => 0b00
    movptr(rax_reg, mark);
    orptr(rax_reg, markWord::unlocked_value);
    andptr(mark, ~(int32_t)markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    jcc(Assembler::notEqual, slow_path);

    if (UseObjectMonitorTable) {
      // Need to reload top, clobbered by CAS.
      movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
    }
    bind(push);
    // After successful lock, push object on lock-stack.
    movptr(Address(thread, top), obj);
    addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
    jmpb(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    const Register monitor = t;

    if (!UseObjectMonitorTable) {
      assert(mark == monitor, "should be the same here");
    } else {
      // Uses ObjectMonitorTable. Look for the monitor in the om_cache.
      // Fetch ObjectMonitor* from the cache or take the slow-path.
      Label monitor_found;

      // Load cache address
      lea(t, Address(thread, JavaThread::om_cache_oops_offset()));

      const int num_unrolled = 2;
      for (int i = 0; i < num_unrolled; i++) {
        cmpptr(obj, Address(t));
        jccb(Assembler::equal, monitor_found);
        increment(t, in_bytes(OMCache::oop_to_oop_difference()));
      }

      Label loop;

      // Search for obj in cache.
      bind(loop);

      // Check for match.
      cmpptr(obj, Address(t));
      jccb(Assembler::equal, monitor_found);

      // Search until null encountered, guaranteed _null_sentinel at end.
      cmpptr(Address(t), 1);
      jcc(Assembler::below, slow_path); // 0 check, but with ZF=0 when *t == 0
      increment(t, in_bytes(OMCache::oop_to_oop_difference()));
      jmpb(loop);

      // Cache hit.
      bind(monitor_found);
      movptr(monitor, Address(t, OMCache::oop_to_monitor_difference()));
    }
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag);
    const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag);

    Label monitor_locked;
    // Lock the monitor.

    if (UseObjectMonitorTable) {
      // Cache the monitor for unlock before trashing box. On failure to acquire
      // the lock, the slow path will reset the entry accordingly (see CacheSetter).
      movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor);
    }

    // Try to CAS owner (no owner => current thread's _monitor_owner_id).
    xorptr(rax_reg, rax_reg);
    movptr(box, Address(thread, JavaThread::monitor_owner_id_offset()));
    lock(); cmpxchgptr(box, owner_address);
    jccb(Assembler::equal, monitor_locked);

    // Check if recursive.
    cmpptr(box, rax_reg);
    jccb(Assembler::notEqual, slow_path);

    // Recursive.
    increment(recursions_address);

    bind(monitor_locked);
  }

  bind(locked);
  // Set ZF = 1
  xorl(rax_reg, rax_reg);

#ifdef ASSERT
  // Check that locked label is reached with ZF set.
  Label zf_correct;
  Label zf_bad_zero;
  jcc(Assembler::zero, zf_correct);
  jmp(zf_bad_zero);
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with ZF not set.
  jcc(Assembler::notZero, zf_correct);
  stop("Fast Lock ZF != 0");
  bind(zf_bad_zero);
  stop("Fast Lock ZF != 1");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}

void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register reg_rax, Register t, Register thread) {
  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert(reg_rax == rax, "Used for CAS");
  assert_different_registers(obj, reg_rax, t);

  // Handle inflated monitor.
  Label inflated, inflated_check_lock_stack;
  // Finish fast unlock successfully. MUST jump with ZF == 1
  Label unlocked, slow_path;

  const Register mark = t;
  const Register monitor = t;
  const Register top = UseObjectMonitorTable ? t : reg_rax;
  const Register box = reg_rax;

  Label dummy;
  C2FastUnlockLightweightStub* stub = nullptr;

  if (!Compile::current()->output()->in_scratch_emit_size()) {
    stub = new (Compile::current()->comp_arena()) C2FastUnlockLightweightStub(obj, mark, reg_rax, thread);
    Compile::current()->output()->add_stub(stub);
  }

  Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();

  { // Lightweight Unlock

    // Load top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    if (!UseObjectMonitorTable) {
      // Prefetch mark.
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }

    // Check if obj is top of lock-stack.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    // Top of lock stack was not obj. Must be monitor.
    jcc(Assembler::notEqual, inflated_check_lock_stack);

    // Pop lock-stack.
    DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
    subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);

    // Check if recursive.
    cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
    jcc(Assembler::equal, unlocked);

    // We elide the monitor check, let the CAS fail instead.

    if (UseObjectMonitorTable) {
      // Load mark.
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }

    // Try to unlock. Transition lock bits 0b00 => 0b01
    movptr(reg_rax, mark);
    andptr(reg_rax, ~(int32_t)markWord::lock_mask);
    orptr(mark, markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    jcc(Assembler::notEqual, push_and_slow_path);
    jmp(unlocked);
  }


  { // Handle inflated monitor.
    bind(inflated_check_lock_stack);
#ifdef ASSERT
    Label check_done;
    subl(top, oopSize);
    cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
    jcc(Assembler::below, check_done);
    cmpptr(obj, Address(thread, top));
    jccb(Assembler::notEqual, inflated_check_lock_stack);
    stop("Fast Unlock lock on stack");
    bind(check_done);
    if (UseObjectMonitorTable) {
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }
    testptr(mark, markWord::monitor_value);
    jccb(Assembler::notZero, inflated);
    stop("Fast Unlock not monitor");
#endif

    bind(inflated);

    if (!UseObjectMonitorTable) {
      assert(mark == monitor, "should be the same here");
    } else {
      // Uses ObjectMonitorTable. Look for the monitor in our BasicLock on the stack.
      movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
      // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*)
      cmpptr(monitor, alignof(ObjectMonitor*));
      jcc(Assembler::below, slow_path);
    }
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag};
    const Address succ_address{monitor, ObjectMonitor::succ_offset() - monitor_tag};
    const Address entry_list_address{monitor, ObjectMonitor::entry_list_offset() - monitor_tag};
    const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag};

    Label recursive;

    // Check if recursive.
    cmpptr(recursions_address, 0);
    jccb(Assembler::notZero, recursive);

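    // Rough sketch of the 1-0 exit that follows (see also synchronizer.cpp):
    // ownership is dropped first and only then do we re-check for waiters.
    // The StoreLoad fence below keeps the loads of entry_list/succ from being
    // satisfied before the owner store is visible; without it a thread that
    // enqueued itself in that window could be left unwoken ("stranding").
    // If a waiter is present and no successor is active, the monitor is
    // recorded in the thread and we take the slow path so the runtime can
    // resolve the race.
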
    // Set owner to null.
    // Release to satisfy the JMM
    movptr(owner_address, NULL_WORD);
    // We need a full fence after clearing owner to avoid stranding.
    // StoreLoad achieves this.
    membar(StoreLoad);

    // Check if the entry_list is empty.
    cmpptr(entry_list_address, NULL_WORD);
    jccb(Assembler::zero, unlocked);    // If so we are done.

    // Check if there is a successor.
    cmpptr(succ_address, NULL_WORD);
    jccb(Assembler::notZero, unlocked); // If so we are done.

    // Save the monitor pointer in the current thread, so we can try to
    // reacquire the lock in SharedRuntime::monitor_exit_helper().
    if (!UseObjectMonitorTable) {
      andptr(monitor, ~(int32_t)markWord::monitor_value);
    }
    movptr(Address(thread, JavaThread::unlocked_inflated_monitor_offset()), monitor);

    orl(t, 1); // Fast Unlock ZF = 0
    jmpb(slow_path);

    // Recursive unlock.
    bind(recursive);
    decrement(recursions_address);
  }

  bind(unlocked);
  xorl(t, t); // Fast Unlock ZF = 1

#ifdef ASSERT
  // Check that unlocked label is reached with ZF set.
  Label zf_correct;
  Label zf_bad_zero;
  jcc(Assembler::zero, zf_correct);
  jmp(zf_bad_zero);
#endif

  bind(slow_path);
  if (stub != nullptr) {
    bind(stub->slow_path_continuation());
  }
#ifdef ASSERT
  // Check that stub->continuation() label is reached with ZF not set.
  jcc(Assembler::notZero, zf_correct);
  stop("Fast Unlock ZF != 0");
  bind(zf_bad_zero);
  stop("Fast Unlock ZF != 1");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}

//-------------------------------------------------------------------------------------------
// Generic instructions support for use in .ad files C2 code generation

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVD) {
    andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
  }
}

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
  if (opcode == Op_AbsVD) {
    vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
    vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVF) {
    andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
  if (opcode == Op_AbsVF) {
    vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
    vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
  }
}

void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
  assert(tmp == xnoreg || elem_bt == T_LONG, "unused");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      pminsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pminsw(dst, src);
    } else if (elem_bt == T_INT) {
      pminsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, dst);
      pcmpgtq(xmm0, src);
      blendvpd(dst, src); // xmm0 as mask
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      pmaxsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pmaxsw(dst, src);
    } else if (elem_bt == T_INT) {
      pmaxsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, src);
      pcmpgtq(xmm0, dst);
      blendvpd(dst, src); // xmm0 as mask
    }
  }
}

void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
                                  XMMRegister src1, Address src2, int vlen_enc) {
  assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
  if (opcode == Op_UMinV) {
    switch(elem_bt) {
      case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
      case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
      case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
      case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
      default: fatal("Unsupported type %s", type2name(elem_bt)); break;
    }
  } else {
    assert(opcode == Op_UMaxV, "required");
    switch(elem_bt) {
      case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
      case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
      case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
      case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
      default: fatal("Unsupported type %s", type2name(elem_bt)); break;
    }
  }
}

void C2_MacroAssembler::vpuminmaxq(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  // For optimality, leverage a full vector width of 512 bits
  // for operations over smaller vector sizes on AVX512 targets.
  if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
    if (opcode == Op_UMaxV) {
      evpmaxuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
    } else {
      assert(opcode == Op_UMinV, "required");
      evpminuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
    }
  } else {
    // T1 = -1
    vpcmpeqq(xtmp1, xtmp1, xtmp1, vlen_enc);
    // T1 = -1 << 63
    vpsllq(xtmp1, xtmp1, 63, vlen_enc);
    // Convert SRC2 to signed value i.e. T2 = T1 + SRC2
    vpaddq(xtmp2, xtmp1, src2, vlen_enc);
    // Convert SRC1 to signed value i.e. T1 = T1 + SRC1
    vpaddq(xtmp1, xtmp1, src1, vlen_enc);
    // Mask = T2 > T1
    vpcmpgtq(xtmp1, xtmp2, xtmp1, vlen_enc);
    if (opcode == Op_UMaxV) {
      // Res = Mask ? Src2 : Src1
      vpblendvb(dst, src1, src2, xtmp1, vlen_enc);
    } else {
      // Res = Mask ? Src1 : Src2
      vpblendvb(dst, src2, src1, xtmp1, vlen_enc);
    }
  }
}

void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
                                  XMMRegister src1, XMMRegister src2, int vlen_enc) {
  assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
  if (opcode == Op_UMinV) {
    switch(elem_bt) {
      case T_BYTE:  vpminub(dst, src1, src2, vlen_enc); break;
      case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
      case T_INT:   vpminud(dst, src1, src2, vlen_enc); break;
      case T_LONG:  evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
      default: fatal("Unsupported type %s", type2name(elem_bt)); break;
    }
  } else {
    assert(opcode == Op_UMaxV, "required");
    switch(elem_bt) {
      case T_BYTE:  vpmaxub(dst, src1, src2, vlen_enc); break;
      case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
      case T_INT:   vpmaxud(dst, src1, src2, vlen_enc); break;
      case T_LONG:  evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
      default: fatal("Unsupported type %s", type2name(elem_bt)); break;
    }
  }
}

void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
                                 XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                 int vlen_enc) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      vpminsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpminsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpminsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpminsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src1, src2, dst, vlen_enc);
      }
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      vpmaxsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpmaxsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpmaxsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpmaxsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src2, src1, dst, vlen_enc);
      }
    }
  }
}

// Float/Double min max

void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
                                   XMMRegister dst, XMMRegister a, XMMRegister b,
                                   XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                   int vlen_enc) {
  assert(UseAVX > 0, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(a, tmp, atmp, btmp);
  assert_different_registers(b, tmp, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);

  /* Note on 'non-obvious' assembly sequence:
   *
   * While there are vminps/vmaxps instructions, there are two important differences between hardware
   * and Java on how they handle floats:
   *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
   *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
   *
   * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
   *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
   *     (only useful when signs differ, noop otherwise)
   *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
   *
   * The following pseudo code describes the algorithm for max[FD] (the min algorithm is along similar lines):
   *   btmp = (b < +0.0) ? a : b
   *   atmp = (b < +0.0) ? b : a
   *   Tmp  = Max_Float(atmp, btmp)
   *   Res  = (atmp == NaN) ? atmp : Tmp
   */

  void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
  void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
  void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
  XMMRegister mask;

  if (!is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vminps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (!is_double_word && !is_min) {
    mask = b;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vmaxps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vminpd;
    vcmp = &MacroAssembler::vcmppd;
  } else {
    assert(is_double_word && !is_min, "sanity");
    mask = b;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vmaxpd;
    vcmp = &MacroAssembler::vcmppd;
  }

  // Make sure EnableX86ECoreOpts isn't disabled on register overlaps
  XMMRegister maxmin, scratch;
  if (dst == btmp) {
    maxmin = btmp;
    scratch = tmp;
  } else {
    maxmin = tmp;
    scratch = btmp;
  }

  bool precompute_mask = EnableX86ECoreOpts && UseAVX > 1;
  if (precompute_mask && !is_double_word) {
    vpsrad(tmp, mask, 32, vlen_enc);
    mask = tmp;
  } else if (precompute_mask && is_double_word) {
    vpxor(tmp, tmp, tmp, vlen_enc);
    vpcmpgtq(tmp, tmp, mask, vlen_enc);
    mask = tmp;
  }

  (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
  (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
  (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
  (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
  (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
}

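// Scalar reference for the semantics the helpers above and below must match
// (a sketch of Java's Math.max; min is symmetric; isNegativeZero is pseudo-code):
//
//   max(a, b):
//     if (isNaN(a) || isNaN(b))   return NaN;                        // NaN always propagates
//     if (a == 0.0 && b == 0.0)   return isNegativeZero(a) ? b : a;  // +0.0 beats -0.0
//     return (a > b) ? a : b;
//
// Plain vmaxps/vmaxpd returns the second operand in both the NaN and the
// (+0.0, -0.0) cases, which is why the sequences here bias the operands with
// blends and re-check the first operand for NaN afterwards.
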
void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
                                    XMMRegister dst, XMMRegister a, XMMRegister b,
                                    KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
                                    int vlen_enc) {
  assert(UseAVX > 2, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(dst, a, atmp, btmp);
  assert_different_registers(dst, b, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);
  bool merge = true;

  if (!is_double_word && is_min) {
    evpmovd2m(ktmp, a, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vminps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (!is_double_word && !is_min) {
    evpmovd2m(ktmp, b, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (is_double_word && is_min) {
    evpmovq2m(ktmp, a, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vminpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    evpmovq2m(ktmp, b, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  }
}

// Float/Double signum
void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
  assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");

  Label DONE_LABEL;

  if (opcode == Op_SignumF) {
    assert(UseSSE > 0, "required");
    ucomiss(dst, zero);
    jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
    jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
    movflt(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
  } else if (opcode == Op_SignumD) {
    assert(UseSSE > 1, "required");
    ucomisd(dst, zero);
    jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
    jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
    movdbl(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
  }

  bind(DONE_LABEL);
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
  if (sign) {
    pmovsxbw(dst, src);
  } else {
    pmovzxbw(dst, src);
  }
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbw(dst, src, vector_len);
  } else {
    vpmovzxbw(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbd(dst, src, vector_len);
  } else {
    vpmovzxbd(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxwd(dst, src, vector_len);
  } else {
    vpmovzxwd(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
                                     int shift, int vector_len) {
  if (opcode == Op_RotateLeftV) {
    if (etype == T_INT) {
      evprold(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprolq(dst, src, shift, vector_len);
    }
  } else {
    assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
    if (etype == T_INT) {
      evprord(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprorq(dst, src, shift, vector_len);
    }
  }
}

void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
                                     XMMRegister shift, int vector_len) {
  if (opcode == Op_RotateLeftV) {
    if (etype == T_INT) {
      evprolvd(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprolvq(dst, src, shift, vector_len);
    }
  } else {
    assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
    if (etype == T_INT) {
      evprorvd(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprorvq(dst, src, shift, vector_len);
    }
  }
}

void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
  if (opcode == Op_RShiftVI) {
    psrad(dst, shift);
  } else if (opcode == Op_LShiftVI) {
    pslld(dst, shift);
  } else {
    assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
    psrld(dst, shift);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVI:  psrad(dst, shift); break;
    case Op_LShiftVI:  pslld(dst, shift); break;
    case Op_URShiftVI: psrld(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
  if (opcode == Op_RShiftVI) {
    vpsrad(dst, nds, shift, vector_len);
  } else if (opcode == Op_LShiftVI) {
    vpslld(dst, nds, shift, vector_len);
  } else {
    assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
    vpsrld(dst, nds, shift, vector_len);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
    case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
    case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  psraw(dst, shift); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  psllw(dst, shift); break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: psrlw(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
    case Op_LShiftVL:  psllq(dst, shift); break;
    case Op_URShiftVL: psrlq(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
  if (opcode == Op_RShiftVL) {
    psrlq(dst, shift); // using srl to implement sra on pre-avx512 systems
  } else if (opcode == Op_LShiftVL) {
    psllq(dst, shift);
  } else {
    assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
    psrlq(dst, shift);
  }
}

void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVL:  evpsraq(dst, src, shift, vlen_enc); break;
    case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
    case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
  if (opcode == Op_RShiftVL) {
    evpsraq(dst, nds, shift, vector_len);
  } else if (opcode == Op_LShiftVL) {
    vpsllq(dst, nds, shift, vector_len);
  } else {
    assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
    vpsrlq(dst, nds, shift, vector_len);
  }
}

void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  // fall-through
    case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  // fall-through
    case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;

    case Op_URShiftVB: // fall-through
    case Op_URShiftVS: // fall-through
    case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;

    case Op_URShiftVB: // fall-through
    case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
  assert(UseAVX >= 2, "required");
  switch (opcode) {
    case Op_RShiftVL: {
      if (UseAVX > 2) {
        assert(tmp == xnoreg, "not used");
        if (!VM_Version::supports_avx512vl()) {
          vlen_enc = Assembler::AVX_512bit;
        }
        evpsravq(dst, src, shift, vlen_enc);
      } else {
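        // There is no variable arithmetic right shift for 64-bit lanes below
        // AVX-512, so emulate it with logical shifts. Sketch of the identity
        // used here (assuming vector_long_sign_mask holds 0x8000000000000000
        // in every lane): with t = srl(0x8000000000000000, s), sign-extending
        // the logically shifted value gives
        //   sra(x, s) == (srl(x, s) ^ t) - t
        // which is exactly the xor/subtract pair emitted below.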
        vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
        vpsrlvq(dst, src, shift, vlen_enc);
        vpsrlvq(tmp, tmp, shift, vlen_enc);
        vpxor(dst, dst, tmp, vlen_enc);
        vpsubq(dst, dst, tmp, vlen_enc);
      }
      break;
    }
    case Op_LShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsllvq(dst, src, shift, vlen_enc);
      break;
    }
    case Op_URShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsrlvq(dst, src, shift, vlen_enc);
      break;
    }
    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

// Variable shift src by shift using vtmp as a TEMP, giving word result in dst
void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
  assert(opcode == Op_LShiftVB ||
         opcode == Op_RShiftVB ||
         opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
  bool sign = (opcode != Op_URShiftVB);
  assert(vector_len == 0, "required");
  vextendbd(sign, dst, src, 1);
  vpmovzxbd(vtmp, shift, 1);
  varshiftd(opcode, dst, dst, vtmp, 1);
  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
  vextracti128_high(vtmp, dst);
  vpackusdw(dst, dst, vtmp, 0);
}

// Variable shift src by shift using vtmp as a TEMP, giving byte result in dst
void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
  assert(opcode == Op_LShiftVB ||
         opcode == Op_RShiftVB ||
         opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
  bool sign = (opcode != Op_URShiftVB);
  int ext_vector_len = vector_len + 1;
  vextendbw(sign, dst, src, ext_vector_len);
  vpmovzxbw(vtmp, shift, ext_vector_len);
  varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
  if (vector_len == 0) {
    vextracti128_high(vtmp, dst);
    vpackuswb(dst, dst, vtmp, vector_len);
  } else {
    vextracti64x4_high(vtmp, dst);
    vpackuswb(dst, dst, vtmp, vector_len);
    vpermq(dst, dst, 0xD8, vector_len);
  }
}

void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
  switch(typ) {
    case T_BYTE:
      pinsrb(dst, val, idx);
      break;
    case T_SHORT:
      pinsrw(dst, val, idx);
      break;
    case T_INT:
      pinsrd(dst, val, idx);
      break;
    case T_LONG:
      pinsrq(dst, val, idx);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}

void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
  switch(typ) {
    case T_BYTE:
      vpinsrb(dst, src, val, idx);
      break;
    case T_SHORT:
      vpinsrw(dst, src, val, idx);
      break;
    case T_INT:
      vpinsrd(dst, src, val, idx);
      break;
    case T_LONG:
      vpinsrq(dst, src, val, idx);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}

void C2_MacroAssembler::vgather8b_masked_offset(BasicType elem_bt,
                                                XMMRegister dst, Register base,
                                                Register idx_base,
                                                Register offset, Register mask,
                                                Register mask_idx, Register rtmp,
                                                int vlen_enc) {
  vpxor(dst, dst, dst, vlen_enc);
  if (elem_bt == T_SHORT) {
    for (int i = 0; i < 4; i++) {
      // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0
src[offset + idx_base[i]] : 0 1493 Label skip_load; 1494 btq(mask, mask_idx); 1495 jccb(Assembler::carryClear, skip_load); 1496 movl(rtmp, Address(idx_base, i * 4)); 1497 if (offset != noreg) { 1498 addl(rtmp, offset); 1499 } 1500 pinsrw(dst, Address(base, rtmp, Address::times_2), i); 1501 bind(skip_load); 1502 incq(mask_idx); 1503 } 1504 } else { 1505 assert(elem_bt == T_BYTE, ""); 1506 for (int i = 0; i < 8; i++) { 1507 // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0 1508 Label skip_load; 1509 btq(mask, mask_idx); 1510 jccb(Assembler::carryClear, skip_load); 1511 movl(rtmp, Address(idx_base, i * 4)); 1512 if (offset != noreg) { 1513 addl(rtmp, offset); 1514 } 1515 pinsrb(dst, Address(base, rtmp), i); 1516 bind(skip_load); 1517 incq(mask_idx); 1518 } 1519 } 1520 } 1521 1522 void C2_MacroAssembler::vgather8b_offset(BasicType elem_bt, XMMRegister dst, 1523 Register base, Register idx_base, 1524 Register offset, Register rtmp, 1525 int vlen_enc) { 1526 vpxor(dst, dst, dst, vlen_enc); 1527 if (elem_bt == T_SHORT) { 1528 for (int i = 0; i < 4; i++) { 1529 // dst[i] = src[offset + idx_base[i]] 1530 movl(rtmp, Address(idx_base, i * 4)); 1531 if (offset != noreg) { 1532 addl(rtmp, offset); 1533 } 1534 pinsrw(dst, Address(base, rtmp, Address::times_2), i); 1535 } 1536 } else { 1537 assert(elem_bt == T_BYTE, ""); 1538 for (int i = 0; i < 8; i++) { 1539 // dst[i] = src[offset + idx_base[i]] 1540 movl(rtmp, Address(idx_base, i * 4)); 1541 if (offset != noreg) { 1542 addl(rtmp, offset); 1543 } 1544 pinsrb(dst, Address(base, rtmp), i); 1545 } 1546 } 1547 } 1548 1549 /* 1550 * Gather using hybrid algorithm, first partially unroll scalar loop 1551 * to accumulate values from gather indices into a quad-word(64bit) slice. 1552 * A slice may hold 8 bytes or 4 short values. This is followed by a vector 1553 * permutation to place the slice into appropriate vector lane 1554 * locations in destination vector. Following pseudo code describes the 1555 * algorithm in detail: 1556 * 1557 * DST_VEC = ZERO_VEC 1558 * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..} 1559 * TWO_VEC = {2, 2, 2, 2, 2, 2, 2, 2, 2..} 1560 * FOREACH_ITER: 1561 * TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES 1562 * TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX 1563 * DST_VEC = DST_VEC OR TEMP_PERM_VEC 1564 * PERM_INDEX = PERM_INDEX - TWO_VEC 1565 * 1566 * With each iteration, doubleword permute indices (0,1) corresponding 1567 * to gathered quadword gets right shifted by two lane positions. 
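 *
 * For illustration (gathering shorts into a 256-bit destination): iteration 1
 * permutes with PERM_INDEX = {0, 1, ...}, so the fresh 64-bit slice stays in
 * doubleword lanes {0, 1}; PERM_INDEX then becomes {-2, -1, 0, 1, ...}, and
 * since vpermd only uses the low index bits, iteration 2 routes its slice
 * into lanes {2, 3}, iteration 3 into lanes {4, 5}, and so on. OR-ing the
 * permuted slices therefore assembles DST_VEC from the low lanes upward.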
1568 * 1569 */ 1570 void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst, 1571 Register base, Register idx_base, 1572 Register offset, Register mask, 1573 XMMRegister xtmp1, XMMRegister xtmp2, 1574 XMMRegister temp_dst, Register rtmp, 1575 Register mask_idx, Register length, 1576 int vector_len, int vlen_enc) { 1577 Label GATHER8_LOOP; 1578 assert(is_subword_type(elem_ty), ""); 1579 movl(length, vector_len); 1580 vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...} 1581 vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...} 1582 vallones(xtmp2, vlen_enc); 1583 vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc); 1584 vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...} 1585 load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...} 1586 1587 bind(GATHER8_LOOP); 1588 // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES 1589 if (mask == noreg) { 1590 vgather8b_offset(elem_ty, temp_dst, base, idx_base, offset, rtmp, vlen_enc); 1591 } else { 1592 vgather8b_masked_offset(elem_ty, temp_dst, base, idx_base, offset, mask, mask_idx, rtmp, vlen_enc); 1593 } 1594 // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1) 1595 vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? vlen_enc : Assembler::AVX_256bit); 1596 // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2) 1597 vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc); 1598 // DST_VEC = DST_VEC OR TEMP_PERM_VEC 1599 vpor(dst, dst, temp_dst, vlen_enc); 1600 addptr(idx_base, 32 >> (type2aelembytes(elem_ty) - 1)); 1601 subl(length, 8 >> (type2aelembytes(elem_ty) - 1)); 1602 jcc(Assembler::notEqual, GATHER8_LOOP); 1603 } 1604 1605 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) { 1606 switch(typ) { 1607 case T_INT: 1608 vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len); 1609 break; 1610 case T_FLOAT: 1611 vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len); 1612 break; 1613 case T_LONG: 1614 vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len); 1615 break; 1616 case T_DOUBLE: 1617 vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len); 1618 break; 1619 default: 1620 assert(false,"Should not reach here."); 1621 break; 1622 } 1623 } 1624 1625 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) { 1626 switch(typ) { 1627 case T_INT: 1628 evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len); 1629 break; 1630 case T_FLOAT: 1631 evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len); 1632 break; 1633 case T_LONG: 1634 evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len); 1635 break; 1636 case T_DOUBLE: 1637 evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len); 1638 break; 1639 default: 1640 assert(false,"Should not reach here."); 1641 break; 1642 } 1643 } 1644 1645 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) { 1646 switch(typ) { 1647 case T_INT: 1648 evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len); 1649 break; 1650 case T_FLOAT: 1651 evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len); 1652 break; 1653 case T_LONG: 1654 evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len); 1655 break; 1656 case T_DOUBLE: 1657 
evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len); 1658 break; 1659 default: 1660 assert(false,"Should not reach here."); 1661 break; 1662 } 1663 } 1664 1665 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) { 1666 if (vlen_in_bytes <= 16) { 1667 pxor (dst, dst); 1668 psubb(dst, src); 1669 switch (elem_bt) { 1670 case T_BYTE: /* nothing to do */ break; 1671 case T_SHORT: pmovsxbw(dst, dst); break; 1672 case T_INT: pmovsxbd(dst, dst); break; 1673 case T_FLOAT: pmovsxbd(dst, dst); break; 1674 case T_LONG: pmovsxbq(dst, dst); break; 1675 case T_DOUBLE: pmovsxbq(dst, dst); break; 1676 1677 default: assert(false, "%s", type2name(elem_bt)); 1678 } 1679 } else { 1680 assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, ""); 1681 int vlen_enc = vector_length_encoding(vlen_in_bytes); 1682 1683 vpxor (dst, dst, dst, vlen_enc); 1684 vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc); 1685 1686 switch (elem_bt) { 1687 case T_BYTE: /* nothing to do */ break; 1688 case T_SHORT: vpmovsxbw(dst, dst, vlen_enc); break; 1689 case T_INT: vpmovsxbd(dst, dst, vlen_enc); break; 1690 case T_FLOAT: vpmovsxbd(dst, dst, vlen_enc); break; 1691 case T_LONG: vpmovsxbq(dst, dst, vlen_enc); break; 1692 case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break; 1693 1694 default: assert(false, "%s", type2name(elem_bt)); 1695 } 1696 } 1697 } 1698 1699 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) { 1700 if (novlbwdq) { 1701 vpmovsxbd(xtmp, src, vlen_enc); 1702 evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()), 1703 Assembler::eq, true, vlen_enc, noreg); 1704 } else { 1705 vpxor(xtmp, xtmp, xtmp, vlen_enc); 1706 vpsubb(xtmp, xtmp, src, vlen_enc); 1707 evpmovb2m(dst, xtmp, vlen_enc); 1708 } 1709 } 1710 1711 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, Address src, int vlen_in_bytes) { 1712 if (is_integral_type(bt)) { 1713 switch (vlen_in_bytes) { 1714 case 4: movdl(dst, src); break; 1715 case 8: movq(dst, src); break; 1716 case 16: movdqu(dst, src); break; 1717 case 32: vmovdqu(dst, src); break; 1718 case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break; 1719 default: ShouldNotReachHere(); 1720 } 1721 } else { 1722 switch (vlen_in_bytes) { 1723 case 4: movflt(dst, src); break; 1724 case 8: movdbl(dst, src); break; 1725 case 16: movups(dst, src); break; 1726 case 32: vmovups(dst, src, Assembler::AVX_256bit); break; 1727 case 64: vmovups(dst, src, Assembler::AVX_512bit); break; 1728 default: ShouldNotReachHere(); 1729 } 1730 } 1731 } 1732 1733 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) { 1734 assert(rscratch != noreg || always_reachable(src), "missing"); 1735 1736 if (reachable(src)) { 1737 load_vector(bt, dst, as_Address(src), vlen_in_bytes); 1738 } else { 1739 lea(rscratch, src); 1740 load_vector(bt, dst, Address(rscratch, 0), vlen_in_bytes); 1741 } 1742 } 1743 1744 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) { 1745 int vlen_enc = vector_length_encoding(vlen); 1746 if (VM_Version::supports_avx()) { 1747 if (bt == T_LONG) { 1748 if (VM_Version::supports_avx2()) { 1749 vpbroadcastq(dst, src, vlen_enc); 1750 } else { 1751 vmovddup(dst, src, vlen_enc); 1752 } 1753 } else if (bt == T_DOUBLE) { 1754 if (vlen_enc != 
Assembler::AVX_128bit) { 1755 vbroadcastsd(dst, src, vlen_enc, noreg); 1756 } else { 1757 vmovddup(dst, src, vlen_enc); 1758 } 1759 } else { 1760 if (VM_Version::supports_avx2() && is_integral_type(bt)) { 1761 vpbroadcastd(dst, src, vlen_enc); 1762 } else { 1763 vbroadcastss(dst, src, vlen_enc); 1764 } 1765 } 1766 } else if (VM_Version::supports_sse3()) { 1767 movddup(dst, src); 1768 } else { 1769 load_vector(bt, dst, src, vlen); 1770 } 1771 } 1772 1773 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) { 1774 // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64. 1775 int offset = exact_log2(type2aelembytes(bt)) << 6; 1776 if (is_floating_point_type(bt)) { 1777 offset += 128; 1778 } 1779 ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset); 1780 load_vector(T_BYTE, dst, addr, vlen_in_bytes); 1781 } 1782 1783 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles. 1784 1785 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 1786 int vector_len = Assembler::AVX_128bit; 1787 1788 switch (opcode) { 1789 case Op_AndReductionV: pand(dst, src); break; 1790 case Op_OrReductionV: por (dst, src); break; 1791 case Op_XorReductionV: pxor(dst, src); break; 1792 case Op_MinReductionV: 1793 switch (typ) { 1794 case T_BYTE: pminsb(dst, src); break; 1795 case T_SHORT: pminsw(dst, src); break; 1796 case T_INT: pminsd(dst, src); break; 1797 case T_LONG: assert(UseAVX > 2, "required"); 1798 vpminsq(dst, dst, src, Assembler::AVX_128bit); break; 1799 default: assert(false, "wrong type"); 1800 } 1801 break; 1802 case Op_MaxReductionV: 1803 switch (typ) { 1804 case T_BYTE: pmaxsb(dst, src); break; 1805 case T_SHORT: pmaxsw(dst, src); break; 1806 case T_INT: pmaxsd(dst, src); break; 1807 case T_LONG: assert(UseAVX > 2, "required"); 1808 vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break; 1809 default: assert(false, "wrong type"); 1810 } 1811 break; 1812 case Op_AddReductionVF: addss(dst, src); break; 1813 case Op_AddReductionVD: addsd(dst, src); break; 1814 case Op_AddReductionVI: 1815 switch (typ) { 1816 case T_BYTE: paddb(dst, src); break; 1817 case T_SHORT: paddw(dst, src); break; 1818 case T_INT: paddd(dst, src); break; 1819 default: assert(false, "wrong type"); 1820 } 1821 break; 1822 case Op_AddReductionVL: paddq(dst, src); break; 1823 case Op_MulReductionVF: mulss(dst, src); break; 1824 case Op_MulReductionVD: mulsd(dst, src); break; 1825 case Op_MulReductionVI: 1826 switch (typ) { 1827 case T_SHORT: pmullw(dst, src); break; 1828 case T_INT: pmulld(dst, src); break; 1829 default: assert(false, "wrong type"); 1830 } 1831 break; 1832 case Op_MulReductionVL: assert(UseAVX > 2, "required"); 1833 evpmullq(dst, dst, src, vector_len); break; 1834 default: assert(false, "wrong opcode"); 1835 } 1836 } 1837 1838 void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 1839 switch (opcode) { 1840 case Op_AddReductionVF: addps(dst, src); break; 1841 case Op_AddReductionVD: addpd(dst, src); break; 1842 case Op_MulReductionVF: mulps(dst, src); break; 1843 case Op_MulReductionVD: mulpd(dst, src); break; 1844 default: assert(false, "%s", NodeClassNames[opcode]); 1845 } 1846 } 1847 1848 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 1849 int vector_len = Assembler::AVX_256bit; 1850 1851 switch (opcode) { 1852 
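    // Pairwise 256-bit combine used while folding a wide vector in half;
    // the remaining 128-bit and scalar steps are handled by the helpers above.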
case Op_AndReductionV: vpand(dst, src1, src2, vector_len); break; 1853 case Op_OrReductionV: vpor (dst, src1, src2, vector_len); break; 1854 case Op_XorReductionV: vpxor(dst, src1, src2, vector_len); break; 1855 case Op_MinReductionV: 1856 switch (typ) { 1857 case T_BYTE: vpminsb(dst, src1, src2, vector_len); break; 1858 case T_SHORT: vpminsw(dst, src1, src2, vector_len); break; 1859 case T_INT: vpminsd(dst, src1, src2, vector_len); break; 1860 case T_LONG: assert(UseAVX > 2, "required"); 1861 vpminsq(dst, src1, src2, vector_len); break; 1862 default: assert(false, "wrong type"); 1863 } 1864 break; 1865 case Op_MaxReductionV: 1866 switch (typ) { 1867 case T_BYTE: vpmaxsb(dst, src1, src2, vector_len); break; 1868 case T_SHORT: vpmaxsw(dst, src1, src2, vector_len); break; 1869 case T_INT: vpmaxsd(dst, src1, src2, vector_len); break; 1870 case T_LONG: assert(UseAVX > 2, "required"); 1871 vpmaxsq(dst, src1, src2, vector_len); break; 1872 default: assert(false, "wrong type"); 1873 } 1874 break; 1875 case Op_AddReductionVI: 1876 switch (typ) { 1877 case T_BYTE: vpaddb(dst, src1, src2, vector_len); break; 1878 case T_SHORT: vpaddw(dst, src1, src2, vector_len); break; 1879 case T_INT: vpaddd(dst, src1, src2, vector_len); break; 1880 default: assert(false, "wrong type"); 1881 } 1882 break; 1883 case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break; 1884 case Op_MulReductionVI: 1885 switch (typ) { 1886 case T_SHORT: vpmullw(dst, src1, src2, vector_len); break; 1887 case T_INT: vpmulld(dst, src1, src2, vector_len); break; 1888 default: assert(false, "wrong type"); 1889 } 1890 break; 1891 case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break; 1892 default: assert(false, "wrong opcode"); 1893 } 1894 } 1895 1896 void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 1897 int vector_len = Assembler::AVX_256bit; 1898 1899 switch (opcode) { 1900 case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break; 1901 case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break; 1902 case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break; 1903 case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break; 1904 default: assert(false, "%s", NodeClassNames[opcode]); 1905 } 1906 } 1907 1908 void C2_MacroAssembler::reduce_fp(int opcode, int vlen, 1909 XMMRegister dst, XMMRegister src, 1910 XMMRegister vtmp1, XMMRegister vtmp2) { 1911 switch (opcode) { 1912 case Op_AddReductionVF: 1913 case Op_MulReductionVF: 1914 reduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 1915 break; 1916 1917 case Op_AddReductionVD: 1918 case Op_MulReductionVD: 1919 reduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 1920 break; 1921 1922 default: assert(false, "wrong opcode"); 1923 } 1924 } 1925 1926 void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen, 1927 XMMRegister dst, XMMRegister src, 1928 XMMRegister vtmp1, XMMRegister vtmp2) { 1929 switch (opcode) { 1930 case Op_AddReductionVF: 1931 case Op_MulReductionVF: 1932 unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 1933 break; 1934 1935 case Op_AddReductionVD: 1936 case Op_MulReductionVD: 1937 unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 1938 break; 1939 1940 default: assert(false, "%s", NodeClassNames[opcode]); 1941 } 1942 } 1943 1944 void C2_MacroAssembler::reduceB(int opcode, int vlen, 1945 Register dst, Register src1, XMMRegister src2, 1946 XMMRegister vtmp1, XMMRegister vtmp2) { 1947 switch (vlen) { 1948 case 8: reduce8B 
(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1949 case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1950 case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1951 case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1952 1953 default: assert(false, "wrong vector length"); 1954 } 1955 } 1956 1957 void C2_MacroAssembler::mulreduceB(int opcode, int vlen, 1958 Register dst, Register src1, XMMRegister src2, 1959 XMMRegister vtmp1, XMMRegister vtmp2) { 1960 switch (vlen) { 1961 case 8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1962 case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1963 case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1964 case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1965 1966 default: assert(false, "wrong vector length"); 1967 } 1968 } 1969 1970 void C2_MacroAssembler::reduceS(int opcode, int vlen, 1971 Register dst, Register src1, XMMRegister src2, 1972 XMMRegister vtmp1, XMMRegister vtmp2) { 1973 switch (vlen) { 1974 case 4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1975 case 8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1976 case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1977 case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1978 1979 default: assert(false, "wrong vector length"); 1980 } 1981 } 1982 1983 void C2_MacroAssembler::reduceI(int opcode, int vlen, 1984 Register dst, Register src1, XMMRegister src2, 1985 XMMRegister vtmp1, XMMRegister vtmp2) { 1986 switch (vlen) { 1987 case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1988 case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1989 case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1990 case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1991 1992 default: assert(false, "wrong vector length"); 1993 } 1994 } 1995 1996 void C2_MacroAssembler::reduceL(int opcode, int vlen, 1997 Register dst, Register src1, XMMRegister src2, 1998 XMMRegister vtmp1, XMMRegister vtmp2) { 1999 switch (vlen) { 2000 case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2001 case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2002 case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2003 2004 default: assert(false, "wrong vector length"); 2005 } 2006 } 2007 2008 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2009 switch (vlen) { 2010 case 2: 2011 assert(vtmp2 == xnoreg, ""); 2012 reduce2F(opcode, dst, src, vtmp1); 2013 break; 2014 case 4: 2015 assert(vtmp2 == xnoreg, ""); 2016 reduce4F(opcode, dst, src, vtmp1); 2017 break; 2018 case 8: 2019 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2020 break; 2021 case 16: 2022 reduce16F(opcode, dst, src, vtmp1, vtmp2); 2023 break; 2024 default: assert(false, "wrong vector length"); 2025 } 2026 } 2027 2028 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2029 switch (vlen) { 2030 case 2: 2031 assert(vtmp2 == xnoreg, ""); 2032 reduce2D(opcode, dst, src, vtmp1); 2033 break; 2034 case 4: 2035 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2036 break; 2037 case 8: 2038 reduce8D(opcode, dst, src, vtmp1, vtmp2); 2039 break; 2040 default: assert(false, "wrong vector length"); 2041 } 2042 } 2043 2044 void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister 
src, XMMRegister vtmp1, XMMRegister vtmp2) { 2045 switch (vlen) { 2046 case 2: 2047 assert(vtmp1 == xnoreg, ""); 2048 assert(vtmp2 == xnoreg, ""); 2049 unorderedReduce2F(opcode, dst, src); 2050 break; 2051 case 4: 2052 assert(vtmp2 == xnoreg, ""); 2053 unorderedReduce4F(opcode, dst, src, vtmp1); 2054 break; 2055 case 8: 2056 unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2); 2057 break; 2058 case 16: 2059 unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2); 2060 break; 2061 default: assert(false, "wrong vector length"); 2062 } 2063 } 2064 2065 void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2066 switch (vlen) { 2067 case 2: 2068 assert(vtmp1 == xnoreg, ""); 2069 assert(vtmp2 == xnoreg, ""); 2070 unorderedReduce2D(opcode, dst, src); 2071 break; 2072 case 4: 2073 assert(vtmp2 == xnoreg, ""); 2074 unorderedReduce4D(opcode, dst, src, vtmp1); 2075 break; 2076 case 8: 2077 unorderedReduce8D(opcode, dst, src, vtmp1, vtmp2); 2078 break; 2079 default: assert(false, "wrong vector length"); 2080 } 2081 } 2082 2083 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2084 if (opcode == Op_AddReductionVI) { 2085 if (vtmp1 != src2) { 2086 movdqu(vtmp1, src2); 2087 } 2088 phaddd(vtmp1, vtmp1); 2089 } else { 2090 pshufd(vtmp1, src2, 0x1); 2091 reduce_operation_128(T_INT, opcode, vtmp1, src2); 2092 } 2093 movdl(vtmp2, src1); 2094 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2095 movdl(dst, vtmp1); 2096 } 2097 2098 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2099 if (opcode == Op_AddReductionVI) { 2100 if (vtmp1 != src2) { 2101 movdqu(vtmp1, src2); 2102 } 2103 phaddd(vtmp1, src2); 2104 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2105 } else { 2106 pshufd(vtmp2, src2, 0xE); 2107 reduce_operation_128(T_INT, opcode, vtmp2, src2); 2108 reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2109 } 2110 } 2111 2112 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2113 if (opcode == Op_AddReductionVI) { 2114 vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit); 2115 vextracti128_high(vtmp2, vtmp1); 2116 vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit); 2117 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2118 } else { 2119 vextracti128_high(vtmp1, src2); 2120 reduce_operation_128(T_INT, opcode, vtmp1, src2); 2121 reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2122 } 2123 } 2124 2125 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2126 vextracti64x4_high(vtmp2, src2); 2127 reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2); 2128 reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2129 } 2130 2131 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2132 pshufd(vtmp2, src2, 0x1); 2133 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2134 movdqu(vtmp1, vtmp2); 2135 psrldq(vtmp1, 2); 2136 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2137 movdqu(vtmp2, vtmp1); 2138 psrldq(vtmp2, 1); 2139 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2140 movdl(vtmp2, src1); 2141 pmovsxbd(vtmp1, vtmp1); 2142 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2143 pextrb(dst, vtmp1, 0x0); 2144 movsbl(dst, dst); 2145 
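  // pextrb zero-extends the reduced byte; the movsbl above turns it back into
  // the sign-extended 32-bit value expected for a signed byte reduction.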
} 2146 2147 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2148 pshufd(vtmp1, src2, 0xE); 2149 reduce_operation_128(T_BYTE, opcode, vtmp1, src2); 2150 reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2151 } 2152 2153 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2154 vextracti128_high(vtmp2, src2); 2155 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2156 reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2157 } 2158 2159 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2160 vextracti64x4_high(vtmp1, src2); 2161 reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2); 2162 reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2163 } 2164 2165 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2166 pmovsxbw(vtmp2, src2); 2167 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2168 } 2169 2170 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2171 if (UseAVX > 1) { 2172 int vector_len = Assembler::AVX_256bit; 2173 vpmovsxbw(vtmp1, src2, vector_len); 2174 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2175 } else { 2176 pmovsxbw(vtmp2, src2); 2177 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2178 pshufd(vtmp2, src2, 0x1); 2179 pmovsxbw(vtmp2, src2); 2180 reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2181 } 2182 } 2183 2184 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2185 if (UseAVX > 2 && VM_Version::supports_avx512bw()) { 2186 int vector_len = Assembler::AVX_512bit; 2187 vpmovsxbw(vtmp1, src2, vector_len); 2188 reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2189 } else { 2190 assert(UseAVX >= 2,"Should not reach here."); 2191 mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); 2192 vextracti128_high(vtmp2, src2); 2193 mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2194 } 2195 } 2196 2197 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2198 mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); 2199 vextracti64x4_high(vtmp2, src2); 2200 mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2201 } 2202 2203 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2204 if (opcode == Op_AddReductionVI) { 2205 if (vtmp1 != src2) { 2206 movdqu(vtmp1, src2); 2207 } 2208 phaddw(vtmp1, vtmp1); 2209 phaddw(vtmp1, vtmp1); 2210 } else { 2211 pshufd(vtmp2, src2, 0x1); 2212 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2213 movdqu(vtmp1, vtmp2); 2214 psrldq(vtmp1, 2); 2215 reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2); 2216 } 2217 movdl(vtmp2, src1); 2218 pmovsxwd(vtmp1, vtmp1); 2219 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2220 pextrw(dst, vtmp1, 0x0); 2221 movswl(dst, dst); 2222 } 2223 2224 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2225 if (opcode == Op_AddReductionVI) { 2226 if (vtmp1 != src2) { 2227 movdqu(vtmp1, src2); 2228 } 2229 phaddw(vtmp1, src2); 2230 } else { 2231 pshufd(vtmp1, src2, 0xE); 2232 
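    // 0xE (binary 00 00 11 10) copies the upper 64 bits of src2 into the low
    // half of vtmp1, so the packed op below folds the upper four shorts onto
    // the lower four before reduce4S finishes the reduction.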
reduce_operation_128(T_SHORT, opcode, vtmp1, src2); 2233 } 2234 reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2235 } 2236 2237 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2238 if (opcode == Op_AddReductionVI) { 2239 int vector_len = Assembler::AVX_256bit; 2240 vphaddw(vtmp2, src2, src2, vector_len); 2241 vpermq(vtmp2, vtmp2, 0xD8, vector_len); 2242 } else { 2243 vextracti128_high(vtmp2, src2); 2244 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2245 } 2246 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2247 } 2248 2249 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2250 int vector_len = Assembler::AVX_256bit; 2251 vextracti64x4_high(vtmp1, src2); 2252 reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2); 2253 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2254 } 2255 2256 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2257 pshufd(vtmp2, src2, 0xE); 2258 reduce_operation_128(T_LONG, opcode, vtmp2, src2); 2259 movdq(vtmp1, src1); 2260 reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2); 2261 movdq(dst, vtmp1); 2262 } 2263 2264 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2265 vextracti128_high(vtmp1, src2); 2266 reduce_operation_128(T_LONG, opcode, vtmp1, src2); 2267 reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2268 } 2269 2270 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2271 vextracti64x4_high(vtmp2, src2); 2272 reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2); 2273 reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2274 } 2275 2276 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) { 2277 mov64(temp, -1L); 2278 bzhiq(temp, temp, len); 2279 kmovql(dst, temp); 2280 } 2281 2282 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2283 reduce_operation_128(T_FLOAT, opcode, dst, src); 2284 pshufd(vtmp, src, 0x1); 2285 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2286 } 2287 2288 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2289 reduce2F(opcode, dst, src, vtmp); 2290 pshufd(vtmp, src, 0x2); 2291 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2292 pshufd(vtmp, src, 0x3); 2293 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2294 } 2295 2296 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2297 reduce4F(opcode, dst, src, vtmp2); 2298 vextractf128_high(vtmp2, src); 2299 reduce4F(opcode, dst, vtmp2, vtmp1); 2300 } 2301 2302 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2303 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2304 vextracti64x4_high(vtmp1, src); 2305 reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2); 2306 } 2307 2308 void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) { 2309 pshufd(dst, src, 0x1); 2310 reduce_operation_128(T_FLOAT, opcode, dst, src); 2311 } 2312 2313 void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2314 pshufd(vtmp, src, 0xE); 2315 
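  // Here 0xE brings floats {2, 3} down into the low half of vtmp; one packed
  // op then reduces four lanes to two and unorderedReduce2F folds the pair.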
unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src); 2316 unorderedReduce2F(opcode, dst, vtmp); 2317 } 2318 2319 void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2320 vextractf128_high(vtmp1, src); 2321 unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src); 2322 unorderedReduce4F(opcode, dst, vtmp1, vtmp2); 2323 } 2324 2325 void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2326 vextractf64x4_high(vtmp2, src); 2327 unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src); 2328 unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2); 2329 } 2330 2331 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2332 reduce_operation_128(T_DOUBLE, opcode, dst, src); 2333 pshufd(vtmp, src, 0xE); 2334 reduce_operation_128(T_DOUBLE, opcode, dst, vtmp); 2335 } 2336 2337 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2338 reduce2D(opcode, dst, src, vtmp2); 2339 vextractf128_high(vtmp2, src); 2340 reduce2D(opcode, dst, vtmp2, vtmp1); 2341 } 2342 2343 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2344 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2345 vextracti64x4_high(vtmp1, src); 2346 reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2); 2347 } 2348 2349 void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) { 2350 pshufd(dst, src, 0xE); 2351 reduce_operation_128(T_DOUBLE, opcode, dst, src); 2352 } 2353 2354 void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2355 vextractf128_high(vtmp, src); 2356 unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src); 2357 unorderedReduce2D(opcode, dst, vtmp); 2358 } 2359 2360 void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2361 vextractf64x4_high(vtmp2, src); 2362 unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src); 2363 unorderedReduce4D(opcode, dst, vtmp2, vtmp1); 2364 } 2365 2366 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) { 2367 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2368 } 2369 2370 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) { 2371 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2372 } 2373 2374 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len) { 2375 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2376 } 2377 2378 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask, 2379 int vec_enc) { 2380 switch(elem_bt) { 2381 case T_INT: 2382 case T_FLOAT: 2383 vmaskmovps(dst, src, mask, vec_enc); 2384 break; 2385 case T_LONG: 2386 case T_DOUBLE: 2387 vmaskmovpd(dst, src, mask, vec_enc); 2388 break; 2389 default: 2390 fatal("Unsupported type %s", type2name(elem_bt)); 2391 break; 2392 } 2393 } 2394 2395 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask, 2396 int vec_enc) { 2397 switch(elem_bt) { 2398 case T_INT: 2399 case T_FLOAT: 2400 
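      // 32-bit lanes (INT/FLOAT) take the dword-masked store form;
      // LONG/DOUBLE below use vmaskmovpd.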
vmaskmovps(dst, src, mask, vec_enc); 2401 break; 2402 case T_LONG: 2403 case T_DOUBLE: 2404 vmaskmovpd(dst, src, mask, vec_enc); 2405 break; 2406 default: 2407 fatal("Unsupported type %s", type2name(elem_bt)); 2408 break; 2409 } 2410 } 2411 2412 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid, 2413 XMMRegister dst, XMMRegister src, 2414 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2415 XMMRegister xmm_0, XMMRegister xmm_1) { 2416 const int permconst[] = {1, 14}; 2417 XMMRegister wsrc = src; 2418 XMMRegister wdst = xmm_0; 2419 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1; 2420 2421 int vlen_enc = Assembler::AVX_128bit; 2422 if (vlen == 16) { 2423 vlen_enc = Assembler::AVX_256bit; 2424 } 2425 2426 for (int i = log2(vlen) - 1; i >=0; i--) { 2427 if (i == 0 && !is_dst_valid) { 2428 wdst = dst; 2429 } 2430 if (i == 3) { 2431 vextracti64x4_high(wtmp, wsrc); 2432 } else if (i == 2) { 2433 vextracti128_high(wtmp, wsrc); 2434 } else { // i = [0,1] 2435 vpermilps(wtmp, wsrc, permconst[i], vlen_enc); 2436 } 2437 vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2438 wsrc = wdst; 2439 vlen_enc = Assembler::AVX_128bit; 2440 } 2441 if (is_dst_valid) { 2442 vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2443 } 2444 } 2445 2446 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src, 2447 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2448 XMMRegister xmm_0, XMMRegister xmm_1) { 2449 XMMRegister wsrc = src; 2450 XMMRegister wdst = xmm_0; 2451 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1; 2452 int vlen_enc = Assembler::AVX_128bit; 2453 if (vlen == 8) { 2454 vlen_enc = Assembler::AVX_256bit; 2455 } 2456 for (int i = log2(vlen) - 1; i >=0; i--) { 2457 if (i == 0 && !is_dst_valid) { 2458 wdst = dst; 2459 } 2460 if (i == 1) { 2461 vextracti128_high(wtmp, wsrc); 2462 } else if (i == 2) { 2463 vextracti64x4_high(wtmp, wsrc); 2464 } else { 2465 assert(i == 0, "%d", i); 2466 vpermilpd(wtmp, wsrc, 1, vlen_enc); 2467 } 2468 vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2469 wsrc = wdst; 2470 vlen_enc = Assembler::AVX_128bit; 2471 } 2472 if (is_dst_valid) { 2473 vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2474 } 2475 } 2476 2477 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) { 2478 switch (bt) { 2479 case T_BYTE: pextrb(dst, src, idx); break; 2480 case T_SHORT: pextrw(dst, src, idx); break; 2481 case T_INT: pextrd(dst, src, idx); break; 2482 case T_LONG: pextrq(dst, src, idx); break; 2483 2484 default: 2485 assert(false,"Should not reach here."); 2486 break; 2487 } 2488 } 2489 2490 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) { 2491 int esize = type2aelembytes(typ); 2492 int elem_per_lane = 16/esize; 2493 int lane = elemindex / elem_per_lane; 2494 int eindex = elemindex % elem_per_lane; 2495 2496 if (lane >= 2) { 2497 assert(UseAVX > 2, "required"); 2498 vextractf32x4(dst, src, lane & 3); 2499 return dst; 2500 } else if (lane > 0) { 2501 assert(UseAVX > 0, "required"); 2502 vextractf128(dst, src, lane); 2503 return dst; 2504 } else { 2505 return src; 2506 } 2507 } 2508 2509 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) { 2510 if (typ == T_BYTE) { 2511 movsbl(dst, dst); 2512 } else if (typ == T_SHORT) { 2513 movswl(dst, dst); 2514 } 2515 } 2516 2517 void 
C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) { 2518 int esize = type2aelembytes(typ); 2519 int elem_per_lane = 16/esize; 2520 int eindex = elemindex % elem_per_lane; 2521 assert(is_integral_type(typ),"required"); 2522 2523 if (eindex == 0) { 2524 if (typ == T_LONG) { 2525 movq(dst, src); 2526 } else { 2527 movdl(dst, src); 2528 movsxl(typ, dst); 2529 } 2530 } else { 2531 extract(typ, dst, src, eindex); 2532 movsxl(typ, dst); 2533 } 2534 } 2535 2536 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) { 2537 int esize = type2aelembytes(typ); 2538 int elem_per_lane = 16/esize; 2539 int eindex = elemindex % elem_per_lane; 2540 assert((typ == T_FLOAT || typ == T_DOUBLE),"required"); 2541 2542 if (eindex == 0) { 2543 movq(dst, src); 2544 } else { 2545 if (typ == T_FLOAT) { 2546 if (UseAVX == 0) { 2547 movdqu(dst, src); 2548 shufps(dst, dst, eindex); 2549 } else { 2550 vshufps(dst, src, src, eindex, Assembler::AVX_128bit); 2551 } 2552 } else { 2553 if (UseAVX == 0) { 2554 movdqu(dst, src); 2555 psrldq(dst, eindex*esize); 2556 } else { 2557 vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit); 2558 } 2559 movq(dst, dst); 2560 } 2561 } 2562 // Zero upper bits 2563 if (typ == T_FLOAT) { 2564 if (UseAVX == 0) { 2565 assert(vtmp != xnoreg, "required."); 2566 movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg); 2567 pand(dst, vtmp); 2568 } else { 2569 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg); 2570 } 2571 } 2572 } 2573 2574 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) { 2575 switch(typ) { 2576 case T_BYTE: 2577 case T_BOOLEAN: 2578 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2579 break; 2580 case T_SHORT: 2581 case T_CHAR: 2582 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2583 break; 2584 case T_INT: 2585 case T_FLOAT: 2586 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2587 break; 2588 case T_LONG: 2589 case T_DOUBLE: 2590 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2591 break; 2592 default: 2593 assert(false,"Should not reach here."); 2594 break; 2595 } 2596 } 2597 2598 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) { 2599 assert(rscratch != noreg || always_reachable(src2), "missing"); 2600 2601 switch(typ) { 2602 case T_BOOLEAN: 2603 case T_BYTE: 2604 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2605 break; 2606 case T_CHAR: 2607 case T_SHORT: 2608 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2609 break; 2610 case T_INT: 2611 case T_FLOAT: 2612 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2613 break; 2614 case T_LONG: 2615 case T_DOUBLE: 2616 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2617 break; 2618 default: 2619 assert(false,"Should not reach here."); 2620 break; 2621 } 2622 } 2623 2624 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) { 2625 switch(typ) { 2626 case T_BYTE: 2627 
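      // Byte/short blends need the AVX-512BW forms; the dword/qword blends
      // below only require AVX-512F (plus VL for sub-512-bit vectors).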
evpblendmb(dst, kmask, src1, src2, merge, vector_len); 2628 break; 2629 case T_SHORT: 2630 evpblendmw(dst, kmask, src1, src2, merge, vector_len); 2631 break; 2632 case T_INT: 2633 case T_FLOAT: 2634 evpblendmd(dst, kmask, src1, src2, merge, vector_len); 2635 break; 2636 case T_LONG: 2637 case T_DOUBLE: 2638 evpblendmq(dst, kmask, src1, src2, merge, vector_len); 2639 break; 2640 default: 2641 assert(false,"Should not reach here."); 2642 break; 2643 } 2644 } 2645 2646 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) { 2647 assert(vlen_in_bytes <= 32, ""); 2648 int esize = type2aelembytes(bt); 2649 if (vlen_in_bytes == 32) { 2650 assert(vtmp == xnoreg, "required."); 2651 if (esize >= 4) { 2652 vtestps(src1, src2, AVX_256bit); 2653 } else { 2654 vptest(src1, src2, AVX_256bit); 2655 } 2656 return; 2657 } 2658 if (vlen_in_bytes < 16) { 2659 // Duplicate the lower part to fill the whole register, 2660 // Don't need to do so for src2 2661 assert(vtmp != xnoreg, "required"); 2662 int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04; 2663 pshufd(vtmp, src1, shuffle_imm); 2664 } else { 2665 assert(vtmp == xnoreg, "required"); 2666 vtmp = src1; 2667 } 2668 if (esize >= 4 && VM_Version::supports_avx()) { 2669 vtestps(vtmp, src2, AVX_128bit); 2670 } else { 2671 ptest(vtmp, src2); 2672 } 2673 } 2674 2675 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { 2676 #ifdef ASSERT 2677 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2678 bool is_bw_supported = VM_Version::supports_avx512bw(); 2679 if (is_bw && !is_bw_supported) { 2680 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2681 assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16), 2682 "XMM register should be 0-15"); 2683 } 2684 #endif // ASSERT 2685 switch (elem_bt) { 2686 case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return; 2687 case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return; 2688 case T_INT: vpaddd(dst, src1, src2, vlen_enc); return; 2689 case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return; 2690 case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return; 2691 case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return; 2692 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2693 } 2694 } 2695 2696 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) { 2697 assert(UseAVX >= 2, "required"); 2698 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2699 bool is_vl = vlen_enc != Assembler::AVX_512bit; 2700 if ((UseAVX > 2) && 2701 (!is_bw || VM_Version::supports_avx512bw()) && 2702 (!is_vl || VM_Version::supports_avx512vl())) { 2703 switch (elem_bt) { 2704 case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return; 2705 case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return; 2706 case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return; 2707 case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return; 2708 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2709 } 2710 } else { 2711 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2712 assert((dst->encoding() < 16),"XMM register should be 0-15"); 2713 switch (elem_bt) { 2714 case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return; 2715 case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return; 2716 case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return; 2717 
case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return; 2718 case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return; 2719 case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return; 2720 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2721 } 2722 } 2723 } 2724 2725 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) { 2726 switch (to_elem_bt) { 2727 case T_SHORT: 2728 vpmovsxbw(dst, src, vlen_enc); 2729 break; 2730 case T_INT: 2731 vpmovsxbd(dst, src, vlen_enc); 2732 break; 2733 case T_FLOAT: 2734 vpmovsxbd(dst, src, vlen_enc); 2735 vcvtdq2ps(dst, dst, vlen_enc); 2736 break; 2737 case T_LONG: 2738 vpmovsxbq(dst, src, vlen_enc); 2739 break; 2740 case T_DOUBLE: { 2741 int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit; 2742 vpmovsxbd(dst, src, mid_vlen_enc); 2743 vcvtdq2pd(dst, dst, vlen_enc); 2744 break; 2745 } 2746 default: 2747 fatal("Unsupported type %s", type2name(to_elem_bt)); 2748 break; 2749 } 2750 } 2751 2752 //------------------------------------------------------------------------------------------- 2753 2754 // IndexOf for constant substrings with size >= 8 chars 2755 // which don't need to be loaded through stack. 2756 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2, 2757 Register cnt1, Register cnt2, 2758 int int_cnt2, Register result, 2759 XMMRegister vec, Register tmp, 2760 int ae) { 2761 ShortBranchVerifier sbv(this); 2762 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2763 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2764 2765 // This method uses the pcmpestri instruction with bound registers 2766 // inputs: 2767 // xmm - substring 2768 // rax - substring length (elements count) 2769 // mem - scanned string 2770 // rdx - string length (elements count) 2771 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2772 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2773 // outputs: 2774 // rcx - matched index in string 2775 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2776 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2777 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2778 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2779 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1; 2780 2781 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, 2782 RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR, 2783 MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE; 2784 2785 // Note, inline_string_indexOf() generates checks: 2786 // if (substr.count > string.count) return -1; 2787 // if (substr.count == 0) return 0; 2788 assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars"); 2789 2790 // Load substring. 
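  // For UL the Latin-1 substring is zero-extended to 16-bit chars with
  // pmovzxbw so it can be compared against the UTF-16 string; otherwise the
  // first 16 bytes of the substring are loaded as-is.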
2791 if (ae == StrIntrinsicNode::UL) { 2792 pmovzxbw(vec, Address(str2, 0)); 2793 } else { 2794 movdqu(vec, Address(str2, 0)); 2795 } 2796 movl(cnt2, int_cnt2); 2797 movptr(result, str1); // string addr 2798 2799 if (int_cnt2 > stride) { 2800 jmpb(SCAN_TO_SUBSTR); 2801 2802 // Reload substr for rescan, this code 2803 // is executed only for large substrings (> 8 chars) 2804 bind(RELOAD_SUBSTR); 2805 if (ae == StrIntrinsicNode::UL) { 2806 pmovzxbw(vec, Address(str2, 0)); 2807 } else { 2808 movdqu(vec, Address(str2, 0)); 2809 } 2810 negptr(cnt2); // Jumped here with negative cnt2, convert to positive 2811 2812 bind(RELOAD_STR); 2813 // We came here after the beginning of the substring was 2814 // matched but the rest of it was not so we need to search 2815 // again. Start from the next element after the previous match. 2816 2817 // cnt2 is number of substring reminding elements and 2818 // cnt1 is number of string reminding elements when cmp failed. 2819 // Restored cnt1 = cnt1 - cnt2 + int_cnt2 2820 subl(cnt1, cnt2); 2821 addl(cnt1, int_cnt2); 2822 movl(cnt2, int_cnt2); // Now restore cnt2 2823 2824 decrementl(cnt1); // Shift to next element 2825 cmpl(cnt1, cnt2); 2826 jcc(Assembler::negative, RET_NOT_FOUND); // Left less then substring 2827 2828 addptr(result, (1<<scale1)); 2829 2830 } // (int_cnt2 > 8) 2831 2832 // Scan string for start of substr in 16-byte vectors 2833 bind(SCAN_TO_SUBSTR); 2834 pcmpestri(vec, Address(result, 0), mode); 2835 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 2836 subl(cnt1, stride); 2837 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 2838 cmpl(cnt1, cnt2); 2839 jccb(Assembler::negative, RET_NOT_FOUND); // Left less then substring 2840 addptr(result, 16); 2841 jmpb(SCAN_TO_SUBSTR); 2842 2843 // Found a potential substr 2844 bind(FOUND_CANDIDATE); 2845 // Matched whole vector if first element matched (tmp(rcx) == 0). 2846 if (int_cnt2 == stride) { 2847 jccb(Assembler::overflow, RET_FOUND); // OF == 1 2848 } else { // int_cnt2 > 8 2849 jccb(Assembler::overflow, FOUND_SUBSTR); 2850 } 2851 // After pcmpestri tmp(rcx) contains matched element index 2852 // Compute start addr of substr 2853 lea(result, Address(result, tmp, scale1)); 2854 2855 // Make sure string is still long enough 2856 subl(cnt1, tmp); 2857 cmpl(cnt1, cnt2); 2858 if (int_cnt2 == stride) { 2859 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 2860 } else { // int_cnt2 > 8 2861 jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD); 2862 } 2863 // Left less then substring. 2864 2865 bind(RET_NOT_FOUND); 2866 movl(result, -1); 2867 jmp(EXIT); 2868 2869 if (int_cnt2 > stride) { 2870 // This code is optimized for the case when whole substring 2871 // is matched if its head is matched. 2872 bind(MATCH_SUBSTR_HEAD); 2873 pcmpestri(vec, Address(result, 0), mode); 2874 // Reload only string if does not match 2875 jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0 2876 2877 Label CONT_SCAN_SUBSTR; 2878 // Compare the rest of substring (> 8 chars). 2879 bind(FOUND_SUBSTR); 2880 // First 8 chars are already matched. 
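  // cnt2 now becomes a negative element offset measured from the end of the
  // substring; each SCAN_SUBSTR pass below adds 'stride' and the loop stops
  // once cnt2 is no longer negative, i.e. the tail has been fully compared.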
2881 negptr(cnt2); 2882 addptr(cnt2, stride); 2883 2884 bind(SCAN_SUBSTR); 2885 subl(cnt1, stride); 2886 cmpl(cnt2, -stride); // Do not read beyond substring 2887 jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR); 2888 // Back-up strings to avoid reading beyond substring: 2889 // cnt1 = cnt1 - cnt2 + 8 2890 addl(cnt1, cnt2); // cnt2 is negative 2891 addl(cnt1, stride); 2892 movl(cnt2, stride); negptr(cnt2); 2893 bind(CONT_SCAN_SUBSTR); 2894 if (int_cnt2 < (int)G) { 2895 int tail_off1 = int_cnt2<<scale1; 2896 int tail_off2 = int_cnt2<<scale2; 2897 if (ae == StrIntrinsicNode::UL) { 2898 pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2)); 2899 } else { 2900 movdqu(vec, Address(str2, cnt2, scale2, tail_off2)); 2901 } 2902 pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode); 2903 } else { 2904 // calculate index in register to avoid integer overflow (int_cnt2*2) 2905 movl(tmp, int_cnt2); 2906 addptr(tmp, cnt2); 2907 if (ae == StrIntrinsicNode::UL) { 2908 pmovzxbw(vec, Address(str2, tmp, scale2, 0)); 2909 } else { 2910 movdqu(vec, Address(str2, tmp, scale2, 0)); 2911 } 2912 pcmpestri(vec, Address(result, tmp, scale1, 0), mode); 2913 } 2914 // Need to reload strings pointers if not matched whole vector 2915 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 2916 addptr(cnt2, stride); 2917 jcc(Assembler::negative, SCAN_SUBSTR); 2918 // Fall through if found full substring 2919 2920 } // (int_cnt2 > 8) 2921 2922 bind(RET_FOUND); 2923 // Found result if we matched full small substring. 2924 // Compute substr offset 2925 subptr(result, str1); 2926 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 2927 shrl(result, 1); // index 2928 } 2929 bind(EXIT); 2930 2931 } // string_indexofC8 2932 2933 // Small strings are loaded through stack if they cross page boundary. 2934 void C2_MacroAssembler::string_indexof(Register str1, Register str2, 2935 Register cnt1, Register cnt2, 2936 int int_cnt2, Register result, 2937 XMMRegister vec, Register tmp, 2938 int ae) { 2939 ShortBranchVerifier sbv(this); 2940 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2941 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2942 2943 // 2944 // int_cnt2 is length of small (< 8 chars) constant substring 2945 // or (-1) for non constant substring in which case its length 2946 // is in cnt2 register. 2947 // 2948 // Note, inline_string_indexOf() generates checks: 2949 // if (substr.count > string.count) return -1; 2950 // if (substr.count == 0) return 0; 2951 // 2952 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2953 assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0"); 2954 // This method uses the pcmpestri instruction with bound registers 2955 // inputs: 2956 // xmm - substring 2957 // rax - substring length (elements count) 2958 // mem - scanned string 2959 // rdx - string length (elements count) 2960 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2961 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2962 // outputs: 2963 // rcx - matched index in string 2964 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2965 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2966 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2967 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? 
Address::times_1 : scale1; 2968 2969 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR, 2970 RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR, 2971 FOUND_CANDIDATE; 2972 2973 { //======================================================== 2974 // We don't know where these strings are located 2975 // and we can't read beyond them. Load them through stack. 2976 Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR; 2977 2978 movptr(tmp, rsp); // save old SP 2979 2980 if (int_cnt2 > 0) { // small (< 8 chars) constant substring 2981 if (int_cnt2 == (1>>scale2)) { // One byte 2982 assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding"); 2983 load_unsigned_byte(result, Address(str2, 0)); 2984 movdl(vec, result); // move 32 bits 2985 } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes 2986 // Not enough header space in 32-bit VM: 12+3 = 15. 2987 movl(result, Address(str2, -1)); 2988 shrl(result, 8); 2989 movdl(vec, result); // move 32 bits 2990 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char 2991 load_unsigned_short(result, Address(str2, 0)); 2992 movdl(vec, result); // move 32 bits 2993 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars 2994 movdl(vec, Address(str2, 0)); // move 32 bits 2995 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars 2996 movq(vec, Address(str2, 0)); // move 64 bits 2997 } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7}) 2998 // Array header size is 12 bytes in 32-bit VM 2999 // + 6 bytes for 3 chars == 18 bytes, 3000 // enough space to load vec and shift. 3001 assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity"); 3002 if (ae == StrIntrinsicNode::UL) { 3003 int tail_off = int_cnt2-8; 3004 pmovzxbw(vec, Address(str2, tail_off)); 3005 psrldq(vec, -2*tail_off); 3006 } 3007 else { 3008 int tail_off = int_cnt2*(1<<scale2); 3009 movdqu(vec, Address(str2, tail_off-16)); 3010 psrldq(vec, 16-tail_off); 3011 } 3012 } 3013 } else { // not constant substring 3014 cmpl(cnt2, stride); 3015 jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough 3016 3017 // We can read beyond string if srt+16 does not cross page boundary 3018 // since heaps are aligned and mapped by pages. 3019 assert(os::vm_page_size() < (int)G, "default page should be small"); 3020 movl(result, str2); // We need only low 32 bits 3021 andl(result, ((int)os::vm_page_size()-1)); 3022 cmpl(result, ((int)os::vm_page_size()-16)); 3023 jccb(Assembler::belowEqual, CHECK_STR); 3024 3025 // Move small strings to stack to allow load 16 bytes into vec. 3026 subptr(rsp, 16); 3027 int stk_offset = wordSize-(1<<scale2); 3028 push(cnt2); 3029 3030 bind(COPY_SUBSTR); 3031 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) { 3032 load_unsigned_byte(result, Address(str2, cnt2, scale2, -1)); 3033 movb(Address(rsp, cnt2, scale2, stk_offset), result); 3034 } else if (ae == StrIntrinsicNode::UU) { 3035 load_unsigned_short(result, Address(str2, cnt2, scale2, -2)); 3036 movw(Address(rsp, cnt2, scale2, stk_offset), result); 3037 } 3038 decrement(cnt2); 3039 jccb(Assembler::notZero, COPY_SUBSTR); 3040 3041 pop(cnt2); 3042 movptr(str2, rsp); // New substring address 3043 } // non constant 3044 3045 bind(CHECK_STR); 3046 cmpl(cnt1, stride); 3047 jccb(Assembler::aboveEqual, BIG_STRINGS); 3048 3049 // Check cross page boundary. 
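    // As for str2 above: a 16-byte read starting at str1 stays within the
    // page when (address & (page_size-1)) <= page_size-16; only otherwise is
    // the short string copied to the stack.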
3050 movl(result, str1); // We need only low 32 bits 3051 andl(result, ((int)os::vm_page_size()-1)); 3052 cmpl(result, ((int)os::vm_page_size()-16)); 3053 jccb(Assembler::belowEqual, BIG_STRINGS); 3054 3055 subptr(rsp, 16); 3056 int stk_offset = -(1<<scale1); 3057 if (int_cnt2 < 0) { // not constant 3058 push(cnt2); 3059 stk_offset += wordSize; 3060 } 3061 movl(cnt2, cnt1); 3062 3063 bind(COPY_STR); 3064 if (ae == StrIntrinsicNode::LL) { 3065 load_unsigned_byte(result, Address(str1, cnt2, scale1, -1)); 3066 movb(Address(rsp, cnt2, scale1, stk_offset), result); 3067 } else { 3068 load_unsigned_short(result, Address(str1, cnt2, scale1, -2)); 3069 movw(Address(rsp, cnt2, scale1, stk_offset), result); 3070 } 3071 decrement(cnt2); 3072 jccb(Assembler::notZero, COPY_STR); 3073 3074 if (int_cnt2 < 0) { // not constant 3075 pop(cnt2); 3076 } 3077 movptr(str1, rsp); // New string address 3078 3079 bind(BIG_STRINGS); 3080 // Load substring. 3081 if (int_cnt2 < 0) { // -1 3082 if (ae == StrIntrinsicNode::UL) { 3083 pmovzxbw(vec, Address(str2, 0)); 3084 } else { 3085 movdqu(vec, Address(str2, 0)); 3086 } 3087 push(cnt2); // substr count 3088 push(str2); // substr addr 3089 push(str1); // string addr 3090 } else { 3091 // Small (< 8 chars) constant substrings are loaded already. 3092 movl(cnt2, int_cnt2); 3093 } 3094 push(tmp); // original SP 3095 3096 } // Finished loading 3097 3098 //======================================================== 3099 // Start search 3100 // 3101 3102 movptr(result, str1); // string addr 3103 3104 if (int_cnt2 < 0) { // Only for non constant substring 3105 jmpb(SCAN_TO_SUBSTR); 3106 3107 // SP saved at sp+0 3108 // String saved at sp+1*wordSize 3109 // Substr saved at sp+2*wordSize 3110 // Substr count saved at sp+3*wordSize 3111 3112 // Reload substr for rescan, this code 3113 // is executed only for large substrings (> 8 chars) 3114 bind(RELOAD_SUBSTR); 3115 movptr(str2, Address(rsp, 2*wordSize)); 3116 movl(cnt2, Address(rsp, 3*wordSize)); 3117 if (ae == StrIntrinsicNode::UL) { 3118 pmovzxbw(vec, Address(str2, 0)); 3119 } else { 3120 movdqu(vec, Address(str2, 0)); 3121 } 3122 // We came here after the beginning of the substring was 3123 // matched but the rest of it was not so we need to search 3124 // again. Start from the next element after the previous match. 3125 subptr(str1, result); // Restore counter 3126 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3127 shrl(str1, 1); 3128 } 3129 addl(cnt1, str1); 3130 decrementl(cnt1); // Shift to next element 3131 cmpl(cnt1, cnt2); 3132 jcc(Assembler::negative, RET_NOT_FOUND); // Left less then substring 3133 3134 addptr(result, (1<<scale1)); 3135 } // non constant 3136 3137 // Scan string for start of substr in 16-byte vectors 3138 bind(SCAN_TO_SUBSTR); 3139 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 3140 pcmpestri(vec, Address(result, 0), mode); 3141 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 3142 subl(cnt1, stride); 3143 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 3144 cmpl(cnt1, cnt2); 3145 jccb(Assembler::negative, RET_NOT_FOUND); // Left less then substring 3146 addptr(result, 16); 3147 3148 bind(ADJUST_STR); 3149 cmpl(cnt1, stride); // Do not read beyond string 3150 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 3151 // Back-up string to avoid reading beyond string. 
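    // The lea below is equivalent to result += (cnt1 - stride) * element_size, so
    // the final 16-byte read window ends exactly at the end of the string and
    // cnt1 is then treated as a full stride (stride * element_size is always
    // 16 bytes here).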
3152    lea(result, Address(result, cnt1, scale1, -16));
3153    movl(cnt1, stride);
3154    jmpb(SCAN_TO_SUBSTR);
3155
3156    // Found a potential substr
3157    bind(FOUND_CANDIDATE);
3158    // After pcmpestri tmp(rcx) contains matched element index
3159
3160    // Make sure string is still long enough
3161    subl(cnt1, tmp);
3162    cmpl(cnt1, cnt2);
3163    jccb(Assembler::greaterEqual, FOUND_SUBSTR);
3164    // Left less than substring.
3165
3166    bind(RET_NOT_FOUND);
3167    movl(result, -1);
3168    jmp(CLEANUP);
3169
3170    bind(FOUND_SUBSTR);
3171    // Compute start addr of substr
3172    lea(result, Address(result, tmp, scale1));
3173    if (int_cnt2 > 0) { // Constant substring
3174      // Repeat search for small substring (< 8 chars)
3175      // from new point without reloading substring.
3176      // Have to check that we don't read beyond string.
3177      cmpl(tmp, stride-int_cnt2);
3178      jccb(Assembler::greater, ADJUST_STR);
3179      // Fall through if matched whole substring.
3180    } else { // non constant
3181      assert(int_cnt2 == -1, "should be != 0");
3182
3183      addl(tmp, cnt2);
3184      // Found result if we matched whole substring.
3185      cmpl(tmp, stride);
3186      jcc(Assembler::lessEqual, RET_FOUND);
3187
3188      // Repeat search for small substring (<= 8 chars)
3189      // from new point 'str1' without reloading substring.
3190      cmpl(cnt2, stride);
3191      // Have to check that we don't read beyond string.
3192      jccb(Assembler::lessEqual, ADJUST_STR);
3193
3194      Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
3195      // Compare the rest of substring (> 8 chars).
3196      movptr(str1, result);
3197
3198      cmpl(tmp, cnt2);
3199      // First 8 chars are already matched.
3200      jccb(Assembler::equal, CHECK_NEXT);
3201
3202      bind(SCAN_SUBSTR);
3203      pcmpestri(vec, Address(str1, 0), mode);
3204      // Need to reload string pointers if we did not match the whole vector
3205      jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3206
3207      bind(CHECK_NEXT);
3208      subl(cnt2, stride);
3209      jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
3210      addptr(str1, 16);
3211      if (ae == StrIntrinsicNode::UL) {
3212        addptr(str2, 8);
3213      } else {
3214        addptr(str2, 16);
3215      }
3216      subl(cnt1, stride);
3217      cmpl(cnt2, stride); // Do not read beyond substring
3218      jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
3219      // Back-up strings to avoid reading beyond substring.
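      // Illustrative sketch (not the emitted code) of the back-up done below,
      // where es1/es2 stand for the element sizes of string and substring:
      //   str2 += (cnt2 - stride) * es2;  // re-read the last full substring vector
      //   str1 += (cnt2 - stride) * es1;
      //   cnt1 -= (cnt2 - stride);
      //   cnt2  = stride;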
3220 3221 if (ae == StrIntrinsicNode::UL) { 3222 lea(str2, Address(str2, cnt2, scale2, -8)); 3223 lea(str1, Address(str1, cnt2, scale1, -16)); 3224 } else { 3225 lea(str2, Address(str2, cnt2, scale2, -16)); 3226 lea(str1, Address(str1, cnt2, scale1, -16)); 3227 } 3228 subl(cnt1, cnt2); 3229 movl(cnt2, stride); 3230 addl(cnt1, stride); 3231 bind(CONT_SCAN_SUBSTR); 3232 if (ae == StrIntrinsicNode::UL) { 3233 pmovzxbw(vec, Address(str2, 0)); 3234 } else { 3235 movdqu(vec, Address(str2, 0)); 3236 } 3237 jmp(SCAN_SUBSTR); 3238 3239 bind(RET_FOUND_LONG); 3240 movptr(str1, Address(rsp, wordSize)); 3241 } // non constant 3242 3243 bind(RET_FOUND); 3244 // Compute substr offset 3245 subptr(result, str1); 3246 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3247 shrl(result, 1); // index 3248 } 3249 bind(CLEANUP); 3250 pop(rsp); // restore SP 3251 3252 } // string_indexof 3253 3254 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3255 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3256 ShortBranchVerifier sbv(this); 3257 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3258 3259 int stride = 8; 3260 3261 Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP, 3262 SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP, 3263 RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT, 3264 FOUND_SEQ_CHAR, DONE_LABEL; 3265 3266 movptr(result, str1); 3267 if (UseAVX >= 2) { 3268 cmpl(cnt1, stride); 3269 jcc(Assembler::less, SCAN_TO_CHAR); 3270 cmpl(cnt1, 2*stride); 3271 jcc(Assembler::less, SCAN_TO_8_CHAR_INIT); 3272 movdl(vec1, ch); 3273 vpbroadcastw(vec1, vec1, Assembler::AVX_256bit); 3274 vpxor(vec2, vec2); 3275 movl(tmp, cnt1); 3276 andl(tmp, 0xFFFFFFF0); //vector count (in chars) 3277 andl(cnt1,0x0000000F); //tail count (in chars) 3278 3279 bind(SCAN_TO_16_CHAR_LOOP); 3280 vmovdqu(vec3, Address(result, 0)); 3281 vpcmpeqw(vec3, vec3, vec1, 1); 3282 vptest(vec2, vec3); 3283 jcc(Assembler::carryClear, FOUND_CHAR); 3284 addptr(result, 32); 3285 subl(tmp, 2*stride); 3286 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); 3287 jmp(SCAN_TO_8_CHAR); 3288 bind(SCAN_TO_8_CHAR_INIT); 3289 movdl(vec1, ch); 3290 pshuflw(vec1, vec1, 0x00); 3291 pshufd(vec1, vec1, 0); 3292 pxor(vec2, vec2); 3293 } 3294 bind(SCAN_TO_8_CHAR); 3295 cmpl(cnt1, stride); 3296 jcc(Assembler::less, SCAN_TO_CHAR); 3297 if (UseAVX < 2) { 3298 movdl(vec1, ch); 3299 pshuflw(vec1, vec1, 0x00); 3300 pshufd(vec1, vec1, 0); 3301 pxor(vec2, vec2); 3302 } 3303 movl(tmp, cnt1); 3304 andl(tmp, 0xFFFFFFF8); //vector count (in chars) 3305 andl(cnt1,0x00000007); //tail count (in chars) 3306 3307 bind(SCAN_TO_8_CHAR_LOOP); 3308 movdqu(vec3, Address(result, 0)); 3309 pcmpeqw(vec3, vec1); 3310 ptest(vec2, vec3); 3311 jcc(Assembler::carryClear, FOUND_CHAR); 3312 addptr(result, 16); 3313 subl(tmp, stride); 3314 jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP); 3315 bind(SCAN_TO_CHAR); 3316 testl(cnt1, cnt1); 3317 jcc(Assembler::zero, RET_NOT_FOUND); 3318 bind(SCAN_TO_CHAR_LOOP); 3319 load_unsigned_short(tmp, Address(result, 0)); 3320 cmpl(ch, tmp); 3321 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3322 addptr(result, 2); 3323 subl(cnt1, 1); 3324 jccb(Assembler::zero, RET_NOT_FOUND); 3325 jmp(SCAN_TO_CHAR_LOOP); 3326 3327 bind(RET_NOT_FOUND); 3328 movl(result, -1); 3329 jmpb(DONE_LABEL); 3330 3331 bind(FOUND_CHAR); 3332 if (UseAVX >= 2) { 3333 vpmovmskb(tmp, vec3); 3334 } else { 3335 pmovmskb(tmp, vec3); 3336 } 3337 bsfl(ch, tmp); 3338 addptr(result, ch); 3339 3340 bind(FOUND_SEQ_CHAR); 3341 
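  // 'result' points at the matching char and 'str1' still holds the original
  // base address, so the subtraction and shift below convert the byte offset
  // into a char index.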
subptr(result, str1); 3342 shrl(result, 1); 3343 3344 bind(DONE_LABEL); 3345 } // string_indexof_char 3346 3347 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3348 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3349 ShortBranchVerifier sbv(this); 3350 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3351 3352 int stride = 16; 3353 3354 Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP, 3355 SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP, 3356 RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT, 3357 FOUND_SEQ_CHAR, DONE_LABEL; 3358 3359 movptr(result, str1); 3360 if (UseAVX >= 2) { 3361 cmpl(cnt1, stride); 3362 jcc(Assembler::less, SCAN_TO_CHAR_INIT); 3363 cmpl(cnt1, stride*2); 3364 jcc(Assembler::less, SCAN_TO_16_CHAR_INIT); 3365 movdl(vec1, ch); 3366 vpbroadcastb(vec1, vec1, Assembler::AVX_256bit); 3367 vpxor(vec2, vec2); 3368 movl(tmp, cnt1); 3369 andl(tmp, 0xFFFFFFE0); //vector count (in chars) 3370 andl(cnt1,0x0000001F); //tail count (in chars) 3371 3372 bind(SCAN_TO_32_CHAR_LOOP); 3373 vmovdqu(vec3, Address(result, 0)); 3374 vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit); 3375 vptest(vec2, vec3); 3376 jcc(Assembler::carryClear, FOUND_CHAR); 3377 addptr(result, 32); 3378 subl(tmp, stride*2); 3379 jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP); 3380 jmp(SCAN_TO_16_CHAR); 3381 3382 bind(SCAN_TO_16_CHAR_INIT); 3383 movdl(vec1, ch); 3384 pxor(vec2, vec2); 3385 pshufb(vec1, vec2); 3386 } 3387 3388 bind(SCAN_TO_16_CHAR); 3389 cmpl(cnt1, stride); 3390 jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left 3391 if (UseAVX < 2) { 3392 movdl(vec1, ch); 3393 pxor(vec2, vec2); 3394 pshufb(vec1, vec2); 3395 } 3396 movl(tmp, cnt1); 3397 andl(tmp, 0xFFFFFFF0); //vector count (in bytes) 3398 andl(cnt1,0x0000000F); //tail count (in bytes) 3399 3400 bind(SCAN_TO_16_CHAR_LOOP); 3401 movdqu(vec3, Address(result, 0)); 3402 pcmpeqb(vec3, vec1); 3403 ptest(vec2, vec3); 3404 jcc(Assembler::carryClear, FOUND_CHAR); 3405 addptr(result, 16); 3406 subl(tmp, stride); 3407 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items... 
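  // Byte-wise tail: fewer than 16 bytes remain. Conceptually (illustrative
  // sketch only, not the emitted code; 'p' stands for the current value of
  // result):
  //   for (int i = 0; i < cnt1; i++) {
  //     if ((p[i] & 0xff) == ch) return (p + i) - str1;
  //   }
  //   return -1;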
3408 3409 bind(SCAN_TO_CHAR_INIT); 3410 testl(cnt1, cnt1); 3411 jcc(Assembler::zero, RET_NOT_FOUND); 3412 bind(SCAN_TO_CHAR_LOOP); 3413 load_unsigned_byte(tmp, Address(result, 0)); 3414 cmpl(ch, tmp); 3415 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3416 addptr(result, 1); 3417 subl(cnt1, 1); 3418 jccb(Assembler::zero, RET_NOT_FOUND); 3419 jmp(SCAN_TO_CHAR_LOOP); 3420 3421 bind(RET_NOT_FOUND); 3422 movl(result, -1); 3423 jmpb(DONE_LABEL); 3424 3425 bind(FOUND_CHAR); 3426 if (UseAVX >= 2) { 3427 vpmovmskb(tmp, vec3); 3428 } else { 3429 pmovmskb(tmp, vec3); 3430 } 3431 bsfl(ch, tmp); 3432 addptr(result, ch); 3433 3434 bind(FOUND_SEQ_CHAR); 3435 subptr(result, str1); 3436 3437 bind(DONE_LABEL); 3438 } // stringL_indexof_char 3439 3440 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) { 3441 switch (eltype) { 3442 case T_BOOLEAN: return sizeof(jboolean); 3443 case T_BYTE: return sizeof(jbyte); 3444 case T_SHORT: return sizeof(jshort); 3445 case T_CHAR: return sizeof(jchar); 3446 case T_INT: return sizeof(jint); 3447 default: 3448 ShouldNotReachHere(); 3449 return -1; 3450 } 3451 } 3452 3453 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) { 3454 switch (eltype) { 3455 // T_BOOLEAN used as surrogate for unsigned byte 3456 case T_BOOLEAN: movzbl(dst, src); break; 3457 case T_BYTE: movsbl(dst, src); break; 3458 case T_SHORT: movswl(dst, src); break; 3459 case T_CHAR: movzwl(dst, src); break; 3460 case T_INT: movl(dst, src); break; 3461 default: 3462 ShouldNotReachHere(); 3463 } 3464 } 3465 3466 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) { 3467 load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8); 3468 } 3469 3470 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) { 3471 load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8); 3472 } 3473 3474 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) { 3475 const int vlen = Assembler::AVX_256bit; 3476 switch (eltype) { 3477 case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3478 case T_BYTE: vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3479 case T_SHORT: vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3480 case T_CHAR: vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3481 case T_INT: 3482 // do nothing 3483 break; 3484 default: 3485 ShouldNotReachHere(); 3486 } 3487 } 3488 3489 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result, 3490 Register index, Register tmp2, Register tmp3, XMMRegister vnext, 3491 XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3, 3492 XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3, 3493 XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3, 3494 BasicType eltype) { 3495 ShortBranchVerifier sbv(this); 3496 assert(UseAVX >= 2, "AVX2 intrinsics are required"); 3497 assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3); 3498 assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3); 3499 3500 Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN, 3501 SHORT_UNROLLED_LOOP_EXIT, 3502 UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME, 3503 UNROLLED_VECTOR_LOOP_BEGIN, 3504 END; 3505 switch (eltype) { 3506 case T_BOOLEAN: 
BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break; 3507 case T_CHAR: BLOCK_COMMENT("arrays_hashcode(char) {"); break; 3508 case T_BYTE: BLOCK_COMMENT("arrays_hashcode(byte) {"); break; 3509 case T_SHORT: BLOCK_COMMENT("arrays_hashcode(short) {"); break; 3510 case T_INT: BLOCK_COMMENT("arrays_hashcode(int) {"); break; 3511 default: BLOCK_COMMENT("arrays_hashcode {"); break; 3512 } 3513 3514 // For "renaming" for readibility of the code 3515 const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 }, 3516 vresult[] = { vresult0, vresult1, vresult2, vresult3 }, 3517 vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 }; 3518 3519 const int elsize = arrays_hashcode_elsize(eltype); 3520 3521 /* 3522 if (cnt1 >= 2) { 3523 if (cnt1 >= 32) { 3524 UNROLLED VECTOR LOOP 3525 } 3526 UNROLLED SCALAR LOOP 3527 } 3528 SINGLE SCALAR 3529 */ 3530 3531 cmpl(cnt1, 32); 3532 jcc(Assembler::less, SHORT_UNROLLED_BEGIN); 3533 3534 // cnt1 >= 32 && generate_vectorized_loop 3535 xorl(index, index); 3536 3537 // vresult = IntVector.zero(I256); 3538 for (int idx = 0; idx < 4; idx++) { 3539 vpxor(vresult[idx], vresult[idx]); 3540 } 3541 // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]); 3542 Register bound = tmp2; 3543 Register next = tmp3; 3544 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint)))); 3545 movl(next, Address(tmp2, 0)); 3546 movdl(vnext, next); 3547 vpbroadcastd(vnext, vnext, Assembler::AVX_256bit); 3548 3549 // index = 0; 3550 // bound = cnt1 & ~(32 - 1); 3551 movl(bound, cnt1); 3552 andl(bound, ~(32 - 1)); 3553 // for (; index < bound; index += 32) { 3554 bind(UNROLLED_VECTOR_LOOP_BEGIN); 3555 // result *= next; 3556 imull(result, next); 3557 // loop fission to upfront the cost of fetching from memory, OOO execution 3558 // can then hopefully do a better job of prefetching 3559 for (int idx = 0; idx < 4; idx++) { 3560 arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype); 3561 } 3562 // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7]; 3563 for (int idx = 0; idx < 4; idx++) { 3564 vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit); 3565 arrays_hashcode_elvcast(vtmp[idx], eltype); 3566 vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit); 3567 } 3568 // index += 32; 3569 addl(index, 32); 3570 // index < bound; 3571 cmpl(index, bound); 3572 jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN); 3573 // } 3574 3575 lea(ary1, Address(ary1, bound, Address::times(elsize))); 3576 subl(cnt1, bound); 3577 // release bound 3578 3579 // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1); 3580 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint)))); 3581 for (int idx = 0; idx < 4; idx++) { 3582 arrays_hashcode_elvload(vcoef[idx], Address(tmp2, (int)((8 * idx + 1) * sizeof(jint))), T_INT); 3583 vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit); 3584 } 3585 // result += vresult.reduceLanes(ADD); 3586 for (int idx = 0; idx < 4; idx++) { 3587 reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]); 3588 } 3589 3590 // } else if (cnt1 < 32) { 3591 3592 bind(SHORT_UNROLLED_BEGIN); 3593 // int i = 1; 3594 movl(index, 1); 3595 cmpl(index, cnt1); 3596 jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT); 3597 3598 // for (; i < cnt1 ; i += 2) { 3599 bind(SHORT_UNROLLED_LOOP_BEGIN); 3600 movl(tmp3, 961); 3601 imull(result, tmp3); 
3602 arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3603 movl(tmp3, tmp2); 3604 shll(tmp3, 5); 3605 subl(tmp3, tmp2); 3606 addl(result, tmp3); 3607 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype); 3608 addl(result, tmp3); 3609 addl(index, 2); 3610 cmpl(index, cnt1); 3611 jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN); 3612 3613 // } 3614 // if (i >= cnt1) { 3615 bind(SHORT_UNROLLED_LOOP_EXIT); 3616 jccb(Assembler::greater, END); 3617 movl(tmp2, result); 3618 shll(result, 5); 3619 subl(result, tmp2); 3620 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3621 addl(result, tmp3); 3622 // } 3623 bind(END); 3624 3625 BLOCK_COMMENT("} // arrays_hashcode"); 3626 3627 } // arrays_hashcode 3628 3629 // helper function for string_compare 3630 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2, 3631 Address::ScaleFactor scale, Address::ScaleFactor scale1, 3632 Address::ScaleFactor scale2, Register index, int ae) { 3633 if (ae == StrIntrinsicNode::LL) { 3634 load_unsigned_byte(elem1, Address(str1, index, scale, 0)); 3635 load_unsigned_byte(elem2, Address(str2, index, scale, 0)); 3636 } else if (ae == StrIntrinsicNode::UU) { 3637 load_unsigned_short(elem1, Address(str1, index, scale, 0)); 3638 load_unsigned_short(elem2, Address(str2, index, scale, 0)); 3639 } else { 3640 load_unsigned_byte(elem1, Address(str1, index, scale1, 0)); 3641 load_unsigned_short(elem2, Address(str2, index, scale2, 0)); 3642 } 3643 } 3644 3645 // Compare strings, used for char[] and byte[]. 3646 void C2_MacroAssembler::string_compare(Register str1, Register str2, 3647 Register cnt1, Register cnt2, Register result, 3648 XMMRegister vec1, int ae, KRegister mask) { 3649 ShortBranchVerifier sbv(this); 3650 Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL; 3651 Label COMPARE_WIDE_VECTORS_LOOP_FAILED; // used only AVX3 3652 int stride, stride2, adr_stride, adr_stride1, adr_stride2; 3653 int stride2x2 = 0x40; 3654 Address::ScaleFactor scale = Address::no_scale; 3655 Address::ScaleFactor scale1 = Address::no_scale; 3656 Address::ScaleFactor scale2 = Address::no_scale; 3657 3658 if (ae != StrIntrinsicNode::LL) { 3659 stride2x2 = 0x20; 3660 } 3661 3662 if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) { 3663 shrl(cnt2, 1); 3664 } 3665 // Compute the minimum of the string lengths and the 3666 // difference of the string lengths (stack). 3667 // Do the conditional move stuff 3668 movl(result, cnt1); 3669 subl(cnt1, cnt2); 3670 push(cnt1); 3671 cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2) 3672 3673 // Is the minimum length zero? 
3674 testl(cnt2, cnt2); 3675 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3676 if (ae == StrIntrinsicNode::LL) { 3677 // Load first bytes 3678 load_unsigned_byte(result, Address(str1, 0)); // result = str1[0] 3679 load_unsigned_byte(cnt1, Address(str2, 0)); // cnt1 = str2[0] 3680 } else if (ae == StrIntrinsicNode::UU) { 3681 // Load first characters 3682 load_unsigned_short(result, Address(str1, 0)); 3683 load_unsigned_short(cnt1, Address(str2, 0)); 3684 } else { 3685 load_unsigned_byte(result, Address(str1, 0)); 3686 load_unsigned_short(cnt1, Address(str2, 0)); 3687 } 3688 subl(result, cnt1); 3689 jcc(Assembler::notZero, POP_LABEL); 3690 3691 if (ae == StrIntrinsicNode::UU) { 3692 // Divide length by 2 to get number of chars 3693 shrl(cnt2, 1); 3694 } 3695 cmpl(cnt2, 1); 3696 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3697 3698 // Check if the strings start at the same location and setup scale and stride 3699 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3700 cmpptr(str1, str2); 3701 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3702 if (ae == StrIntrinsicNode::LL) { 3703 scale = Address::times_1; 3704 stride = 16; 3705 } else { 3706 scale = Address::times_2; 3707 stride = 8; 3708 } 3709 } else { 3710 scale1 = Address::times_1; 3711 scale2 = Address::times_2; 3712 // scale not used 3713 stride = 8; 3714 } 3715 3716 if (UseAVX >= 2 && UseSSE42Intrinsics) { 3717 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR; 3718 Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR; 3719 Label COMPARE_WIDE_VECTORS_LOOP_AVX2; 3720 Label COMPARE_TAIL_LONG; 3721 Label COMPARE_WIDE_VECTORS_LOOP_AVX3; // used only AVX3 3722 3723 int pcmpmask = 0x19; 3724 if (ae == StrIntrinsicNode::LL) { 3725 pcmpmask &= ~0x01; 3726 } 3727 3728 // Setup to compare 16-chars (32-bytes) vectors, 3729 // start from first character again because it has aligned address. 3730 if (ae == StrIntrinsicNode::LL) { 3731 stride2 = 32; 3732 } else { 3733 stride2 = 16; 3734 } 3735 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3736 adr_stride = stride << scale; 3737 } else { 3738 adr_stride1 = 8; //stride << scale1; 3739 adr_stride2 = 16; //stride << scale2; 3740 } 3741 3742 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3743 // rax and rdx are used by pcmpestri as elements counters 3744 movl(result, cnt2); 3745 andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count 3746 jcc(Assembler::zero, COMPARE_TAIL_LONG); 3747 3748 // fast path : compare first 2 8-char vectors. 
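    // The pcmpestri invocations below use imm8 0x19 (0x18 for LL): "equal each"
    // aggregation with negated polarity on unsigned shorts (bytes for LL), so
    // rcx receives the index of the first mismatching element and CF is set when
    // a mismatch is found (see the fuller comment in the SSE4.2 path below).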
3749 bind(COMPARE_16_CHARS); 3750 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3751 movdqu(vec1, Address(str1, 0)); 3752 } else { 3753 pmovzxbw(vec1, Address(str1, 0)); 3754 } 3755 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3756 jccb(Assembler::below, COMPARE_INDEX_CHAR); 3757 3758 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3759 movdqu(vec1, Address(str1, adr_stride)); 3760 pcmpestri(vec1, Address(str2, adr_stride), pcmpmask); 3761 } else { 3762 pmovzxbw(vec1, Address(str1, adr_stride1)); 3763 pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask); 3764 } 3765 jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS); 3766 addl(cnt1, stride); 3767 3768 // Compare the characters at index in cnt1 3769 bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character 3770 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3771 subl(result, cnt2); 3772 jmp(POP_LABEL); 3773 3774 // Setup the registers to start vector comparison loop 3775 bind(COMPARE_WIDE_VECTORS); 3776 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3777 lea(str1, Address(str1, result, scale)); 3778 lea(str2, Address(str2, result, scale)); 3779 } else { 3780 lea(str1, Address(str1, result, scale1)); 3781 lea(str2, Address(str2, result, scale2)); 3782 } 3783 subl(result, stride2); 3784 subl(cnt2, stride2); 3785 jcc(Assembler::zero, COMPARE_WIDE_TAIL); 3786 negptr(result); 3787 3788 // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest) 3789 bind(COMPARE_WIDE_VECTORS_LOOP); 3790 3791 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 3792 cmpl(cnt2, stride2x2); 3793 jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2); 3794 testl(cnt2, stride2x2-1); // cnt2 holds the vector count 3795 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // means we cannot subtract by 0x40 3796 3797 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 3798 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3799 evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit); 3800 evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3801 } else { 3802 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit); 3803 evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3804 } 3805 kortestql(mask, mask); 3806 jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare 3807 addptr(result, stride2x2); // update since we already compared at this addr 3808 subl(cnt2, stride2x2); // and sub the size too 3809 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3); 3810 3811 vpxor(vec1, vec1); 3812 jmpb(COMPARE_WIDE_TAIL); 3813 }//if (VM_Version::supports_avx512vlbw()) 3814 3815 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3816 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3817 vmovdqu(vec1, Address(str1, result, scale)); 3818 vpxor(vec1, Address(str2, result, scale)); 3819 } else { 3820 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit); 3821 vpxor(vec1, Address(str2, result, scale2)); 3822 } 3823 vptest(vec1, vec1); 3824 jcc(Assembler::notZero, VECTOR_NOT_EQUAL); 3825 addptr(result, stride2); 3826 subl(cnt2, stride2); 3827 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP); 3828 // clean upper bits of YMM registers 3829 vpxor(vec1, vec1); 3830 3831 // compare 
wide vectors tail
3832    bind(COMPARE_WIDE_TAIL);
3833    testptr(result, result);
3834    jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3835
3836    movl(result, stride2);
3837    movl(cnt2, result);
3838    negptr(result);
3839    jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3840
3841    // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
3842    bind(VECTOR_NOT_EQUAL);
3843    // clean upper bits of YMM registers
3844    vpxor(vec1, vec1);
3845    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3846      lea(str1, Address(str1, result, scale));
3847      lea(str2, Address(str2, result, scale));
3848    } else {
3849      lea(str1, Address(str1, result, scale1));
3850      lea(str2, Address(str2, result, scale2));
3851    }
3852    jmp(COMPARE_16_CHARS);
3853
3854    // Compare tail chars, length from 1 to 15 chars
3855    bind(COMPARE_TAIL_LONG);
3856    movl(cnt2, result);
3857    cmpl(cnt2, stride);
3858    jcc(Assembler::less, COMPARE_SMALL_STR);
3859
3860    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3861      movdqu(vec1, Address(str1, 0));
3862    } else {
3863      pmovzxbw(vec1, Address(str1, 0));
3864    }
3865    pcmpestri(vec1, Address(str2, 0), pcmpmask);
3866    jcc(Assembler::below, COMPARE_INDEX_CHAR);
3867    subptr(cnt2, stride);
3868    jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3869    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3870      lea(str1, Address(str1, result, scale));
3871      lea(str2, Address(str2, result, scale));
3872    } else {
3873      lea(str1, Address(str1, result, scale1));
3874      lea(str2, Address(str2, result, scale2));
3875    }
3876    negptr(cnt2);
3877    jmpb(WHILE_HEAD_LABEL);
3878
3879    bind(COMPARE_SMALL_STR);
3880  } else if (UseSSE42Intrinsics) {
3881    Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3882    int pcmpmask = 0x19;
3883    // Setup to compare 8-char (16-byte) vectors,
3884    // start from first character again because it has aligned address.
3885 movl(result, cnt2); 3886 andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count 3887 if (ae == StrIntrinsicNode::LL) { 3888 pcmpmask &= ~0x01; 3889 } 3890 jcc(Assembler::zero, COMPARE_TAIL); 3891 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3892 lea(str1, Address(str1, result, scale)); 3893 lea(str2, Address(str2, result, scale)); 3894 } else { 3895 lea(str1, Address(str1, result, scale1)); 3896 lea(str2, Address(str2, result, scale2)); 3897 } 3898 negptr(result); 3899 3900 // pcmpestri 3901 // inputs: 3902 // vec1- substring 3903 // rax - negative string length (elements count) 3904 // mem - scanned string 3905 // rdx - string length (elements count) 3906 // pcmpmask - cmp mode: 11000 (string compare with negated result) 3907 // + 00 (unsigned bytes) or + 01 (unsigned shorts) 3908 // outputs: 3909 // rcx - first mismatched element index 3910 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3911 3912 bind(COMPARE_WIDE_VECTORS); 3913 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3914 movdqu(vec1, Address(str1, result, scale)); 3915 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 3916 } else { 3917 pmovzxbw(vec1, Address(str1, result, scale1)); 3918 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 3919 } 3920 // After pcmpestri cnt1(rcx) contains mismatched element index 3921 3922 jccb(Assembler::below, VECTOR_NOT_EQUAL); // CF==1 3923 addptr(result, stride); 3924 subptr(cnt2, stride); 3925 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 3926 3927 // compare wide vectors tail 3928 testptr(result, result); 3929 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3930 3931 movl(cnt2, stride); 3932 movl(result, stride); 3933 negptr(result); 3934 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3935 movdqu(vec1, Address(str1, result, scale)); 3936 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 3937 } else { 3938 pmovzxbw(vec1, Address(str1, result, scale1)); 3939 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 3940 } 3941 jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL); 3942 3943 // Mismatched characters in the vectors 3944 bind(VECTOR_NOT_EQUAL); 3945 addptr(cnt1, result); 3946 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3947 subl(result, cnt2); 3948 jmpb(POP_LABEL); 3949 3950 bind(COMPARE_TAIL); // limit is zero 3951 movl(cnt2, result); 3952 // Fallthru to tail compare 3953 } 3954 // Shift str2 and str1 to the end of the arrays, negate min 3955 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3956 lea(str1, Address(str1, cnt2, scale)); 3957 lea(str2, Address(str2, cnt2, scale)); 3958 } else { 3959 lea(str1, Address(str1, cnt2, scale1)); 3960 lea(str2, Address(str2, cnt2, scale2)); 3961 } 3962 decrementl(cnt2); // first character was compared already 3963 negptr(cnt2); 3964 3965 // Compare the rest of the elements 3966 bind(WHILE_HEAD_LABEL); 3967 load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae); 3968 subl(result, cnt1); 3969 jccb(Assembler::notZero, POP_LABEL); 3970 increment(cnt2); 3971 jccb(Assembler::notZero, WHILE_HEAD_LABEL); 3972 3973 // Strings are equal up to min length. Return the length difference. 
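  // (The value popped below is the len1 - len2 difference pushed at entry; for
  // UU it is still a byte difference, hence the extra sarl.)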
3974 bind(LENGTH_DIFF_LABEL); 3975 pop(result); 3976 if (ae == StrIntrinsicNode::UU) { 3977 // Divide diff by 2 to get number of chars 3978 sarl(result, 1); 3979 } 3980 jmpb(DONE_LABEL); 3981 3982 if (VM_Version::supports_avx512vlbw()) { 3983 3984 bind(COMPARE_WIDE_VECTORS_LOOP_FAILED); 3985 3986 kmovql(cnt1, mask); 3987 notq(cnt1); 3988 bsfq(cnt2, cnt1); 3989 if (ae != StrIntrinsicNode::LL) { 3990 // Divide diff by 2 to get number of chars 3991 sarl(cnt2, 1); 3992 } 3993 addq(result, cnt2); 3994 if (ae == StrIntrinsicNode::LL) { 3995 load_unsigned_byte(cnt1, Address(str2, result)); 3996 load_unsigned_byte(result, Address(str1, result)); 3997 } else if (ae == StrIntrinsicNode::UU) { 3998 load_unsigned_short(cnt1, Address(str2, result, scale)); 3999 load_unsigned_short(result, Address(str1, result, scale)); 4000 } else { 4001 load_unsigned_short(cnt1, Address(str2, result, scale2)); 4002 load_unsigned_byte(result, Address(str1, result, scale1)); 4003 } 4004 subl(result, cnt1); 4005 jmpb(POP_LABEL); 4006 }//if (VM_Version::supports_avx512vlbw()) 4007 4008 // Discard the stored length difference 4009 bind(POP_LABEL); 4010 pop(cnt1); 4011 4012 // That's it 4013 bind(DONE_LABEL); 4014 if(ae == StrIntrinsicNode::UL) { 4015 negl(result); 4016 } 4017 4018 } 4019 4020 // Search for Non-ASCII character (Negative byte value) in a byte array, 4021 // return the index of the first such character, otherwise the length 4022 // of the array segment searched. 4023 // ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java 4024 // @IntrinsicCandidate 4025 // public static int countPositives(byte[] ba, int off, int len) { 4026 // for (int i = off; i < off + len; i++) { 4027 // if (ba[i] < 0) { 4028 // return i - off; 4029 // } 4030 // } 4031 // return len; 4032 // } 4033 void C2_MacroAssembler::count_positives(Register ary1, Register len, 4034 Register result, Register tmp1, 4035 XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) { 4036 // rsi: byte array 4037 // rcx: len 4038 // rax: result 4039 ShortBranchVerifier sbv(this); 4040 assert_different_registers(ary1, len, result, tmp1); 4041 assert_different_registers(vec1, vec2); 4042 Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE; 4043 4044 movl(result, len); // copy 4045 // len == 0 4046 testl(len, len); 4047 jcc(Assembler::zero, DONE); 4048 4049 if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512 4050 VM_Version::supports_avx512vlbw() && 4051 VM_Version::supports_bmi2()) { 4052 4053 Label test_64_loop, test_tail, BREAK_LOOP; 4054 movl(tmp1, len); 4055 vpxor(vec2, vec2, vec2, Assembler::AVX_512bit); 4056 4057 andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F 4058 andl(len, 0xffffffc0); // vector count (in chars) 4059 jccb(Assembler::zero, test_tail); 4060 4061 lea(ary1, Address(ary1, len, Address::times_1)); 4062 negptr(len); 4063 4064 bind(test_64_loop); 4065 // Check whether our 64 elements of size byte contain negatives 4066 evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit); 4067 kortestql(mask1, mask1); 4068 jcc(Assembler::notZero, BREAK_LOOP); 4069 4070 addptr(len, 64); 4071 jccb(Assembler::notZero, test_64_loop); 4072 4073 bind(test_tail); 4074 // bail out when there is nothing to be done 4075 testl(tmp1, -1); 4076 jcc(Assembler::zero, DONE); 4077 4078 4079 // check the tail for absense of negatives 4080 // ~(~0 << len) applied up to two times (for 32-bit scenario) 4081 { 4082 Register tmp3_aliased = len; 4083 mov64(tmp3_aliased, 
0xFFFFFFFFFFFFFFFF); 4084 shlxq(tmp3_aliased, tmp3_aliased, tmp1); 4085 notq(tmp3_aliased); 4086 kmovql(mask2, tmp3_aliased); 4087 } 4088 4089 evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit); 4090 ktestq(mask1, mask2); 4091 jcc(Assembler::zero, DONE); 4092 4093 // do a full check for negative registers in the tail 4094 movl(len, tmp1); // tmp1 holds low 6-bit from original len; 4095 // ary1 already pointing to the right place 4096 jmpb(TAIL_START); 4097 4098 bind(BREAK_LOOP); 4099 // At least one byte in the last 64 byte block was negative. 4100 // Set up to look at the last 64 bytes as if they were a tail 4101 lea(ary1, Address(ary1, len, Address::times_1)); 4102 addptr(result, len); 4103 // Ignore the very last byte: if all others are positive, 4104 // it must be negative, so we can skip right to the 2+1 byte 4105 // end comparison at this point 4106 orl(result, 63); 4107 movl(len, 63); 4108 // Fallthru to tail compare 4109 } else { 4110 4111 if (UseAVX >= 2 && UseSSE >= 2) { 4112 // With AVX2, use 32-byte vector compare 4113 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 4114 4115 // Compare 32-byte vectors 4116 testl(len, 0xffffffe0); // vector count (in bytes) 4117 jccb(Assembler::zero, TAIL_START); 4118 4119 andl(len, 0xffffffe0); 4120 lea(ary1, Address(ary1, len, Address::times_1)); 4121 negptr(len); 4122 4123 movl(tmp1, 0x80808080); // create mask to test for Unicode chars in vector 4124 movdl(vec2, tmp1); 4125 vpbroadcastd(vec2, vec2, Assembler::AVX_256bit); 4126 4127 bind(COMPARE_WIDE_VECTORS); 4128 vmovdqu(vec1, Address(ary1, len, Address::times_1)); 4129 vptest(vec1, vec2); 4130 jccb(Assembler::notZero, BREAK_LOOP); 4131 addptr(len, 32); 4132 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4133 4134 testl(result, 0x0000001f); // any bytes remaining? 4135 jcc(Assembler::zero, DONE); 4136 4137 // Quick test using the already prepared vector mask 4138 movl(len, result); 4139 andl(len, 0x0000001f); 4140 vmovdqu(vec1, Address(ary1, len, Address::times_1, -32)); 4141 vptest(vec1, vec2); 4142 jcc(Assembler::zero, DONE); 4143 // There are zeros, jump to the tail to determine exactly where 4144 jmpb(TAIL_START); 4145 4146 bind(BREAK_LOOP); 4147 // At least one byte in the last 32-byte vector is negative. 4148 // Set up to look at the last 32 bytes as if they were a tail 4149 lea(ary1, Address(ary1, len, Address::times_1)); 4150 addptr(result, len); 4151 // Ignore the very last byte: if all others are positive, 4152 // it must be negative, so we can skip right to the 2+1 byte 4153 // end comparison at this point 4154 orl(result, 31); 4155 movl(len, 31); 4156 // Fallthru to tail compare 4157 } else if (UseSSE42Intrinsics) { 4158 // With SSE4.2, use double quad vector compare 4159 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 4160 4161 // Compare 16-byte vectors 4162 testl(len, 0xfffffff0); // vector count (in bytes) 4163 jcc(Assembler::zero, TAIL_START); 4164 4165 andl(len, 0xfffffff0); 4166 lea(ary1, Address(ary1, len, Address::times_1)); 4167 negptr(len); 4168 4169 movl(tmp1, 0x80808080); 4170 movdl(vec2, tmp1); 4171 pshufd(vec2, vec2, 0); 4172 4173 bind(COMPARE_WIDE_VECTORS); 4174 movdqu(vec1, Address(ary1, len, Address::times_1)); 4175 ptest(vec1, vec2); 4176 jccb(Assembler::notZero, BREAK_LOOP); 4177 addptr(len, 16); 4178 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4179 4180 testl(result, 0x0000000f); // len is zero, any bytes remaining? 
4181 jcc(Assembler::zero, DONE); 4182 4183 // Quick test using the already prepared vector mask 4184 movl(len, result); 4185 andl(len, 0x0000000f); // tail count (in bytes) 4186 movdqu(vec1, Address(ary1, len, Address::times_1, -16)); 4187 ptest(vec1, vec2); 4188 jcc(Assembler::zero, DONE); 4189 jmpb(TAIL_START); 4190 4191 bind(BREAK_LOOP); 4192 // At least one byte in the last 16-byte vector is negative. 4193 // Set up and look at the last 16 bytes as if they were a tail 4194 lea(ary1, Address(ary1, len, Address::times_1)); 4195 addptr(result, len); 4196 // Ignore the very last byte: if all others are positive, 4197 // it must be negative, so we can skip right to the 2+1 byte 4198 // end comparison at this point 4199 orl(result, 15); 4200 movl(len, 15); 4201 // Fallthru to tail compare 4202 } 4203 } 4204 4205 bind(TAIL_START); 4206 // Compare 4-byte vectors 4207 andl(len, 0xfffffffc); // vector count (in bytes) 4208 jccb(Assembler::zero, COMPARE_CHAR); 4209 4210 lea(ary1, Address(ary1, len, Address::times_1)); 4211 negptr(len); 4212 4213 bind(COMPARE_VECTORS); 4214 movl(tmp1, Address(ary1, len, Address::times_1)); 4215 andl(tmp1, 0x80808080); 4216 jccb(Assembler::notZero, TAIL_ADJUST); 4217 addptr(len, 4); 4218 jccb(Assembler::notZero, COMPARE_VECTORS); 4219 4220 // Compare trailing char (final 2-3 bytes), if any 4221 bind(COMPARE_CHAR); 4222 4223 testl(result, 0x2); // tail char 4224 jccb(Assembler::zero, COMPARE_BYTE); 4225 load_unsigned_short(tmp1, Address(ary1, 0)); 4226 andl(tmp1, 0x00008080); 4227 jccb(Assembler::notZero, CHAR_ADJUST); 4228 lea(ary1, Address(ary1, 2)); 4229 4230 bind(COMPARE_BYTE); 4231 testl(result, 0x1); // tail byte 4232 jccb(Assembler::zero, DONE); 4233 load_unsigned_byte(tmp1, Address(ary1, 0)); 4234 testl(tmp1, 0x00000080); 4235 jccb(Assembler::zero, DONE); 4236 subptr(result, 1); 4237 jmpb(DONE); 4238 4239 bind(TAIL_ADJUST); 4240 // there are negative bits in the last 4 byte block. 4241 // Adjust result and check the next three bytes 4242 addptr(result, len); 4243 orl(result, 3); 4244 lea(ary1, Address(ary1, len, Address::times_1)); 4245 jmpb(COMPARE_CHAR); 4246 4247 bind(CHAR_ADJUST); 4248 // We are looking at a char + optional byte tail, and found that one 4249 // of the bytes in the char is negative. Adjust the result, check the 4250 // first byte and readjust if needed. 4251 andl(result, 0xfffffffc); 4252 testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first 4253 jccb(Assembler::notZero, DONE); 4254 addptr(result, 1); 4255 4256 // That's it 4257 bind(DONE); 4258 if (UseAVX >= 2 && UseSSE >= 2) { 4259 // clean upper bits of YMM registers 4260 vpxor(vec1, vec1); 4261 vpxor(vec2, vec2); 4262 } 4263 } 4264 4265 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings. 4266 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2, 4267 Register limit, Register result, Register chr, 4268 XMMRegister vec1, XMMRegister vec2, bool is_char, 4269 KRegister mask, bool expand_ary2) { 4270 // for expand_ary2, limit is the (smaller) size of the second array. 4271 ShortBranchVerifier sbv(this); 4272 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE; 4273 4274 assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)), 4275 "Expansion only implemented for AVX2"); 4276 4277 int length_offset = arrayOopDesc::length_offset_in_bytes(); 4278 int base_offset = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE); 4279 4280 Address::ScaleFactor scaleFactor = expand_ary2 ? 
Address::times_2 : Address::times_1; 4281 int scaleIncr = expand_ary2 ? 8 : 16; 4282 4283 if (is_array_equ) { 4284 // Check the input args 4285 cmpoop(ary1, ary2); 4286 jcc(Assembler::equal, TRUE_LABEL); 4287 4288 // Need additional checks for arrays_equals. 4289 testptr(ary1, ary1); 4290 jcc(Assembler::zero, FALSE_LABEL); 4291 testptr(ary2, ary2); 4292 jcc(Assembler::zero, FALSE_LABEL); 4293 4294 // Check the lengths 4295 movl(limit, Address(ary1, length_offset)); 4296 cmpl(limit, Address(ary2, length_offset)); 4297 jcc(Assembler::notEqual, FALSE_LABEL); 4298 } 4299 4300 // count == 0 4301 testl(limit, limit); 4302 jcc(Assembler::zero, TRUE_LABEL); 4303 4304 if (is_array_equ) { 4305 // Load array address 4306 lea(ary1, Address(ary1, base_offset)); 4307 lea(ary2, Address(ary2, base_offset)); 4308 } 4309 4310 if (is_array_equ && is_char) { 4311 // arrays_equals when used for char[]. 4312 shll(limit, 1); // byte count != 0 4313 } 4314 movl(result, limit); // copy 4315 4316 if (UseAVX >= 2) { 4317 // With AVX2, use 32-byte vector compare 4318 Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16; 4319 4320 // Compare 32-byte vectors 4321 if (expand_ary2) { 4322 andl(result, 0x0000000f); // tail count (in bytes) 4323 andl(limit, 0xfffffff0); // vector count (in bytes) 4324 jcc(Assembler::zero, COMPARE_TAIL); 4325 } else { 4326 andl(result, 0x0000001f); // tail count (in bytes) 4327 andl(limit, 0xffffffe0); // vector count (in bytes) 4328 jcc(Assembler::zero, COMPARE_TAIL_16); 4329 } 4330 4331 lea(ary1, Address(ary1, limit, scaleFactor)); 4332 lea(ary2, Address(ary2, limit, Address::times_1)); 4333 negptr(limit); 4334 4335 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 4336 Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3; 4337 4338 cmpl(limit, -64); 4339 jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2); 4340 4341 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 4342 4343 evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit); 4344 evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit); 4345 kortestql(mask, mask); 4346 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4347 addptr(limit, 64); // update since we already compared at this addr 4348 cmpl(limit, -64); 4349 jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3); 4350 4351 // At this point we may still need to compare -limit+result bytes. 4352 // We could execute the next two instruction and just continue via non-wide path: 4353 // cmpl(limit, 0); 4354 // jcc(Assembler::equal, COMPARE_TAIL); // true 4355 // But since we stopped at the points ary{1,2}+limit which are 4356 // not farther than 64 bytes from the ends of arrays ary{1,2}+result 4357 // (|limit| <= 32 and result < 32), 4358 // we may just compare the last 64 bytes. 
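      // For example, if the loop stopped with limit == -32 and result == 24, then
      // 56 bytes are still unchecked; the single 64-byte compare below ends at the
      // end of the compared range and covers them, re-checking a few bytes that
      // are already known to be equal.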
4359 // 4360 addptr(result, -64); // it is safe, bc we just came from this area 4361 evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit); 4362 evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit); 4363 kortestql(mask, mask); 4364 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4365 4366 jmp(TRUE_LABEL); 4367 4368 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 4369 4370 }//if (VM_Version::supports_avx512vlbw()) 4371 4372 bind(COMPARE_WIDE_VECTORS); 4373 vmovdqu(vec1, Address(ary1, limit, scaleFactor)); 4374 if (expand_ary2) { 4375 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit); 4376 } else { 4377 vmovdqu(vec2, Address(ary2, limit, Address::times_1)); 4378 } 4379 vpxor(vec1, vec2); 4380 4381 vptest(vec1, vec1); 4382 jcc(Assembler::notZero, FALSE_LABEL); 4383 addptr(limit, scaleIncr * 2); 4384 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4385 4386 testl(result, result); 4387 jcc(Assembler::zero, TRUE_LABEL); 4388 4389 vmovdqu(vec1, Address(ary1, result, scaleFactor, -32)); 4390 if (expand_ary2) { 4391 vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit); 4392 } else { 4393 vmovdqu(vec2, Address(ary2, result, Address::times_1, -32)); 4394 } 4395 vpxor(vec1, vec2); 4396 4397 vptest(vec1, vec1); 4398 jcc(Assembler::notZero, FALSE_LABEL); 4399 jmp(TRUE_LABEL); 4400 4401 bind(COMPARE_TAIL_16); // limit is zero 4402 movl(limit, result); 4403 4404 // Compare 16-byte chunks 4405 andl(result, 0x0000000f); // tail count (in bytes) 4406 andl(limit, 0xfffffff0); // vector count (in bytes) 4407 jcc(Assembler::zero, COMPARE_TAIL); 4408 4409 lea(ary1, Address(ary1, limit, scaleFactor)); 4410 lea(ary2, Address(ary2, limit, Address::times_1)); 4411 negptr(limit); 4412 4413 bind(COMPARE_WIDE_VECTORS_16); 4414 movdqu(vec1, Address(ary1, limit, scaleFactor)); 4415 if (expand_ary2) { 4416 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit); 4417 } else { 4418 movdqu(vec2, Address(ary2, limit, Address::times_1)); 4419 } 4420 pxor(vec1, vec2); 4421 4422 ptest(vec1, vec1); 4423 jcc(Assembler::notZero, FALSE_LABEL); 4424 addptr(limit, scaleIncr); 4425 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16); 4426 4427 bind(COMPARE_TAIL); // limit is zero 4428 movl(limit, result); 4429 // Fallthru to tail compare 4430 } else if (UseSSE42Intrinsics) { 4431 // With SSE4.2, use double quad vector compare 4432 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 4433 4434 // Compare 16-byte vectors 4435 andl(result, 0x0000000f); // tail count (in bytes) 4436 andl(limit, 0xfffffff0); // vector count (in bytes) 4437 jcc(Assembler::zero, COMPARE_TAIL); 4438 4439 lea(ary1, Address(ary1, limit, Address::times_1)); 4440 lea(ary2, Address(ary2, limit, Address::times_1)); 4441 negptr(limit); 4442 4443 bind(COMPARE_WIDE_VECTORS); 4444 movdqu(vec1, Address(ary1, limit, Address::times_1)); 4445 movdqu(vec2, Address(ary2, limit, Address::times_1)); 4446 pxor(vec1, vec2); 4447 4448 ptest(vec1, vec1); 4449 jcc(Assembler::notZero, FALSE_LABEL); 4450 addptr(limit, 16); 4451 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4452 4453 testl(result, result); 4454 jcc(Assembler::zero, TRUE_LABEL); 4455 4456 movdqu(vec1, Address(ary1, result, Address::times_1, -16)); 4457 movdqu(vec2, Address(ary2, result, Address::times_1, -16)); 4458 pxor(vec1, vec2); 4459 4460 ptest(vec1, vec1); 4461 jccb(Assembler::notZero, FALSE_LABEL); 4462 jmpb(TRUE_LABEL); 4463 4464 bind(COMPARE_TAIL); // limit is zero 4465 movl(limit, 
result); 4466 // Fallthru to tail compare 4467 } 4468 4469 // Compare 4-byte vectors 4470 if (expand_ary2) { 4471 testl(result, result); 4472 jccb(Assembler::zero, TRUE_LABEL); 4473 } else { 4474 andl(limit, 0xfffffffc); // vector count (in bytes) 4475 jccb(Assembler::zero, COMPARE_CHAR); 4476 } 4477 4478 lea(ary1, Address(ary1, limit, scaleFactor)); 4479 lea(ary2, Address(ary2, limit, Address::times_1)); 4480 negptr(limit); 4481 4482 bind(COMPARE_VECTORS); 4483 if (expand_ary2) { 4484 // There are no "vector" operations for bytes to shorts 4485 movzbl(chr, Address(ary2, limit, Address::times_1)); 4486 cmpw(Address(ary1, limit, Address::times_2), chr); 4487 jccb(Assembler::notEqual, FALSE_LABEL); 4488 addptr(limit, 1); 4489 jcc(Assembler::notZero, COMPARE_VECTORS); 4490 jmp(TRUE_LABEL); 4491 } else { 4492 movl(chr, Address(ary1, limit, Address::times_1)); 4493 cmpl(chr, Address(ary2, limit, Address::times_1)); 4494 jccb(Assembler::notEqual, FALSE_LABEL); 4495 addptr(limit, 4); 4496 jcc(Assembler::notZero, COMPARE_VECTORS); 4497 } 4498 4499 // Compare trailing char (final 2 bytes), if any 4500 bind(COMPARE_CHAR); 4501 testl(result, 0x2); // tail char 4502 jccb(Assembler::zero, COMPARE_BYTE); 4503 load_unsigned_short(chr, Address(ary1, 0)); 4504 load_unsigned_short(limit, Address(ary2, 0)); 4505 cmpl(chr, limit); 4506 jccb(Assembler::notEqual, FALSE_LABEL); 4507 4508 if (is_array_equ && is_char) { 4509 bind(COMPARE_BYTE); 4510 } else { 4511 lea(ary1, Address(ary1, 2)); 4512 lea(ary2, Address(ary2, 2)); 4513 4514 bind(COMPARE_BYTE); 4515 testl(result, 0x1); // tail byte 4516 jccb(Assembler::zero, TRUE_LABEL); 4517 load_unsigned_byte(chr, Address(ary1, 0)); 4518 load_unsigned_byte(limit, Address(ary2, 0)); 4519 cmpl(chr, limit); 4520 jccb(Assembler::notEqual, FALSE_LABEL); 4521 } 4522 bind(TRUE_LABEL); 4523 movl(result, 1); // return true 4524 jmpb(DONE); 4525 4526 bind(FALSE_LABEL); 4527 xorl(result, result); // return false 4528 4529 // That's it 4530 bind(DONE); 4531 if (UseAVX >= 2) { 4532 // clean upper bits of YMM registers 4533 vpxor(vec1, vec1); 4534 vpxor(vec2, vec2); 4535 } 4536 } 4537 4538 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) { 4539 #define __ masm. 
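// Slow path for convertF2I: the fast-path cvttss2si/cvttsd2si conversion yields
// the "integer indefinite" value (min_jint / min_jlong) for NaN and out-of-range
// inputs, and the fixup stub then recomputes the result with Java cast semantics.
// Roughly (illustrative sketch only):
//   if (x != x)                 result = 0;                     // NaN
//   else if (x too large)       result = max_jint / max_jlong;
//   else if (x too small)       result = min_jint / min_jlong;
//   else                        result = x truncated toward zero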
4540 Register dst = stub.data<0>(); 4541 XMMRegister src = stub.data<1>(); 4542 address target = stub.data<2>(); 4543 __ bind(stub.entry()); 4544 __ subptr(rsp, 8); 4545 __ movdbl(Address(rsp), src); 4546 __ call(RuntimeAddress(target)); 4547 __ pop(dst); 4548 __ jmp(stub.continuation()); 4549 #undef __ 4550 } 4551 4552 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) { 4553 assert(dst_bt == T_INT || dst_bt == T_LONG, ""); 4554 assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, ""); 4555 4556 address slowpath_target; 4557 if (dst_bt == T_INT) { 4558 if (src_bt == T_FLOAT) { 4559 cvttss2sil(dst, src); 4560 cmpl(dst, 0x80000000); 4561 slowpath_target = StubRoutines::x86::f2i_fixup(); 4562 } else { 4563 cvttsd2sil(dst, src); 4564 cmpl(dst, 0x80000000); 4565 slowpath_target = StubRoutines::x86::d2i_fixup(); 4566 } 4567 } else { 4568 if (src_bt == T_FLOAT) { 4569 cvttss2siq(dst, src); 4570 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4571 slowpath_target = StubRoutines::x86::f2l_fixup(); 4572 } else { 4573 cvttsd2siq(dst, src); 4574 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4575 slowpath_target = StubRoutines::x86::d2l_fixup(); 4576 } 4577 } 4578 4579 auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, 23, convertF2I_slowpath); 4580 jcc(Assembler::equal, stub->entry()); 4581 bind(stub->continuation()); 4582 } 4583 4584 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4585 XMMRegister src1, int imm8, bool merge, int vlen_enc) { 4586 switch(ideal_opc) { 4587 case Op_LShiftVS: 4588 Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break; 4589 case Op_LShiftVI: 4590 Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break; 4591 case Op_LShiftVL: 4592 Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break; 4593 case Op_RShiftVS: 4594 Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break; 4595 case Op_RShiftVI: 4596 Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break; 4597 case Op_RShiftVL: 4598 Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break; 4599 case Op_URShiftVS: 4600 Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break; 4601 case Op_URShiftVI: 4602 Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break; 4603 case Op_URShiftVL: 4604 Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break; 4605 case Op_RotateRightV: 4606 evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4607 case Op_RotateLeftV: 4608 evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4609 default: 4610 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]); 4611 break; 4612 } 4613 } 4614 4615 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1, 4616 XMMRegister src2, bool is_unsigned, bool merge, int vlen_enc) { 4617 if (is_unsigned) { 4618 evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4619 } else { 4620 evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4621 } 4622 } 4623 4624 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4625 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) { 4626 switch (elem_bt) { 4627 case T_BYTE: 4628 if (ideal_opc == Op_SaturatingAddV) { 
4629 evpaddsb(dst, mask, src1, src2, merge, vlen_enc); 4630 } else { 4631 assert(ideal_opc == Op_SaturatingSubV, ""); 4632 evpsubsb(dst, mask, src1, src2, merge, vlen_enc); 4633 } 4634 break; 4635 case T_SHORT: 4636 if (ideal_opc == Op_SaturatingAddV) { 4637 evpaddsw(dst, mask, src1, src2, merge, vlen_enc); 4638 } else { 4639 assert(ideal_opc == Op_SaturatingSubV, ""); 4640 evpsubsw(dst, mask, src1, src2, merge, vlen_enc); 4641 } 4642 break; 4643 default: 4644 fatal("Unsupported type %s", type2name(elem_bt)); 4645 break; 4646 } 4647 } 4648 4649 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4650 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) { 4651 switch (elem_bt) { 4652 case T_BYTE: 4653 if (ideal_opc == Op_SaturatingAddV) { 4654 evpaddusb(dst, mask, src1, src2, merge, vlen_enc); 4655 } else { 4656 assert(ideal_opc == Op_SaturatingSubV, ""); 4657 evpsubusb(dst, mask, src1, src2, merge, vlen_enc); 4658 } 4659 break; 4660 case T_SHORT: 4661 if (ideal_opc == Op_SaturatingAddV) { 4662 evpaddusw(dst, mask, src1, src2, merge, vlen_enc); 4663 } else { 4664 assert(ideal_opc == Op_SaturatingSubV, ""); 4665 evpsubusw(dst, mask, src1, src2, merge, vlen_enc); 4666 } 4667 break; 4668 default: 4669 fatal("Unsupported type %s", type2name(elem_bt)); 4670 break; 4671 } 4672 } 4673 4674 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1, 4675 Address src2, bool is_unsigned, bool merge, int vlen_enc) { 4676 if (is_unsigned) { 4677 evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4678 } else { 4679 evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc); 4680 } 4681 } 4682 4683 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4684 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4685 switch (elem_bt) { 4686 case T_BYTE: 4687 if (ideal_opc == Op_SaturatingAddV) { 4688 evpaddsb(dst, mask, src1, src2, merge, vlen_enc); 4689 } else { 4690 assert(ideal_opc == Op_SaturatingSubV, ""); 4691 evpsubsb(dst, mask, src1, src2, merge, vlen_enc); 4692 } 4693 break; 4694 case T_SHORT: 4695 if (ideal_opc == Op_SaturatingAddV) { 4696 evpaddsw(dst, mask, src1, src2, merge, vlen_enc); 4697 } else { 4698 assert(ideal_opc == Op_SaturatingSubV, ""); 4699 evpsubsw(dst, mask, src1, src2, merge, vlen_enc); 4700 } 4701 break; 4702 default: 4703 fatal("Unsupported type %s", type2name(elem_bt)); 4704 break; 4705 } 4706 } 4707 4708 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, 4709 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4710 switch (elem_bt) { 4711 case T_BYTE: 4712 if (ideal_opc == Op_SaturatingAddV) { 4713 evpaddusb(dst, mask, src1, src2, merge, vlen_enc); 4714 } else { 4715 assert(ideal_opc == Op_SaturatingSubV, ""); 4716 evpsubusb(dst, mask, src1, src2, merge, vlen_enc); 4717 } 4718 break; 4719 case T_SHORT: 4720 if (ideal_opc == Op_SaturatingAddV) { 4721 evpaddusw(dst, mask, src1, src2, merge, vlen_enc); 4722 } else { 4723 assert(ideal_opc == Op_SaturatingSubV, ""); 4724 evpsubusw(dst, mask, src1, src2, merge, vlen_enc); 4725 } 4726 break; 4727 default: 4728 fatal("Unsupported type %s", type2name(elem_bt)); 4729 break; 4730 } 4731 } 4732 4733 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, 
KRegister mask, XMMRegister dst, 4734 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc, 4735 bool is_varshift) { 4736 switch (ideal_opc) { 4737 case Op_AddVB: 4738 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4739 case Op_AddVS: 4740 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4741 case Op_AddVI: 4742 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4743 case Op_AddVL: 4744 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4745 case Op_AddVF: 4746 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4747 case Op_AddVD: 4748 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4749 case Op_SubVB: 4750 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4751 case Op_SubVS: 4752 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4753 case Op_SubVI: 4754 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4755 case Op_SubVL: 4756 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4757 case Op_SubVF: 4758 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4759 case Op_SubVD: 4760 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4761 case Op_MulVS: 4762 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4763 case Op_MulVI: 4764 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4765 case Op_MulVL: 4766 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4767 case Op_MulVF: 4768 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4769 case Op_MulVD: 4770 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4771 case Op_DivVF: 4772 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4773 case Op_DivVD: 4774 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4775 case Op_SqrtVF: 4776 evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break; 4777 case Op_SqrtVD: 4778 evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break; 4779 case Op_AbsVB: 4780 evpabsb(dst, mask, src2, merge, vlen_enc); break; 4781 case Op_AbsVS: 4782 evpabsw(dst, mask, src2, merge, vlen_enc); break; 4783 case Op_AbsVI: 4784 evpabsd(dst, mask, src2, merge, vlen_enc); break; 4785 case Op_AbsVL: 4786 evpabsq(dst, mask, src2, merge, vlen_enc); break; 4787 case Op_FmaVF: 4788 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4789 case Op_FmaVD: 4790 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4791 case Op_VectorRearrange: 4792 evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break; 4793 case Op_LShiftVS: 4794 evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4795 case Op_LShiftVI: 4796 evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4797 case Op_LShiftVL: 4798 evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4799 case Op_RShiftVS: 4800 evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4801 case Op_RShiftVI: 4802 evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4803 case Op_RShiftVL: 4804 evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4805 case Op_URShiftVS: 4806 evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4807 case Op_URShiftVI: 4808 evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4809 case Op_URShiftVL: 4810 evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4811 case Op_RotateLeftV: 4812 evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4813 case Op_RotateRightV: 4814 evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4815 case Op_MaxV: 4816 evpmaxs(eType, dst, mask, src1, src2, 
merge, vlen_enc); break; 4817 case Op_MinV: 4818 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4819 case Op_UMinV: 4820 evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4821 case Op_UMaxV: 4822 evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4823 case Op_XorV: 4824 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4825 case Op_OrV: 4826 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4827 case Op_AndV: 4828 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4829 default: 4830 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]); 4831 break; 4832 } 4833 } 4834 4835 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4836 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4837 switch (ideal_opc) { 4838 case Op_AddVB: 4839 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4840 case Op_AddVS: 4841 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4842 case Op_AddVI: 4843 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4844 case Op_AddVL: 4845 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4846 case Op_AddVF: 4847 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4848 case Op_AddVD: 4849 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4850 case Op_SubVB: 4851 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4852 case Op_SubVS: 4853 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4854 case Op_SubVI: 4855 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4856 case Op_SubVL: 4857 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4858 case Op_SubVF: 4859 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4860 case Op_SubVD: 4861 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4862 case Op_MulVS: 4863 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4864 case Op_MulVI: 4865 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4866 case Op_MulVL: 4867 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4868 case Op_MulVF: 4869 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4870 case Op_MulVD: 4871 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4872 case Op_DivVF: 4873 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4874 case Op_DivVD: 4875 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4876 case Op_FmaVF: 4877 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4878 case Op_FmaVD: 4879 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4880 case Op_MaxV: 4881 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4882 case Op_MinV: 4883 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4884 case Op_UMaxV: 4885 evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4886 case Op_UMinV: 4887 evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4888 case Op_XorV: 4889 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4890 case Op_OrV: 4891 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4892 case Op_AndV: 4893 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4894 default: 4895 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]); 4896 break; 4897 } 4898 } 4899 4900 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst, 4901 KRegister src1, KRegister src2) { 4902 BasicType etype = T_ILLEGAL; 4903 switch(mask_len) { 4904 case 2: 4905 case 4: 4906 case 8: etype = T_BYTE; break; 4907 case 16: etype = T_SHORT; 
break; 4908 case 32: etype = T_INT; break; 4909 case 64: etype = T_LONG; break; 4910 default: fatal("Unsupported type"); break; 4911 } 4912 assert(etype != T_ILLEGAL, ""); 4913 switch(ideal_opc) { 4914 case Op_AndVMask: 4915 kand(etype, dst, src1, src2); break; 4916 case Op_OrVMask: 4917 kor(etype, dst, src1, src2); break; 4918 case Op_XorVMask: 4919 kxor(etype, dst, src1, src2); break; 4920 default: 4921 fatal("Unsupported masked operation"); break; 4922 } 4923 } 4924 4925 /* 4926 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 4927 * If src is NaN, the result is 0. 4928 * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE, 4929 * the result is equal to the value of Integer.MIN_VALUE. 4930 * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE, 4931 * the result is equal to the value of Integer.MAX_VALUE. 4932 */ 4933 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4934 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 4935 Register rscratch, AddressLiteral float_sign_flip, 4936 int vec_enc) { 4937 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4938 Label done; 4939 vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch); 4940 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 4941 vptest(xtmp2, xtmp2, vec_enc); 4942 jccb(Assembler::equal, done); 4943 4944 vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc); 4945 vpxor(xtmp1, xtmp1, xtmp4, vec_enc); 4946 4947 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 4948 vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc); 4949 vblendvps(dst, dst, xtmp4, xtmp3, vec_enc); 4950 4951 // Recompute the mask for remaining special value. 4952 vpxor(xtmp2, xtmp2, xtmp3, vec_enc); 4953 // Extract SRC values corresponding to TRUE mask lanes. 4954 vpand(xtmp4, xtmp2, src, vec_enc); 4955 // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special 4956 // values are set. 
4957 vpxor(xtmp3, xtmp2, xtmp4, vec_enc); 4958 4959 vblendvps(dst, dst, xtmp1, xtmp3, vec_enc); 4960 bind(done); 4961 } 4962 4963 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4964 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4965 Register rscratch, AddressLiteral float_sign_flip, 4966 int vec_enc) { 4967 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4968 Label done; 4969 evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 4970 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 4971 kortestwl(ktmp1, ktmp1); 4972 jccb(Assembler::equal, done); 4973 4974 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4975 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4976 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 4977 4978 kxorwl(ktmp1, ktmp1, ktmp2); 4979 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4980 vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4981 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 4982 bind(done); 4983 } 4984 4985 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4986 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4987 Register rscratch, AddressLiteral double_sign_flip, 4988 int vec_enc) { 4989 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 4990 4991 Label done; 4992 evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 4993 Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc); 4994 kortestwl(ktmp1, ktmp1); 4995 jccb(Assembler::equal, done); 4996 4997 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4998 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4999 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 5000 5001 kxorwl(ktmp1, ktmp1, ktmp2); 5002 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 5003 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 5004 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); 5005 bind(done); 5006 } 5007 5008 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5009 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 5010 Register rscratch, AddressLiteral float_sign_flip, 5011 int vec_enc) { 5012 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 5013 Label done; 5014 evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 5015 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 5016 kortestwl(ktmp1, ktmp1); 5017 jccb(Assembler::equal, done); 5018 5019 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5020 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 5021 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 5022 5023 kxorwl(ktmp1, ktmp1, ktmp2); 5024 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 5025 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 5026 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 5027 bind(done); 5028 } 5029 5030 /* 5031 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 5032 * If src is NaN, the result is 0. 5033 * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE, 5034 * the result is equal to the value of Long.MIN_VALUE. 5035 * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE, 5036 * the result is equal to the value of Long.MAX_VALUE. 
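 * Illustrative examples of the saturation described above (standard Java double-to-long narrowing):
 *   (long) Double.NaN               == 0L
 *   (long) Double.NEGATIVE_INFINITY == Long.MIN_VALUE
 *   (long) 1.0E30                   == Long.MAX_VALUE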
5037 */ 5038 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5039 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 5040 Register rscratch, AddressLiteral double_sign_flip, 5041 int vec_enc) { 5042 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 5043 5044 Label done; 5045 evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 5046 evpcmpeqq(ktmp1, xtmp1, dst, vec_enc); 5047 kortestwl(ktmp1, ktmp1); 5048 jccb(Assembler::equal, done); 5049 5050 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5051 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 5052 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 5053 5054 kxorwl(ktmp1, ktmp1, ktmp2); 5055 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 5056 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 5057 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); 5058 bind(done); 5059 } 5060 5061 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero, 5062 XMMRegister xtmp, int index, int vec_enc) { 5063 assert(vec_enc < Assembler::AVX_512bit, ""); 5064 if (vec_enc == Assembler::AVX_256bit) { 5065 vextractf128_high(xtmp, src); 5066 vshufps(dst, src, xtmp, index, vec_enc); 5067 } else { 5068 vshufps(dst, src, zero, index, vec_enc); 5069 } 5070 } 5071 5072 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 5073 XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch, 5074 AddressLiteral float_sign_flip, int src_vec_enc) { 5075 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 5076 5077 Label done; 5078 // Compare the destination lanes with float_sign_flip 5079 // value to get mask for all special values. 5080 movdqu(xtmp1, float_sign_flip, rscratch); 5081 vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit); 5082 ptest(xtmp2, xtmp2); 5083 jccb(Assembler::equal, done); 5084 5085 // Flip float_sign_flip to get max integer value. 5086 vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit); 5087 pxor(xtmp1, xtmp4); 5088 5089 // Set destination lanes corresponding to unordered source lanes to zero. 5090 vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc); 5091 vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc); 5092 5093 // Shuffle mask vector and pack the lower double word from each quadword lane. 5094 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc); 5095 vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit); 5096 5097 // Recompute the mask for the remaining special values. 5098 pxor(xtmp2, xtmp3); 5099 // Extract mask corresponding to non-negative source lanes. 5100 vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc); 5101 5102 // Shuffle mask vector and pack the lower double word from each quadword lane. 5103 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc); 5104 pand(xtmp3, xtmp2); 5105 5106 // Replace destination lanes holding the special value (0x80000000) with max int 5107 // if the corresponding source lane holds a +ve value.
5108 vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit); 5109 bind(done); 5110 } 5111 5112 5113 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero, 5114 XMMRegister xtmp, Register rscratch, int vec_enc) { 5115 switch(to_elem_bt) { 5116 case T_SHORT: 5117 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing"); 5118 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch); 5119 vpackusdw(dst, dst, zero, vec_enc); 5120 if (vec_enc == Assembler::AVX_256bit) { 5121 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc); 5122 } 5123 break; 5124 case T_BYTE: 5125 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing"); 5126 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch); 5127 vpackusdw(dst, dst, zero, vec_enc); 5128 if (vec_enc == Assembler::AVX_256bit) { 5129 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc); 5130 } 5131 vpackuswb(dst, dst, zero, vec_enc); 5132 break; 5133 default: assert(false, "%s", type2name(to_elem_bt)); 5134 } 5135 } 5136 5137 /* 5138 * Algorithm for vector D2L and F2I conversions:- 5139 * a) Perform vector D2L/F2I cast. 5140 * b) Choose fast path if none of the result vector lane contains 0x80000000 value. 5141 * It signifies that source value could be any of the special floating point 5142 * values(NaN,-Inf,Inf,Max,-Min). 5143 * c) Set destination to zero if source is NaN value. 5144 * d) Replace 0x80000000 with MaxInt if source lane contains a +ve value. 5145 */ 5146 5147 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5148 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 5149 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 5150 int to_elem_sz = type2aelembytes(to_elem_bt); 5151 assert(to_elem_sz <= 4, ""); 5152 vcvttps2dq(dst, src, vec_enc); 5153 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc); 5154 if (to_elem_sz < 4) { 5155 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 5156 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc); 5157 } 5158 } 5159 5160 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5161 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip, 5162 Register rscratch, int vec_enc) { 5163 int to_elem_sz = type2aelembytes(to_elem_bt); 5164 assert(to_elem_sz <= 4, ""); 5165 vcvttps2dq(dst, src, vec_enc); 5166 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc); 5167 switch(to_elem_bt) { 5168 case T_INT: 5169 break; 5170 case T_SHORT: 5171 evpmovdw(dst, dst, vec_enc); 5172 break; 5173 case T_BYTE: 5174 evpmovdb(dst, dst, vec_enc); 5175 break; 5176 default: assert(false, "%s", type2name(to_elem_bt)); 5177 } 5178 } 5179 5180 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 5181 KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip, 5182 Register rscratch, int vec_enc) { 5183 evcvttps2qq(dst, src, vec_enc); 5184 vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, 
vec_enc); 5185 } 5186 5187 // Handling for downcasting from double to integer or sub-word types on AVX2. 5188 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5189 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, 5190 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 5191 int to_elem_sz = type2aelembytes(to_elem_bt); 5192 assert(to_elem_sz < 8, ""); 5193 vcvttpd2dq(dst, src, vec_enc); 5194 vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch, 5195 float_sign_flip, vec_enc); 5196 if (to_elem_sz < 4) { 5197 // xtmp4 holds all zero lanes. 5198 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit); 5199 } 5200 } 5201 5202 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, 5203 XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, 5204 KRegister ktmp2, AddressLiteral sign_flip, 5205 Register rscratch, int vec_enc) { 5206 if (VM_Version::supports_avx512dq()) { 5207 evcvttpd2qq(dst, src, vec_enc); 5208 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 5209 switch(to_elem_bt) { 5210 case T_LONG: 5211 break; 5212 case T_INT: 5213 evpmovsqd(dst, dst, vec_enc); 5214 break; 5215 case T_SHORT: 5216 evpmovsqd(dst, dst, vec_enc); 5217 evpmovdw(dst, dst, vec_enc); 5218 break; 5219 case T_BYTE: 5220 evpmovsqd(dst, dst, vec_enc); 5221 evpmovdb(dst, dst, vec_enc); 5222 break; 5223 default: assert(false, "%s", type2name(to_elem_bt)); 5224 } 5225 } else { 5226 assert(type2aelembytes(to_elem_bt) <= 4, ""); 5227 vcvttpd2dq(dst, src, vec_enc); 5228 vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 5229 switch(to_elem_bt) { 5230 case T_INT: 5231 break; 5232 case T_SHORT: 5233 evpmovdw(dst, dst, vec_enc); 5234 break; 5235 case T_BYTE: 5236 evpmovdb(dst, dst, vec_enc); 5237 break; 5238 default: assert(false, "%s", type2name(to_elem_bt)); 5239 } 5240 } 5241 } 5242 5243 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src, 5244 AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5245 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 5246 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5247 // and re-instantiate original MXCSR.RC mode after that. 5248 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5249 5250 mov64(tmp, julong_cast(0.5L)); 5251 evpbroadcastq(xtmp1, tmp, vec_enc); 5252 vaddpd(xtmp1, src , xtmp1, vec_enc); 5253 evcvtpd2qq(dst, xtmp1, vec_enc); 5254 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 5255 double_sign_flip, vec_enc);; 5256 5257 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5258 } 5259 5260 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src, 5261 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5262 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 5263 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5264 // and re-instantiate original MXCSR.RC mode after that. 
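// Why this works: with MXCSR.RC set to round-toward-negative-infinity the vector convert below
// behaves like floor(), so converting (val + 0.5) yields floor(val + 0.5), which matches the
// Math.round definition (e.g. 2.5f -> 3, -2.5f -> -2). Special values are patched up afterwards.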
5265 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5266 5267 movl(tmp, jint_cast(0.5)); 5268 movq(xtmp1, tmp); 5269 vbroadcastss(xtmp1, xtmp1, vec_enc); 5270 vaddps(xtmp1, src , xtmp1, vec_enc); 5271 vcvtps2dq(dst, xtmp1, vec_enc); 5272 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 5273 float_sign_flip, vec_enc); 5274 5275 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5276 } 5277 5278 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src, 5279 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5280 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) { 5281 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5282 // and re-instantiate original MXCSR.RC mode after that. 5283 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5284 5285 movl(tmp, jint_cast(0.5)); 5286 movq(xtmp1, tmp); 5287 vbroadcastss(xtmp1, xtmp1, vec_enc); 5288 vaddps(xtmp1, src , xtmp1, vec_enc); 5289 vcvtps2dq(dst, xtmp1, vec_enc); 5290 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc); 5291 5292 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5293 } 5294 5295 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 5296 BasicType from_elem_bt, BasicType to_elem_bt) { 5297 switch (from_elem_bt) { 5298 case T_BYTE: 5299 switch (to_elem_bt) { 5300 case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break; 5301 case T_INT: vpmovzxbd(dst, src, vlen_enc); break; 5302 case T_LONG: vpmovzxbq(dst, src, vlen_enc); break; 5303 default: ShouldNotReachHere(); 5304 } 5305 break; 5306 case T_SHORT: 5307 switch (to_elem_bt) { 5308 case T_INT: vpmovzxwd(dst, src, vlen_enc); break; 5309 case T_LONG: vpmovzxwq(dst, src, vlen_enc); break; 5310 default: ShouldNotReachHere(); 5311 } 5312 break; 5313 case T_INT: 5314 assert(to_elem_bt == T_LONG, ""); 5315 vpmovzxdq(dst, src, vlen_enc); 5316 break; 5317 default: 5318 ShouldNotReachHere(); 5319 } 5320 } 5321 5322 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 5323 BasicType from_elem_bt, BasicType to_elem_bt) { 5324 switch (from_elem_bt) { 5325 case T_BYTE: 5326 switch (to_elem_bt) { 5327 case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break; 5328 case T_INT: vpmovsxbd(dst, src, vlen_enc); break; 5329 case T_LONG: vpmovsxbq(dst, src, vlen_enc); break; 5330 default: ShouldNotReachHere(); 5331 } 5332 break; 5333 case T_SHORT: 5334 switch (to_elem_bt) { 5335 case T_INT: vpmovsxwd(dst, src, vlen_enc); break; 5336 case T_LONG: vpmovsxwq(dst, src, vlen_enc); break; 5337 default: ShouldNotReachHere(); 5338 } 5339 break; 5340 case T_INT: 5341 assert(to_elem_bt == T_LONG, ""); 5342 vpmovsxdq(dst, src, vlen_enc); 5343 break; 5344 default: 5345 ShouldNotReachHere(); 5346 } 5347 } 5348 5349 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src, 5350 BasicType dst_bt, BasicType src_bt, int vlen) { 5351 int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen); 5352 assert(vlen_enc != AVX_512bit, ""); 5353 5354 int dst_bt_size = type2aelembytes(dst_bt); 5355 int src_bt_size = type2aelembytes(src_bt); 5356 if (dst_bt_size > src_bt_size) { 5357 switch (dst_bt_size / src_bt_size) { 5358 case 2: vpmovsxbw(dst, src, vlen_enc); break; 5359 case 4: vpmovsxbd(dst, src, vlen_enc); break; 5360 
case 8: vpmovsxbq(dst, src, vlen_enc); break; 5361 default: ShouldNotReachHere(); 5362 } 5363 } else { 5364 assert(dst_bt_size < src_bt_size, ""); 5365 switch (src_bt_size / dst_bt_size) { 5366 case 2: { 5367 if (vlen_enc == AVX_128bit) { 5368 vpacksswb(dst, src, src, vlen_enc); 5369 } else { 5370 vpacksswb(dst, src, src, vlen_enc); 5371 vpermq(dst, dst, 0x08, vlen_enc); 5372 } 5373 break; 5374 } 5375 case 4: { 5376 if (vlen_enc == AVX_128bit) { 5377 vpackssdw(dst, src, src, vlen_enc); 5378 vpacksswb(dst, dst, dst, vlen_enc); 5379 } else { 5380 vpackssdw(dst, src, src, vlen_enc); 5381 vpermq(dst, dst, 0x08, vlen_enc); 5382 vpacksswb(dst, dst, dst, AVX_128bit); 5383 } 5384 break; 5385 } 5386 case 8: { 5387 if (vlen_enc == AVX_128bit) { 5388 vpshufd(dst, src, 0x08, vlen_enc); 5389 vpackssdw(dst, dst, dst, vlen_enc); 5390 vpacksswb(dst, dst, dst, vlen_enc); 5391 } else { 5392 vpshufd(dst, src, 0x08, vlen_enc); 5393 vpermq(dst, dst, 0x08, vlen_enc); 5394 vpackssdw(dst, dst, dst, AVX_128bit); 5395 vpacksswb(dst, dst, dst, AVX_128bit); 5396 } 5397 break; 5398 } 5399 default: ShouldNotReachHere(); 5400 } 5401 } 5402 } 5403 5404 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3, 5405 bool merge, BasicType bt, int vlen_enc) { 5406 if (bt == T_INT) { 5407 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5408 } else { 5409 assert(bt == T_LONG, ""); 5410 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5411 } 5412 } 5413 5414 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3, 5415 bool merge, BasicType bt, int vlen_enc) { 5416 if (bt == T_INT) { 5417 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5418 } else { 5419 assert(bt == T_LONG, ""); 5420 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5421 } 5422 } 5423 5424 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1, 5425 Register rtmp2, XMMRegister xtmp, int mask_len, 5426 int vec_enc) { 5427 int index = 0; 5428 int vindex = 0; 5429 mov64(rtmp1, 0x0101010101010101L); 5430 pdepq(rtmp1, src, rtmp1); 5431 if (mask_len > 8) { 5432 movq(rtmp2, src); 5433 vpxor(xtmp, xtmp, xtmp, vec_enc); 5434 movq(xtmp, rtmp1); 5435 } 5436 movq(dst, rtmp1); 5437 5438 mask_len -= 8; 5439 while (mask_len > 0) { 5440 assert ((mask_len & 0x7) == 0, "mask must be multiple of 8"); 5441 index++; 5442 if ((index % 2) == 0) { 5443 pxor(xtmp, xtmp); 5444 } 5445 mov64(rtmp1, 0x0101010101010101L); 5446 shrq(rtmp2, 8); 5447 pdepq(rtmp1, rtmp2, rtmp1); 5448 pinsrq(xtmp, rtmp1, index % 2); 5449 vindex = index / 2; 5450 if (vindex) { 5451 // Write entire 16 byte vector when both 64 bit 5452 // lanes are update to save redundant instructions. 
5453 if (index % 2) { 5454 vinsertf128(dst, dst, xtmp, vindex); 5455 } 5456 } else { 5457 vmovdqu(dst, xtmp); 5458 } 5459 mask_len -= 8; 5460 } 5461 } 5462 5463 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) { 5464 switch(opc) { 5465 case Op_VectorMaskTrueCount: 5466 popcntq(dst, tmp); 5467 break; 5468 case Op_VectorMaskLastTrue: 5469 if (VM_Version::supports_lzcnt()) { 5470 lzcntq(tmp, tmp); 5471 movl(dst, 63); 5472 subl(dst, tmp); 5473 } else { 5474 movl(dst, -1); 5475 bsrq(tmp, tmp); 5476 cmov32(Assembler::notZero, dst, tmp); 5477 } 5478 break; 5479 case Op_VectorMaskFirstTrue: 5480 if (VM_Version::supports_bmi1()) { 5481 if (masklen < 32) { 5482 orl(tmp, 1 << masklen); 5483 tzcntl(dst, tmp); 5484 } else if (masklen == 32) { 5485 tzcntl(dst, tmp); 5486 } else { 5487 assert(masklen == 64, ""); 5488 tzcntq(dst, tmp); 5489 } 5490 } else { 5491 if (masklen < 32) { 5492 orl(tmp, 1 << masklen); 5493 bsfl(dst, tmp); 5494 } else { 5495 assert(masklen == 32 || masklen == 64, ""); 5496 movl(dst, masklen); 5497 if (masklen == 32) { 5498 bsfl(tmp, tmp); 5499 } else { 5500 bsfq(tmp, tmp); 5501 } 5502 cmov32(Assembler::notZero, dst, tmp); 5503 } 5504 } 5505 break; 5506 case Op_VectorMaskToLong: 5507 assert(dst == tmp, "Dst and tmp should be the same for toLong operations"); 5508 break; 5509 default: assert(false, "Unhandled mask operation"); 5510 } 5511 } 5512 5513 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp, 5514 int masklen, int masksize, int vec_enc) { 5515 assert(VM_Version::supports_popcnt(), ""); 5516 5517 if(VM_Version::supports_avx512bw()) { 5518 kmovql(tmp, mask); 5519 } else { 5520 assert(masklen <= 16, ""); 5521 kmovwl(tmp, mask); 5522 } 5523 5524 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5525 // operations needs to be clipped. 5526 if (masksize < 16 && opc != Op_VectorMaskFirstTrue) { 5527 andq(tmp, (1 << masklen) - 1); 5528 } 5529 5530 vector_mask_operation_helper(opc, dst, tmp, masklen); 5531 } 5532 5533 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp, 5534 Register tmp, int masklen, BasicType bt, int vec_enc) { 5535 assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) || 5536 (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), ""); 5537 assert(VM_Version::supports_popcnt(), ""); 5538 5539 bool need_clip = false; 5540 switch(bt) { 5541 case T_BOOLEAN: 5542 // While masks of other types contain 0, -1; boolean masks contain lane values of 0, 1 5543 vpxor(xtmp, xtmp, xtmp, vec_enc); 5544 vpsubb(xtmp, xtmp, mask, vec_enc); 5545 vpmovmskb(tmp, xtmp, vec_enc); 5546 need_clip = masklen < 16; 5547 break; 5548 case T_BYTE: 5549 vpmovmskb(tmp, mask, vec_enc); 5550 need_clip = masklen < 16; 5551 break; 5552 case T_SHORT: 5553 vpacksswb(xtmp, mask, mask, vec_enc); 5554 if (masklen >= 16) { 5555 vpermpd(xtmp, xtmp, 8, vec_enc); 5556 } 5557 vpmovmskb(tmp, xtmp, Assembler::AVX_128bit); 5558 need_clip = masklen < 16; 5559 break; 5560 case T_INT: 5561 case T_FLOAT: 5562 vmovmskps(tmp, mask, vec_enc); 5563 need_clip = masklen < 4; 5564 break; 5565 case T_LONG: 5566 case T_DOUBLE: 5567 vmovmskpd(tmp, mask, vec_enc); 5568 need_clip = masklen < 2; 5569 break; 5570 default: assert(false, "Unhandled type, %s", type2name(bt)); 5571 } 5572 5573 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5574 // operations needs to be clipped. 
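// For example (illustrative), an 8-lane byte mask held in a 128-bit register still yields 16 bits
// from vpmovmskb; only the low masklen bits are meaningful, so the remaining bits are cleared here.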
5575 if (need_clip && opc != Op_VectorMaskFirstTrue) { 5576 // need_clip implies masklen < 32 5577 andq(tmp, (1 << masklen) - 1); 5578 } 5579 5580 vector_mask_operation_helper(opc, dst, tmp, masklen); 5581 } 5582 5583 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1, 5584 Register rtmp2, int mask_len) { 5585 kmov(rtmp1, src); 5586 andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len))); 5587 mov64(rtmp2, -1L); 5588 pextq(rtmp2, rtmp2, rtmp1); 5589 kmov(dst, rtmp2); 5590 } 5591 5592 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src, 5593 XMMRegister mask, Register rtmp, Register rscratch, 5594 XMMRegister permv, XMMRegister xtmp, BasicType bt, 5595 int vec_enc) { 5596 assert(type2aelembytes(bt) >= 4, ""); 5597 assert(opcode == Op_CompressV || opcode == Op_ExpandV, ""); 5598 address compress_perm_table = nullptr; 5599 address expand_perm_table = nullptr; 5600 if (type2aelembytes(bt) == 8) { 5601 compress_perm_table = StubRoutines::x86::compress_perm_table64(); 5602 expand_perm_table = StubRoutines::x86::expand_perm_table64(); 5603 vmovmskpd(rtmp, mask, vec_enc); 5604 } else { 5605 compress_perm_table = StubRoutines::x86::compress_perm_table32(); 5606 expand_perm_table = StubRoutines::x86::expand_perm_table32(); 5607 vmovmskps(rtmp, mask, vec_enc); 5608 } 5609 shlq(rtmp, 5); // for 32 byte permute row. 5610 if (opcode == Op_CompressV) { 5611 lea(rscratch, ExternalAddress(compress_perm_table)); 5612 } else { 5613 lea(rscratch, ExternalAddress(expand_perm_table)); 5614 } 5615 addptr(rtmp, rscratch); 5616 vmovdqu(permv, Address(rtmp)); 5617 vpermps(dst, permv, src, Assembler::AVX_256bit); 5618 vpxor(xtmp, xtmp, xtmp, vec_enc); 5619 // Blend the result with zero vector using permute mask, each column entry 5620 // in a permute table row contains either a valid permute index or a -1 (default) 5621 // value, this can potentially be used as a blending mask after 5622 // compressing/expanding the source vector lanes. 
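// A -1 table entry has its most significant bit set, so the blend below takes the corresponding
// lane from xtmp (all zeros) rather than from the permuted result, zeroing lanes that have no
// valid permute index.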
5623 vblendvps(dst, dst, xtmp, permv, vec_enc, false, permv); 5624 } 5625 5626 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask, 5627 bool merge, BasicType bt, int vec_enc) { 5628 if (opcode == Op_CompressV) { 5629 switch(bt) { 5630 case T_BYTE: 5631 evpcompressb(dst, mask, src, merge, vec_enc); 5632 break; 5633 case T_CHAR: 5634 case T_SHORT: 5635 evpcompressw(dst, mask, src, merge, vec_enc); 5636 break; 5637 case T_INT: 5638 evpcompressd(dst, mask, src, merge, vec_enc); 5639 break; 5640 case T_FLOAT: 5641 evcompressps(dst, mask, src, merge, vec_enc); 5642 break; 5643 case T_LONG: 5644 evpcompressq(dst, mask, src, merge, vec_enc); 5645 break; 5646 case T_DOUBLE: 5647 evcompresspd(dst, mask, src, merge, vec_enc); 5648 break; 5649 default: 5650 fatal("Unsupported type %s", type2name(bt)); 5651 break; 5652 } 5653 } else { 5654 assert(opcode == Op_ExpandV, ""); 5655 switch(bt) { 5656 case T_BYTE: 5657 evpexpandb(dst, mask, src, merge, vec_enc); 5658 break; 5659 case T_CHAR: 5660 case T_SHORT: 5661 evpexpandw(dst, mask, src, merge, vec_enc); 5662 break; 5663 case T_INT: 5664 evpexpandd(dst, mask, src, merge, vec_enc); 5665 break; 5666 case T_FLOAT: 5667 evexpandps(dst, mask, src, merge, vec_enc); 5668 break; 5669 case T_LONG: 5670 evpexpandq(dst, mask, src, merge, vec_enc); 5671 break; 5672 case T_DOUBLE: 5673 evexpandpd(dst, mask, src, merge, vec_enc); 5674 break; 5675 default: 5676 fatal("Unsupported type %s", type2name(bt)); 5677 break; 5678 } 5679 } 5680 } 5681 5682 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5683 KRegister ktmp1, int vec_enc) { 5684 if (opcode == Op_SignumVD) { 5685 vsubpd(dst, zero, one, vec_enc); 5686 // if src < 0 ? -1 : 1 5687 evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5688 evblendmpd(dst, ktmp1, one, dst, true, vec_enc); 5689 // if src == NaN, -0.0 or 0.0 return src. 5690 evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5691 evblendmpd(dst, ktmp1, dst, src, true, vec_enc); 5692 } else { 5693 assert(opcode == Op_SignumVF, ""); 5694 vsubps(dst, zero, one, vec_enc); 5695 // if src < 0 ? -1 : 1 5696 evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5697 evblendmps(dst, ktmp1, one, dst, true, vec_enc); 5698 // if src == NaN, -0.0 or 0.0 return src. 5699 evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5700 evblendmps(dst, ktmp1, dst, src, true, vec_enc); 5701 } 5702 } 5703 5704 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5705 XMMRegister xtmp1, int vec_enc) { 5706 if (opcode == Op_SignumVD) { 5707 vsubpd(dst, zero, one, vec_enc); 5708 // if src < 0 ? -1 : 1 5709 vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1); 5710 // if src == NaN, -0.0 or 0.0 return src. 5711 vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc); 5712 vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1); 5713 } else { 5714 assert(opcode == Op_SignumVF, ""); 5715 vsubps(dst, zero, one, vec_enc); 5716 // if src < 0 ? -1 : 1 5717 vblendvps(dst, one, dst, src, vec_enc, true, xtmp1); 5718 // if src == NaN, -0.0 or 0.0 return src. 
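// EQ_UQ is the unordered-or-equal predicate: it is true for NaN lanes (unordered) and for
// +0.0/-0.0 lanes (both compare equal to zero), which are exactly the lanes that must keep src.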
5719 vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc); 5720 vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1); 5721 } 5722 } 5723 5724 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) { 5725 if (VM_Version::supports_avx512bw()) { 5726 if (mask_len > 32) { 5727 kmovql(dst, src); 5728 } else { 5729 kmovdl(dst, src); 5730 if (mask_len != 32) { 5731 kshiftrdl(dst, dst, 32 - mask_len); 5732 } 5733 } 5734 } else { 5735 assert(mask_len <= 16, ""); 5736 kmovwl(dst, src); 5737 if (mask_len != 16) { 5738 kshiftrwl(dst, dst, 16 - mask_len); 5739 } 5740 } 5741 } 5742 5743 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) { 5744 int lane_size = type2aelembytes(bt); 5745 if ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) || 5746 (is_subword_type(bt) && VM_Version::supports_avx512vlbw())) { 5747 movptr(rtmp, imm32); 5748 switch(lane_size) { 5749 case 1 : evpbroadcastb(dst, rtmp, vec_enc); break; 5750 case 2 : evpbroadcastw(dst, rtmp, vec_enc); break; 5751 case 4 : evpbroadcastd(dst, rtmp, vec_enc); break; 5752 case 8 : evpbroadcastq(dst, rtmp, vec_enc); break; 5753 default : fatal("Unsupported lane size %d", lane_size); 5754 break; 5755 } 5756 } else { 5757 movptr(rtmp, imm32); 5758 movq(dst, rtmp); 5759 switch(lane_size) { 5760 case 1 : vpbroadcastb(dst, dst, vec_enc); break; 5761 case 2 : vpbroadcastw(dst, dst, vec_enc); break; 5762 case 4 : vpbroadcastd(dst, dst, vec_enc); break; 5763 case 8 : vpbroadcastq(dst, dst, vec_enc); break; 5764 default : fatal("Unsupported lane size %d", lane_size); 5765 break; 5766 } 5767 } 5768 } 5769 5770 // 5771 // Following is lookup table based popcount computation algorithm:- 5772 // Index Bit set count 5773 // [ 0000 -> 0, 5774 // 0001 -> 1, 5775 // 0010 -> 1, 5776 // 0011 -> 2, 5777 // 0100 -> 1, 5778 // 0101 -> 2, 5779 // 0110 -> 2, 5780 // 0111 -> 3, 5781 // 1000 -> 1, 5782 // 1001 -> 2, 5783 // 1010 -> 2, 5784 // 1011 -> 3, 5785 // 1100 -> 2, 5786 // 1101 -> 3, // 1110 -> 3, 5787 // 1111 -> 4 ] 5788 // a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as 5789 // shuffle indices for lookup table access. 5790 // b. Right shift each byte of vector lane by 4 positions. 5791 // c. Count the number of 1s in 4 MSB bits of each byte. These bits are used as 5792 // shuffle indices for lookup table access. 5793 // d. Add the bitset count of upper and lower 4 bits of each byte. 5794 // e. Unpack double words to quad words and compute sum of absolute difference of bitset 5795 // count of all the bytes of a quadword. 5796 // f. Perform step e. for upper 128bit vector lane. 5797 // g. Pack the bitset count of quadwords back to double word. 5798 // h. Unpacking and packing operations are not needed for 64bit vector lane.
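// Worked example for a single byte, 0xB3 (binary 1011 0011): the lower nibble 0011 looks up 2,
// the upper nibble 1011 looks up 3, and step d. sums them to 5, the popcount of 0xB3.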
5799 5800 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5801 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5802 assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), ""); 5803 vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc); 5804 vpsrlw(dst, src, 4, vec_enc); 5805 vpand(dst, dst, xtmp1, vec_enc); 5806 vpand(xtmp1, src, xtmp1, vec_enc); 5807 vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg); 5808 vpshufb(xtmp1, xtmp2, xtmp1, vec_enc); 5809 vpshufb(dst, xtmp2, dst, vec_enc); 5810 vpaddb(dst, dst, xtmp1, vec_enc); 5811 } 5812 5813 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5814 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5815 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5816 // Following code is as per steps e,f,g and h of above algorithm. 5817 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5818 vpunpckhdq(dst, xtmp1, xtmp2, vec_enc); 5819 vpsadbw(dst, dst, xtmp2, vec_enc); 5820 vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc); 5821 vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc); 5822 vpackuswb(dst, xtmp1, dst, vec_enc); 5823 } 5824 5825 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5826 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5827 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5828 // Add the popcount of upper and lower bytes of word. 5829 vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc); 5830 vpsrlw(dst, xtmp1, 8, vec_enc); 5831 vpand(xtmp1, xtmp1, xtmp2, vec_enc); 5832 vpaddw(dst, dst, xtmp1, vec_enc); 5833 } 5834 5835 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5836 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5837 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5838 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5839 vpsadbw(dst, xtmp1, xtmp2, vec_enc); 5840 } 5841 5842 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5843 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5844 switch(bt) { 5845 case T_LONG: 5846 vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5847 break; 5848 case T_INT: 5849 vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5850 break; 5851 case T_CHAR: 5852 case T_SHORT: 5853 vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5854 break; 5855 case T_BYTE: 5856 case T_BOOLEAN: 5857 vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5858 break; 5859 default: 5860 fatal("Unsupported type %s", type2name(bt)); 5861 break; 5862 } 5863 } 5864 5865 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src, 5866 KRegister mask, bool merge, int vec_enc) { 5867 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 5868 switch(bt) { 5869 case T_LONG: 5870 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5871 evpopcntq(dst, mask, src, merge, vec_enc); 5872 break; 5873 case T_INT: 5874 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5875 evpopcntd(dst, mask, src, merge, vec_enc); 5876 break; 5877 case T_CHAR: 5878 case T_SHORT: 5879 assert(VM_Version::supports_avx512_bitalg(), ""); 5880 evpopcntw(dst, mask, src, merge, vec_enc); 5881 break; 5882 case T_BYTE: 5883 case T_BOOLEAN: 5884 assert(VM_Version::supports_avx512_bitalg(), ""); 5885 evpopcntb(dst, mask, 
src, merge, vec_enc); 5886 break; 5887 default: 5888 fatal("Unsupported type %s", type2name(bt)); 5889 break; 5890 } 5891 } 5892 5893 // Bit reversal algorithm first reverses the bits of each byte followed by 5894 // a byte level reversal for multi-byte primitive types (short/int/long). 5895 // Algorithm performs a lookup table access to get reverse bit sequence 5896 // corresponding to a 4 bit value. Thus a reverse bit sequence for a byte 5897 // is obtained by swapping the reverse bit sequences of upper and lower 5898 // nibble of a byte. 5899 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5900 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5901 if (VM_Version::supports_avx512vlbw()) { 5902 5903 // Get the reverse bit sequence of lower nibble of each byte. 5904 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg); 5905 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 5906 evpandq(dst, xtmp2, src, vec_enc); 5907 vpshufb(dst, xtmp1, dst, vec_enc); 5908 vpsllq(dst, dst, 4, vec_enc); 5909 5910 // Get the reverse bit sequence of upper nibble of each byte. 5911 vpandn(xtmp2, xtmp2, src, vec_enc); 5912 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 5913 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 5914 5915 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 5916 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 5917 evporq(xtmp2, dst, xtmp2, vec_enc); 5918 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 5919 5920 } else if(vec_enc == Assembler::AVX_512bit) { 5921 // Shift based bit reversal. 5922 assert(bt == T_LONG || bt == T_INT, ""); 5923 5924 // Swap lower and upper nibble of each byte. 5925 vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc); 5926 5927 // Swap two least and most significant bits of each nibble. 5928 vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc); 5929 5930 // Swap adjacent pair of bits. 5931 evmovdqul(xtmp1, k0, dst, true, vec_enc); 5932 vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc); 5933 5934 evmovdqul(xtmp1, k0, dst, true, vec_enc); 5935 vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc); 5936 } else { 5937 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp); 5938 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 5939 5940 // Get the reverse bit sequence of lower nibble of each byte. 5941 vpand(dst, xtmp2, src, vec_enc); 5942 vpshufb(dst, xtmp1, dst, vec_enc); 5943 vpsllq(dst, dst, 4, vec_enc); 5944 5945 // Get the reverse bit sequence of upper nibble of each byte. 5946 vpandn(xtmp2, xtmp2, src, vec_enc); 5947 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 5948 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 5949 5950 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 5951 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 5952 vpor(xtmp2, dst, xtmp2, vec_enc); 5953 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 5954 } 5955 } 5956 5957 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc, 5958 XMMRegister xtmp, Register rscratch) { 5959 assert(VM_Version::supports_gfni(), ""); 5960 assert(rscratch != noreg || always_reachable(mask), "missing"); 5961 5962 // Galois field instruction based bit reversal based on following algorithm. 
5963 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 5964 vpbroadcastq(xtmp, mask, vec_enc, rscratch); 5965 vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc); 5966 vector_reverse_byte(bt, dst, xtmp, vec_enc); 5967 } 5968 5969 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src, 5970 XMMRegister xtmp1, Register rtmp, int vec_enc) { 5971 vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc); 5972 evpandq(dst, xtmp1, src, vec_enc); 5973 vpsllq(dst, dst, nbits, vec_enc); 5974 vpandn(xtmp1, xtmp1, src, vec_enc); 5975 vpsrlq(xtmp1, xtmp1, nbits, vec_enc); 5976 evporq(dst, dst, xtmp1, vec_enc); 5977 } 5978 5979 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5980 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5981 // Shift based bit reversal. 5982 assert(VM_Version::supports_evex(), ""); 5983 switch(bt) { 5984 case T_LONG: 5985 // Swap upper and lower double word of each quad word. 5986 evprorq(xtmp1, k0, src, 32, true, vec_enc); 5987 evprord(xtmp1, k0, xtmp1, 16, true, vec_enc); 5988 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 5989 break; 5990 case T_INT: 5991 // Swap upper and lower word of each double word. 5992 evprord(xtmp1, k0, src, 16, true, vec_enc); 5993 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 5994 break; 5995 case T_CHAR: 5996 case T_SHORT: 5997 // Swap upper and lower byte of each word. 5998 vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc); 5999 break; 6000 case T_BYTE: 6001 evmovdquq(dst, k0, src, true, vec_enc); 6002 break; 6003 default: 6004 fatal("Unsupported type %s", type2name(bt)); 6005 break; 6006 } 6007 } 6008 6009 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) { 6010 if (bt == T_BYTE) { 6011 if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) { 6012 evmovdquq(dst, k0, src, true, vec_enc); 6013 } else { 6014 vmovdqu(dst, src); 6015 } 6016 return; 6017 } 6018 // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using 6019 // pre-computed shuffle indices. 
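// For example (illustrative), a T_INT lane holding 0x11223344 becomes 0x44332211 after the shuffle.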
6020 switch(bt) { 6021 case T_LONG: 6022 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg); 6023 break; 6024 case T_INT: 6025 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg); 6026 break; 6027 case T_CHAR: 6028 case T_SHORT: 6029 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg); 6030 break; 6031 default: 6032 fatal("Unsupported type %s", type2name(bt)); 6033 break; 6034 } 6035 vpshufb(dst, src, dst, vec_enc); 6036 } 6037 6038 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src, 6039 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 6040 KRegister ktmp, Register rtmp, bool merge, int vec_enc) { 6041 assert(is_integral_type(bt), ""); 6042 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 6043 assert(VM_Version::supports_avx512cd(), ""); 6044 switch(bt) { 6045 case T_LONG: 6046 evplzcntq(dst, ktmp, src, merge, vec_enc); 6047 break; 6048 case T_INT: 6049 evplzcntd(dst, ktmp, src, merge, vec_enc); 6050 break; 6051 case T_SHORT: 6052 vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc); 6053 vpunpcklwd(xtmp2, xtmp1, src, vec_enc); 6054 evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc); 6055 vpunpckhwd(dst, xtmp1, src, vec_enc); 6056 evplzcntd(dst, ktmp, dst, merge, vec_enc); 6057 vpackusdw(dst, xtmp2, dst, vec_enc); 6058 break; 6059 case T_BYTE: 6060 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 6061 // accessing the lookup table. 6062 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 6063 // accessing the lookup table. 6064 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 6065 assert(VM_Version::supports_avx512bw(), ""); 6066 evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp); 6067 vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc); 6068 vpand(xtmp2, dst, src, vec_enc); 6069 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 6070 vpsrlw(xtmp3, src, 4, vec_enc); 6071 vpand(xtmp3, dst, xtmp3, vec_enc); 6072 vpshufb(dst, xtmp1, xtmp3, vec_enc); 6073 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 6074 evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc); 6075 evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc); 6076 break; 6077 default: 6078 fatal("Unsupported type %s", type2name(bt)); 6079 break; 6080 } 6081 } 6082 6083 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6084 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6085 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp); 6086 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 6087 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 6088 // accessing the lookup table. 6089 vpand(dst, xtmp2, src, vec_enc); 6090 vpshufb(dst, xtmp1, dst, vec_enc); 6091 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 6092 // accessing the lookup table. 6093 vpsrlw(xtmp3, src, 4, vec_enc); 6094 vpand(xtmp3, xtmp2, xtmp3, vec_enc); 6095 vpshufb(xtmp2, xtmp1, xtmp3, vec_enc); 6096 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 
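// Illustrative example: for byte 0x0A the upper nibble is zero, so T2 = 4 and T1 = lzcnt(1010) = 0,
// giving 4 leading zeros in total; for byte 0xA0 the upper nibble is non-zero and T2 = 0 alone is
// the final count.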
6097 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 6098 vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc); 6099 vpaddb(dst, dst, xtmp2, vec_enc); 6100 vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc); 6101 } 6102 6103 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6104 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6105 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6106 // Add zero counts of lower byte and upper byte of a word if 6107 // upper byte holds a zero value. 6108 vpsrlw(xtmp3, src, 8, vec_enc); 6109 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 6110 vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc); 6111 vpsllw(xtmp2, dst, 8, vec_enc); 6112 vpaddw(xtmp2, xtmp2, dst, vec_enc); 6113 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6114 vpsrlw(dst, dst, 8, vec_enc); 6115 } 6116 6117 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6118 XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) { 6119 // Since IEEE 754 floating point format represents mantissa in 1.0 format 6120 // hence biased exponent can be used to compute leading zero count as per 6121 // following formula:- 6122 // LZCNT = 31 - (biased_exp - 127) 6123 // Special handling has been introduced for Zero, Max_Int and -ve source values. 6124 6125 // Broadcast 0xFF 6126 vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc); 6127 vpsrld(xtmp1, xtmp1, 24, vec_enc); 6128 6129 // Remove the bit to the right of the highest set bit ensuring that the conversion to float cannot round up to a higher 6130 // power of 2, which has a higher exponent than the input. This transformation is valid as only the highest set bit 6131 // contributes to the leading number of zeros. 6132 vpsrld(xtmp2, src, 1, vec_enc); 6133 vpandn(xtmp3, xtmp2, src, vec_enc); 6134 6135 // Extract biased exponent. 6136 vcvtdq2ps(dst, xtmp3, vec_enc); 6137 vpsrld(dst, dst, 23, vec_enc); 6138 vpand(dst, dst, xtmp1, vec_enc); 6139 6140 // Broadcast 127. 6141 vpsrld(xtmp1, xtmp1, 1, vec_enc); 6142 // Exponent = biased_exp - 127 6143 vpsubd(dst, dst, xtmp1, vec_enc); 6144 6145 // Exponent_plus_one = Exponent + 1 6146 vpsrld(xtmp3, xtmp1, 6, vec_enc); 6147 vpaddd(dst, dst, xtmp3, vec_enc); 6148 6149 // Replace -ve exponent with zero, exponent is -ve when src 6150 // lane contains a zero value. 6151 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 6152 vblendvps(dst, dst, xtmp2, dst, vec_enc); 6153 6154 // Rematerialize broadcast 32. 6155 vpslld(xtmp1, xtmp3, 5, vec_enc); 6156 // Exponent is 32 if corresponding source lane contains max_int value. 6157 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 6158 // LZCNT = 32 - exponent_plus_one 6159 vpsubd(dst, xtmp1, dst, vec_enc); 6160 6161 // Replace LZCNT with a value 1 if corresponding source lane 6162 // contains max_int value. 6163 vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc); 6164 6165 // Replace biased_exp with 0 if source lane value is less than zero. 6166 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 6167 vblendvps(dst, dst, xtmp2, src, vec_enc); 6168 } 6169 6170 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6171 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6172 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6173 // Add zero counts of lower word and upper word of a double word if 6174 // upper word holds a zero value. 
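// For example (illustrative), doubleword 0x000000F0 has a zero upper word (16 zeros) and the
// lower word contributes lzcnt(0x00F0) = 8, so the combined count is 24.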
6175 vpsrld(xtmp3, src, 16, vec_enc); 6176 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 6177 vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc); 6178 vpslld(xtmp2, dst, 16, vec_enc); 6179 vpaddd(xtmp2, xtmp2, dst, vec_enc); 6180 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6181 vpsrld(dst, dst, 16, vec_enc); 6182 // Add zero counts of lower doubleword and upper doubleword of a 6183 // quadword if upper doubleword holds a zero value. 6184 vpsrlq(xtmp3, src, 32, vec_enc); 6185 vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc); 6186 vpsllq(xtmp2, dst, 32, vec_enc); 6187 vpaddq(xtmp2, xtmp2, dst, vec_enc); 6188 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6189 vpsrlq(dst, dst, 32, vec_enc); 6190 } 6191 6192 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, 6193 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 6194 Register rtmp, int vec_enc) { 6195 assert(is_integral_type(bt), "unexpected type"); 6196 assert(vec_enc < Assembler::AVX_512bit, ""); 6197 switch(bt) { 6198 case T_LONG: 6199 vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6200 break; 6201 case T_INT: 6202 vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc); 6203 break; 6204 case T_SHORT: 6205 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6206 break; 6207 case T_BYTE: 6208 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6209 break; 6210 default: 6211 fatal("Unsupported type %s", type2name(bt)); 6212 break; 6213 } 6214 } 6215 6216 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) { 6217 switch(bt) { 6218 case T_BYTE: 6219 vpsubb(dst, src1, src2, vec_enc); 6220 break; 6221 case T_SHORT: 6222 vpsubw(dst, src1, src2, vec_enc); 6223 break; 6224 case T_INT: 6225 vpsubd(dst, src1, src2, vec_enc); 6226 break; 6227 case T_LONG: 6228 vpsubq(dst, src1, src2, vec_enc); 6229 break; 6230 default: 6231 fatal("Unsupported type %s", type2name(bt)); 6232 break; 6233 } 6234 } 6235 6236 // Trailing zero count computation is based on leading zero count operation as per 6237 // following equation. All AVX3 targets support AVX512CD feature which offers 6238 // direct vector instruction to compute leading zero count. 
6239 // CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
6240 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6241                                                          XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6242                                                          XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
6243   assert(is_integral_type(bt), "");
6244   // xtmp = -1
6245   vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
6246   // xtmp = xtmp + src
6247   vpadd(bt, xtmp4, xtmp4, src, vec_enc);
6248   // xtmp = xtmp & ~src
6249   vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
6250   vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
6251   vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
6252   vpsub(bt, dst, xtmp4, dst, vec_enc);
6253 }
6254
6255 // Trailing zero count computation for AVX2 targets is based on the popcount operation as per the following equation:
6256 // CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
6257 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6258                                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6259   assert(is_integral_type(bt), "");
6260   // xtmp = 0
6261   vpxor(xtmp3, xtmp3, xtmp3, vec_enc);
6262   // xtmp = 0 - src
6263   vpsub(bt, xtmp3, xtmp3, src, vec_enc);
6264   // xtmp = xtmp | src
6265   vpor(xtmp3, xtmp3, src, vec_enc);
6266   vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
6267   vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
6268   vpsub(bt, dst, xtmp1, dst, vec_enc);
6269 }
6270
6271 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
6272   Label done;
6273   Label neg_divisor_fastpath;
6274   cmpl(divisor, 0);
6275   jccb(Assembler::less, neg_divisor_fastpath);
6276   xorl(rdx, rdx);
6277   divl(divisor);
6278   jmpb(done);
6279   bind(neg_divisor_fastpath);
6280   // Fastpath for divisor < 0:
6281   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6282   // See Hacker's Delight (2nd ed), section 9.3, which is implemented in java.lang.Long.divideUnsigned()
6283   movl(rdx, rax);
6284   subl(rdx, divisor);
6285   if (VM_Version::supports_bmi1()) {
6286     andnl(rax, rdx, rax);
6287   } else {
6288     notl(rdx);
6289     andl(rax, rdx);
6290   }
6291   shrl(rax, 31);
6292   bind(done);
6293 }
6294
6295 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
6296   Label done;
6297   Label neg_divisor_fastpath;
6298   cmpl(divisor, 0);
6299   jccb(Assembler::less, neg_divisor_fastpath);
6300   xorl(rdx, rdx);
6301   divl(divisor);
6302   jmpb(done);
6303   bind(neg_divisor_fastpath);
6304   // Fastpath when divisor < 0:
6305   // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6306   // See Hacker's Delight (2nd ed), section 9.3, which is implemented in java.lang.Long.remainderUnsigned()
6307   movl(rdx, rax);
6308   subl(rax, divisor);
6309   if (VM_Version::supports_bmi1()) {
6310     andnl(rax, rax, rdx);
6311   } else {
6312     notl(rax);
6313     andl(rax, rdx);
6314   }
6315   sarl(rax, 31);
6316   andl(rax, divisor);
6317   subl(rdx, rax);
6318   bind(done);
6319 }
6320
6321 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
6322   Label done;
6323   Label neg_divisor_fastpath;
6324
6325   cmpl(divisor, 0);
6326   jccb(Assembler::less, neg_divisor_fastpath);
6327   xorl(rdx, rdx);
6328   divl(divisor);
6329   jmpb(done);
6330   bind(neg_divisor_fastpath);
6331   // Fastpath for divisor < 0:
6332   // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6333   //
remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor) 6334 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in 6335 // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned() 6336 movl(rdx, rax); 6337 subl(rax, divisor); 6338 if (VM_Version::supports_bmi1()) { 6339 andnl(rax, rax, rdx); 6340 } else { 6341 notl(rax); 6342 andl(rax, rdx); 6343 } 6344 movl(tmp, rax); 6345 shrl(rax, 31); // quotient 6346 sarl(tmp, 31); 6347 andl(tmp, divisor); 6348 subl(rdx, tmp); // remainder 6349 bind(done); 6350 } 6351 6352 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1, 6353 XMMRegister xtmp2, Register rtmp) { 6354 if(VM_Version::supports_gfni()) { 6355 // Galois field instruction based bit reversal based on following algorithm. 6356 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 6357 mov64(rtmp, 0x8040201008040201L); 6358 movq(xtmp1, src); 6359 movq(xtmp2, rtmp); 6360 gf2p8affineqb(xtmp1, xtmp2, 0); 6361 movq(dst, xtmp1); 6362 } else { 6363 // Swap even and odd numbered bits. 6364 movl(rtmp, src); 6365 andl(rtmp, 0x55555555); 6366 shll(rtmp, 1); 6367 movl(dst, src); 6368 andl(dst, 0xAAAAAAAA); 6369 shrl(dst, 1); 6370 orl(dst, rtmp); 6371 6372 // Swap LSB and MSB 2 bits of each nibble. 6373 movl(rtmp, dst); 6374 andl(rtmp, 0x33333333); 6375 shll(rtmp, 2); 6376 andl(dst, 0xCCCCCCCC); 6377 shrl(dst, 2); 6378 orl(dst, rtmp); 6379 6380 // Swap LSB and MSB 4 bits of each byte. 6381 movl(rtmp, dst); 6382 andl(rtmp, 0x0F0F0F0F); 6383 shll(rtmp, 4); 6384 andl(dst, 0xF0F0F0F0); 6385 shrl(dst, 4); 6386 orl(dst, rtmp); 6387 } 6388 bswapl(dst); 6389 } 6390 6391 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1, 6392 XMMRegister xtmp2, Register rtmp1, Register rtmp2) { 6393 if(VM_Version::supports_gfni()) { 6394 // Galois field instruction based bit reversal based on following algorithm. 6395 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 6396 mov64(rtmp1, 0x8040201008040201L); 6397 movq(xtmp1, src); 6398 movq(xtmp2, rtmp1); 6399 gf2p8affineqb(xtmp1, xtmp2, 0); 6400 movq(dst, xtmp1); 6401 } else { 6402 // Swap even and odd numbered bits. 6403 movq(rtmp1, src); 6404 mov64(rtmp2, 0x5555555555555555L); 6405 andq(rtmp1, rtmp2); 6406 shlq(rtmp1, 1); 6407 movq(dst, src); 6408 notq(rtmp2); 6409 andq(dst, rtmp2); 6410 shrq(dst, 1); 6411 orq(dst, rtmp1); 6412 6413 // Swap LSB and MSB 2 bits of each nibble. 6414 movq(rtmp1, dst); 6415 mov64(rtmp2, 0x3333333333333333L); 6416 andq(rtmp1, rtmp2); 6417 shlq(rtmp1, 2); 6418 notq(rtmp2); 6419 andq(dst, rtmp2); 6420 shrq(dst, 2); 6421 orq(dst, rtmp1); 6422 6423 // Swap LSB and MSB 4 bits of each byte. 
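    // Scalar equivalent of this step (illustrative only), for a 64-bit value x:
    //   x = ((x & 0x0F0F0F0F0F0F0F0FULL) << 4) | ((x & 0xF0F0F0F0F0F0F0F0ULL) >> 4);
    // Together with the two swaps above this reverses the bits within each byte;
    // the trailing bswapq then reverses the byte order to complete the reversal.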
6424 movq(rtmp1, dst); 6425 mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL); 6426 andq(rtmp1, rtmp2); 6427 shlq(rtmp1, 4); 6428 notq(rtmp2); 6429 andq(dst, rtmp2); 6430 shrq(dst, 4); 6431 orq(dst, rtmp1); 6432 } 6433 bswapq(dst); 6434 } 6435 6436 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) { 6437 Label done; 6438 Label neg_divisor_fastpath; 6439 cmpq(divisor, 0); 6440 jccb(Assembler::less, neg_divisor_fastpath); 6441 xorl(rdx, rdx); 6442 divq(divisor); 6443 jmpb(done); 6444 bind(neg_divisor_fastpath); 6445 // Fastpath for divisor < 0: 6446 // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1) 6447 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned() 6448 movq(rdx, rax); 6449 subq(rdx, divisor); 6450 if (VM_Version::supports_bmi1()) { 6451 andnq(rax, rdx, rax); 6452 } else { 6453 notq(rdx); 6454 andq(rax, rdx); 6455 } 6456 shrq(rax, 63); 6457 bind(done); 6458 } 6459 6460 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) { 6461 Label done; 6462 Label neg_divisor_fastpath; 6463 cmpq(divisor, 0); 6464 jccb(Assembler::less, neg_divisor_fastpath); 6465 xorq(rdx, rdx); 6466 divq(divisor); 6467 jmp(done); 6468 bind(neg_divisor_fastpath); 6469 // Fastpath when divisor < 0: 6470 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor) 6471 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned() 6472 movq(rdx, rax); 6473 subq(rax, divisor); 6474 if (VM_Version::supports_bmi1()) { 6475 andnq(rax, rax, rdx); 6476 } else { 6477 notq(rax); 6478 andq(rax, rdx); 6479 } 6480 sarq(rax, 63); 6481 andq(rax, divisor); 6482 subq(rdx, rax); 6483 bind(done); 6484 } 6485 6486 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) { 6487 Label done; 6488 Label neg_divisor_fastpath; 6489 cmpq(divisor, 0); 6490 jccb(Assembler::less, neg_divisor_fastpath); 6491 xorq(rdx, rdx); 6492 divq(divisor); 6493 jmp(done); 6494 bind(neg_divisor_fastpath); 6495 // Fastpath for divisor < 0: 6496 // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1) 6497 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor) 6498 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in 6499 // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned() 6500 movq(rdx, rax); 6501 subq(rax, divisor); 6502 if (VM_Version::supports_bmi1()) { 6503 andnq(rax, rax, rdx); 6504 } else { 6505 notq(rax); 6506 andq(rax, rdx); 6507 } 6508 movq(tmp, rax); 6509 shrq(rax, 63); // quotient 6510 sarq(tmp, 63); 6511 andq(tmp, divisor); 6512 subq(rdx, tmp); // remainder 6513 bind(done); 6514 } 6515 6516 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1, 6517 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp, 6518 int vlen_enc) { 6519 assert(VM_Version::supports_avx512bw(), ""); 6520 // Byte shuffles are inlane operations and indices are determined using 6521 // lower 4 bit of each shuffle lane, thus all shuffle indices are 6522 // normalized to index range 0-15. This makes sure that all the multiples 6523 // of an index value are placed at same relative position in 128 bit 6524 // lane i.e. elements corresponding to shuffle indices 16, 32 and 64 6525 // will be 16th element in their respective 128 bit lanes. 
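  // Scalar model of the overall selection performed below (illustrative only,
  // assuming shuffle indices in the range [0, 63]):
  //   for (int i = 0; i < 64; i++) {
  //     int idx  = shuffle[i];
  //     int lane = idx >> 4;                      // which 128-bit lane of src
  //     dst[i]   = src[lane * 16 + (idx & 0xF)];  // in-lane byte shuffle
  //   }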
6526 movl(rtmp, 16); 6527 evpbroadcastb(xtmp1, rtmp, vlen_enc); 6528 6529 // Compute a mask for shuffle vector by comparing indices with expression INDEX < 16, 6530 // Broadcast first 128 bit lane across entire vector, shuffle the vector lanes using 6531 // original shuffle indices and move the shuffled lanes corresponding to true 6532 // mask to destination vector. 6533 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc); 6534 evshufi64x2(xtmp2, src, src, 0x0, vlen_enc); 6535 evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc); 6536 6537 // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32 6538 // and broadcasting second 128 bit lane. 6539 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc); 6540 vpsllq(xtmp2, xtmp1, 0x1, vlen_enc); 6541 evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc); 6542 evshufi64x2(xtmp3, src, src, 0x55, vlen_enc); 6543 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc); 6544 6545 // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48 6546 // and broadcasting third 128 bit lane. 6547 evpcmpb(ktmp, k0, shuffle, xtmp2, Assembler::nlt, true, vlen_enc); 6548 vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc); 6549 evpcmpb(ktmp, ktmp, shuffle, xtmp1, Assembler::lt, true, vlen_enc); 6550 evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc); 6551 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc); 6552 6553 // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64 6554 // and broadcasting third 128 bit lane. 6555 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc); 6556 vpsllq(xtmp2, xtmp2, 0x1, vlen_enc); 6557 evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc); 6558 evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc); 6559 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc); 6560 } 6561 6562 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst, 6563 XMMRegister shuffle, XMMRegister src, int vlen_enc) { 6564 if (vlen_enc == AVX_128bit) { 6565 vpermilps(dst, src, shuffle, vlen_enc); 6566 } else if (bt == T_INT) { 6567 vpermd(dst, shuffle, src, vlen_enc); 6568 } else { 6569 assert(bt == T_FLOAT, ""); 6570 vpermps(dst, shuffle, src, vlen_enc); 6571 } 6572 } 6573 6574 void C2_MacroAssembler::efp16sh(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 6575 switch(opcode) { 6576 case Op_AddHF: vaddsh(dst, src1, src2); break; 6577 case Op_SubHF: vsubsh(dst, src1, src2); break; 6578 case Op_MulHF: vmulsh(dst, src1, src2); break; 6579 case Op_DivHF: vdivsh(dst, src1, src2); break; 6580 default: assert(false, "%s", NodeClassNames[opcode]); break; 6581 } 6582 } 6583 6584 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { 6585 switch(elem_bt) { 6586 case T_BYTE: 6587 if (ideal_opc == Op_SaturatingAddV) { 6588 vpaddsb(dst, src1, src2, vlen_enc); 6589 } else { 6590 assert(ideal_opc == Op_SaturatingSubV, ""); 6591 vpsubsb(dst, src1, src2, vlen_enc); 6592 } 6593 break; 6594 case T_SHORT: 6595 if (ideal_opc == Op_SaturatingAddV) { 6596 vpaddsw(dst, src1, src2, vlen_enc); 6597 } else { 6598 assert(ideal_opc == Op_SaturatingSubV, ""); 6599 vpsubsw(dst, src1, src2, vlen_enc); 6600 } 6601 break; 6602 default: 6603 fatal("Unsupported type %s", type2name(elem_bt)); 6604 break; 6605 } 6606 } 6607 6608 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, 
                                                            XMMRegister src2, int vlen_enc) {
6609   switch(elem_bt) {
6610     case T_BYTE:
6611       if (ideal_opc == Op_SaturatingAddV) {
6612         vpaddusb(dst, src1, src2, vlen_enc);
6613       } else {
6614         assert(ideal_opc == Op_SaturatingSubV, "");
6615         vpsubusb(dst, src1, src2, vlen_enc);
6616       }
6617       break;
6618     case T_SHORT:
6619       if (ideal_opc == Op_SaturatingAddV) {
6620         vpaddusw(dst, src1, src2, vlen_enc);
6621       } else {
6622         assert(ideal_opc == Op_SaturatingSubV, "");
6623         vpsubusw(dst, src1, src2, vlen_enc);
6624       }
6625       break;
6626     default:
6627       fatal("Unsupported type %s", type2name(elem_bt));
6628       break;
6629   }
6630 }
6631
6632 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6633                                                                XMMRegister src2, KRegister ktmp, int vlen_enc) {
6634   // For unsigned subtraction, overflow happens when the second input is greater than the first input.
6635   // overflow_mask = Inp1 <u Inp2
6636   evpcmpu(elem_bt, ktmp, src2, src1, Assembler::lt, vlen_enc);
6637   // Res = overflow_mask ? Zero : INP1 - INP2 (non-commutative and non-associative)
6638   evmasked_op(elem_bt == T_INT ? Op_SubVI : Op_SubVL, elem_bt, ktmp, dst, src1, src2, false, vlen_enc, false);
6639 }
6640
6641 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6642                                                               XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
6643   // Emulate unsigned comparison using signed comparison
6644   // Mask = Inp1 <u Inp2 => Inp1 + MIN_VALUE < Inp2 + MIN_VALUE
6645   vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc, true);
6646   vpadd(elem_bt, xtmp2, src1, xtmp1, vlen_enc);
6647   vpadd(elem_bt, xtmp1, src2, xtmp1, vlen_enc);
6648
6649   vpcmpgt(elem_bt, xtmp2, xtmp1, xtmp2, vlen_enc);
6650
6651   // Res = INP1 - INP2 (non-commutative and non-associative)
6652   vpsub(elem_bt, dst, src1, src2, vlen_enc);
6653   // Res = Mask ? Zero : Res
6654   vpxor(xtmp1, xtmp1, xtmp1, vlen_enc);
6655   vpblendvb(dst, dst, xtmp1, xtmp2, vlen_enc);
6656 }
6657
6658 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6659                                                                XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp, int vlen_enc) {
6660   // Unsigned value ranges comprise only +ve numbers, thus only an upper bound saturation exists.
6661   // overflow_mask = (SRC1 + SRC2) <u (SRC1 | SRC2)
6662   // Res = Signed Add INP1, INP2
6663   vpadd(elem_bt, dst, src1, src2, vlen_enc);
6664   // T1 = SRC1 | SRC2
6665   vpor(xtmp1, src1, src2, vlen_enc);
6666   // Max_Unsigned = -1
6667   vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6668   // Unsigned compare: Mask = Res <u T1
6669   evpcmpu(elem_bt, ktmp, dst, xtmp1, Assembler::lt, vlen_enc);
6670   // res = Mask ? Max_Unsigned : Res
6671   evpblend(elem_bt, dst, ktmp, dst, xtmp2, true, vlen_enc);
6672 }
6673
6674 //
6675 // Section 2-13 of Hacker's Delight lists the following overflow detection check for the saturating
6676 // unsigned addition operation.
6677 // overflow_mask = ((a & b) | ((a | b) & ~(a + b))) >>> 31 == 1
6678 //
6679 // We empirically determined its semantic equivalence to the following reduced expression
6680 // overflow_mask = (a + b) <u (a | b)
6681 //
6682 // and also verified it through the Alive2 solver.
6683 // (https://alive2.llvm.org/ce/z/XDQ7dY) 6684 // 6685 6686 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, 6687 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, int vlen_enc) { 6688 // Res = Signed Add INP1, INP2 6689 vpadd(elem_bt, dst, src1, src2, vlen_enc); 6690 // Compute T1 = INP1 | INP2 6691 vpor(xtmp3, src1, src2, vlen_enc); 6692 // T1 = Minimum signed value. 6693 vpgenmin_value(elem_bt, xtmp2, xtmp1, vlen_enc, true); 6694 // Convert T1 to signed value, T1 = T1 + MIN_VALUE 6695 vpadd(elem_bt, xtmp3, xtmp3, xtmp2, vlen_enc); 6696 // Convert Res to signed value, Res<s> = Res + MIN_VALUE 6697 vpadd(elem_bt, xtmp2, xtmp2, dst, vlen_enc); 6698 // Compute overflow detection mask = Res<1> <s T1 6699 if (elem_bt == T_INT) { 6700 vpcmpgtd(xtmp3, xtmp3, xtmp2, vlen_enc); 6701 } else { 6702 assert(elem_bt == T_LONG, ""); 6703 vpcmpgtq(xtmp3, xtmp3, xtmp2, vlen_enc); 6704 } 6705 vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc); 6706 } 6707 6708 void C2_MacroAssembler::evpmovq2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 6709 int vlen_enc, bool xtmp2_hold_M1) { 6710 if (VM_Version::supports_avx512dq()) { 6711 evpmovq2m(ktmp, src, vlen_enc); 6712 } else { 6713 assert(VM_Version::supports_evex(), ""); 6714 if (!xtmp2_hold_M1) { 6715 vpternlogq(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc); 6716 } 6717 evpsraq(xtmp1, src, 63, vlen_enc); 6718 evpcmpeqq(ktmp, k0, xtmp1, xtmp2, vlen_enc); 6719 } 6720 } 6721 6722 void C2_MacroAssembler::evpmovd2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 6723 int vlen_enc, bool xtmp2_hold_M1) { 6724 if (VM_Version::supports_avx512dq()) { 6725 evpmovd2m(ktmp, src, vlen_enc); 6726 } else { 6727 assert(VM_Version::supports_evex(), ""); 6728 if (!xtmp2_hold_M1) { 6729 vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc); 6730 } 6731 vpsrad(xtmp1, src, 31, vlen_enc); 6732 Assembler::evpcmpeqd(ktmp, k0, xtmp1, xtmp2, vlen_enc); 6733 } 6734 } 6735 6736 6737 void C2_MacroAssembler::vpsign_extend_dq(BasicType elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) { 6738 if (elem_bt == T_LONG) { 6739 if (VM_Version::supports_evex()) { 6740 evpsraq(dst, src, 63, vlen_enc); 6741 } else { 6742 vpsrad(dst, src, 31, vlen_enc); 6743 vpshufd(dst, dst, 0xF5, vlen_enc); 6744 } 6745 } else { 6746 assert(elem_bt == T_INT, ""); 6747 vpsrad(dst, src, 31, vlen_enc); 6748 } 6749 } 6750 6751 void C2_MacroAssembler::vpgenmax_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) { 6752 if (compute_allones) { 6753 if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) { 6754 vpternlogd(allones, 0xff, allones, allones, vlen_enc); 6755 } else { 6756 vpcmpeqq(allones, allones, allones, vlen_enc); 6757 } 6758 } 6759 if (elem_bt == T_LONG) { 6760 vpsrlq(dst, allones, 1, vlen_enc); 6761 } else { 6762 assert(elem_bt == T_INT, ""); 6763 vpsrld(dst, allones, 1, vlen_enc); 6764 } 6765 } 6766 6767 void C2_MacroAssembler::vpgenmin_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) { 6768 if (compute_allones) { 6769 if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) { 6770 vpternlogd(allones, 0xff, allones, allones, vlen_enc); 6771 } else { 6772 vpcmpeqq(allones, allones, allones, vlen_enc); 6773 } 6774 } 6775 if (elem_bt == T_LONG) { 6776 vpsllq(dst, allones, 63, vlen_enc); 6777 } else { 6778 assert(elem_bt == T_INT, ""); 6779 
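    // Shifting the all-ones pattern left by 31 leaves only the sign bit set,
    // i.e. 0x80000000 (Integer.MIN_VALUE) in every int lane; the T_LONG branch
    // above does the same with a shift by 63 to produce Long.MIN_VALUE.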
    vpslld(dst, allones, 31, vlen_enc);
6780   }
6781 }
6782
6783 void C2_MacroAssembler::evpcmpu(BasicType elem_bt, KRegister kmask, XMMRegister src1, XMMRegister src2,
6784                                 Assembler::ComparisonPredicate cond, int vlen_enc) {
6785   switch(elem_bt) {
6786     case T_LONG:  evpcmpuq(kmask, src1, src2, cond, vlen_enc); break;
6787     case T_INT:   evpcmpud(kmask, src1, src2, cond, vlen_enc); break;
6788     case T_SHORT: evpcmpuw(kmask, src1, src2, cond, vlen_enc); break;
6789     case T_BYTE:  evpcmpub(kmask, src1, src2, cond, vlen_enc); break;
6790     default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6791   }
6792 }
6793
6794 void C2_MacroAssembler::vpcmpgt(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6795   switch(elem_bt) {
6796     case T_LONG:  vpcmpgtq(dst, src1, src2, vlen_enc); break;
6797     case T_INT:   vpcmpgtd(dst, src1, src2, vlen_enc); break;
6798     case T_SHORT: vpcmpgtw(dst, src1, src2, vlen_enc); break;
6799     case T_BYTE:  vpcmpgtb(dst, src1, src2, vlen_enc); break;
6800     default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6801   }
6802 }
6803
6804 void C2_MacroAssembler::evpmov_vec_to_mask(BasicType elem_bt, KRegister ktmp, XMMRegister src, XMMRegister xtmp1,
6805                                            XMMRegister xtmp2, int vlen_enc, bool xtmp2_hold_M1) {
6806   if (elem_bt == T_LONG) {
6807     evpmovq2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6808   } else {
6809     assert(elem_bt == T_INT, "");
6810     evpmovd2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6811   }
6812 }
6813
6814 void C2_MacroAssembler::vector_addsub_dq_saturating_evex(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6815                                                          XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
6816                                                          KRegister ktmp1, KRegister ktmp2, int vlen_enc) {
6817   assert(elem_bt == T_INT || elem_bt == T_LONG, "");
6818   // Addition/Subtraction happens over the two's complement representation of numbers and is agnostic to signedness.
6819   // Overflow detection is based on Hacker's Delight section 2-13.
6820   if (ideal_opc == Op_SaturatingAddV) {
6821     // res = src1 + src2
6822     vpadd(elem_bt, dst, src1, src2, vlen_enc);
6823     // Overflow occurs if the result polarity does not match the polarity of the (like-signed) inputs.
6824     // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
6825     vpxor(xtmp1, dst, src1, vlen_enc);
6826     vpxor(xtmp2, dst, src2, vlen_enc);
6827     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6828   } else {
6829     assert(ideal_opc == Op_SaturatingSubV, "");
6830     // res = src1 - src2
6831     vpsub(elem_bt, dst, src1, src2, vlen_enc);
6832     // Overflow occurs when both inputs have opposite polarity and
6833     // the result polarity does not match the first input polarity.
6834     // overflow = (((src1 ^ src2) & (res ^ src1)) >>> 31(I)/63(L)) == 1
6835     vpxor(xtmp1, src1, src2, vlen_enc);
6836     vpxor(xtmp2, dst, src1, vlen_enc);
6837     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6838   }
6839
6840   // Compute overflow detection mask.
6841   evpmov_vec_to_mask(elem_bt, ktmp1, xtmp2, xtmp2, xtmp1, vlen_enc);
6842   // Note: xtmp1 holds -1 in all its lanes after the above call.
6843
6844   // Compute mask based on first input polarity.
6845   evpmov_vec_to_mask(elem_bt, ktmp2, src1, xtmp2, xtmp1, vlen_enc, true);
6846
6847   vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
6848   vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);
6849
6850   // Compose a vector of saturating (MAX/MIN) values, where lanes corresponding to
6851   // set bits in the first input polarity mask hold a min value.
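  // Scalar sketch of the two blends below (illustrative only):
  //   sat = (src1 < 0) ? MIN_VALUE : MAX_VALUE;   // per-lane saturation value
  //   dst = overflow   ? sat       : dst;         // keep the wrapped result otherwise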
6852   evpblend(elem_bt, xtmp2, ktmp2, xtmp2, xtmp1, true, vlen_enc);
6853   // Blend destination lanes with saturated values using overflow detection mask.
6854   evpblend(elem_bt, dst, ktmp1, dst, xtmp2, true, vlen_enc);
6855 }
6856
6857
6858 void C2_MacroAssembler::vector_addsub_dq_saturating_avx(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6859                                                         XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
6860                                                         XMMRegister xtmp3, XMMRegister xtmp4, int vlen_enc) {
6861   assert(elem_bt == T_INT || elem_bt == T_LONG, "");
6862   // Addition/Subtraction happens over the two's complement representation of numbers and is agnostic to signedness.
6863   // Overflow detection is based on Hacker's Delight section 2-13.
6864   if (ideal_opc == Op_SaturatingAddV) {
6865     // res = src1 + src2
6866     vpadd(elem_bt, dst, src1, src2, vlen_enc);
6867     // Overflow occurs if the result polarity does not match the polarity of the (like-signed) inputs.
6868     // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
6869     vpxor(xtmp1, dst, src1, vlen_enc);
6870     vpxor(xtmp2, dst, src2, vlen_enc);
6871     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6872   } else {
6873     assert(ideal_opc == Op_SaturatingSubV, "");
6874     // res = src1 - src2
6875     vpsub(elem_bt, dst, src1, src2, vlen_enc);
6876     // Overflow occurs when both inputs have opposite polarity and
6877     // the result polarity does not match the first input polarity.
6878     // overflow = (((src1 ^ src2) & (res ^ src1)) >>> 31(I)/63(L)) == 1
6879     vpxor(xtmp1, src1, src2, vlen_enc);
6880     vpxor(xtmp2, dst, src1, vlen_enc);
6881     vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6882   }
6883
6884   // Sign-extend to compute overflow detection mask.
6885   vpsign_extend_dq(elem_bt, xtmp3, xtmp2, vlen_enc);
6886
6887   vpcmpeqd(xtmp1, xtmp1, xtmp1, vlen_enc);
6888   vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc);
6889   vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);
6890
6891   // Compose saturating min/max vector using first input polarity mask.
6892   vpsign_extend_dq(elem_bt, xtmp4, src1, vlen_enc);
6893   vpblendvb(xtmp1, xtmp2, xtmp1, xtmp4, vlen_enc);
6894
6895   // Blend result with saturating vector using overflow detection mask.
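  // Worked example (illustrative, T_INT lane): src1 = 0x7FFFFFFF, src2 = 1 gives
  // dst = 0x80000000; (dst ^ src1) & (dst ^ src2) = 0x80000001 has its sign bit
  // set, so the lane overflowed and, since src1 >= 0, is replaced by MAX_VALUE.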
6896 vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc); 6897 } 6898 6899 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) { 6900 switch(elem_bt) { 6901 case T_BYTE: 6902 if (ideal_opc == Op_SaturatingAddV) { 6903 vpaddsb(dst, src1, src2, vlen_enc); 6904 } else { 6905 assert(ideal_opc == Op_SaturatingSubV, ""); 6906 vpsubsb(dst, src1, src2, vlen_enc); 6907 } 6908 break; 6909 case T_SHORT: 6910 if (ideal_opc == Op_SaturatingAddV) { 6911 vpaddsw(dst, src1, src2, vlen_enc); 6912 } else { 6913 assert(ideal_opc == Op_SaturatingSubV, ""); 6914 vpsubsw(dst, src1, src2, vlen_enc); 6915 } 6916 break; 6917 default: 6918 fatal("Unsupported type %s", type2name(elem_bt)); 6919 break; 6920 } 6921 } 6922 6923 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) { 6924 switch(elem_bt) { 6925 case T_BYTE: 6926 if (ideal_opc == Op_SaturatingAddV) { 6927 vpaddusb(dst, src1, src2, vlen_enc); 6928 } else { 6929 assert(ideal_opc == Op_SaturatingSubV, ""); 6930 vpsubusb(dst, src1, src2, vlen_enc); 6931 } 6932 break; 6933 case T_SHORT: 6934 if (ideal_opc == Op_SaturatingAddV) { 6935 vpaddusw(dst, src1, src2, vlen_enc); 6936 } else { 6937 assert(ideal_opc == Op_SaturatingSubV, ""); 6938 vpsubusw(dst, src1, src2, vlen_enc); 6939 } 6940 break; 6941 default: 6942 fatal("Unsupported type %s", type2name(elem_bt)); 6943 break; 6944 } 6945 } 6946 6947 void C2_MacroAssembler::select_from_two_vectors_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, 6948 XMMRegister src2, int vlen_enc) { 6949 switch(elem_bt) { 6950 case T_BYTE: 6951 evpermi2b(dst, src1, src2, vlen_enc); 6952 break; 6953 case T_SHORT: 6954 evpermi2w(dst, src1, src2, vlen_enc); 6955 break; 6956 case T_INT: 6957 evpermi2d(dst, src1, src2, vlen_enc); 6958 break; 6959 case T_LONG: 6960 evpermi2q(dst, src1, src2, vlen_enc); 6961 break; 6962 case T_FLOAT: 6963 evpermi2ps(dst, src1, src2, vlen_enc); 6964 break; 6965 case T_DOUBLE: 6966 evpermi2pd(dst, src1, src2, vlen_enc); 6967 break; 6968 default: 6969 fatal("Unsupported type %s", type2name(elem_bt)); 6970 break; 6971 } 6972 } 6973 6974 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, bool is_unsigned, int vlen_enc) { 6975 if (is_unsigned) { 6976 vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc); 6977 } else { 6978 vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc); 6979 } 6980 } 6981 6982 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, bool is_unsigned, int vlen_enc) { 6983 if (is_unsigned) { 6984 vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc); 6985 } else { 6986 vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc); 6987 } 6988 } 6989 6990 void C2_MacroAssembler::scalar_max_min_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, 6991 KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) { 6992 if (opcode == Op_MaxHF) { 6993 // Move sign bits of src2 to mask register. 6994 evpmovw2m(ktmp, src2, vlen_enc); 6995 // xtmp1 = src2 < 0 ? src2 : src1 6996 evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc); 6997 // xtmp2 = src2 < 0 ? ? 
src1 : src2
6998     evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
6999     // The idea behind the above swapping is to make the second source operand a +ve value.
7000     // As per the instruction semantics, if the values being compared are both 0.0s (of either sign), the value in
7001     // the second source operand is returned. If only one value is a NaN (SNaN or QNaN) for this instruction,
7002     // the second source operand, either a NaN or a valid floating-point value, is returned.
7003     // dst = max(xtmp1, xtmp2)
7004     vmaxsh(dst, xtmp1, xtmp2);
7005     // isNaN = is_unordered_quiet(xtmp1)
7006     evcmpsh(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q);
7007     // The final result is the same as the first source if it is a NaN value;
7008     // if the second operand holds a NaN value then, as per the above semantics,
7009     // the result is the same as the second operand.
7010     Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
7011   } else {
7012     assert(opcode == Op_MinHF, "");
7013     // Move sign bits of src1 to mask register.
7014     evpmovw2m(ktmp, src1, vlen_enc);
7015     // xtmp1 = src1 < 0 ? src2 : src1
7016     evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
7017     // xtmp2 = src1 < 0 ? src1 : src2
7018     evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
7019     // The idea behind the above swapping is to make the second source operand a -ve value.
7020     // As per the instruction semantics, if the values being compared are both 0.0s (of either sign), the value in
7021     // the second source operand is returned.
7022     // If only one value is a NaN (SNaN or QNaN) for this instruction, the second source operand, either a NaN
7023     // or a valid floating-point value, is written to the result.
7024     // dst = min(xtmp1, xtmp2)
7025     vminsh(dst, xtmp1, xtmp2);
7026     // isNaN = is_unordered_quiet(xtmp1)
7027     evcmpsh(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q);
7028     // The final result is the same as the first source if it is a NaN value;
7029     // if the second operand holds a NaN value then, as per the above semantics,
7030     // the result is the same as the second operand.
7031     Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
7032   }
7033 }
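// Illustrative scalar model of the Max semantics implemented above (a sketch only,
// assuming Java-style Float16 max rules; float stands in for the half-precision type,
// and Min is symmetric, preferring -0.0 and the smaller value):
//
//   static float max_hf(float a, float b) {
//     if (a != a) return a;                    // NaN in either input yields NaN
//     if (b != b) return b;
//     if (a == 0.0f && b == 0.0f) {
//       return std::signbit(a) ? b : a;        // +0.0 is treated as larger than -0.0
//     }
//     return a > b ? a : b;
//   }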