1 /* 2 * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 * 23 */ 24 25 #include "precompiled.hpp" 26 #include "asm/assembler.hpp" 27 #include "asm/assembler.inline.hpp" 28 #include "gc/shared/barrierSet.hpp" 29 #include "gc/shared/barrierSetAssembler.hpp" 30 #include "oops/methodData.hpp" 31 #include "opto/c2_MacroAssembler.hpp" 32 #include "opto/intrinsicnode.hpp" 33 #include "opto/output.hpp" 34 #include "opto/opcodes.hpp" 35 #include "opto/subnode.hpp" 36 #include "runtime/globals.hpp" 37 #include "runtime/objectMonitor.hpp" 38 #include "runtime/stubRoutines.hpp" 39 #include "utilities/checkedCast.hpp" 40 #include "utilities/globalDefinitions.hpp" 41 #include "utilities/powerOfTwo.hpp" 42 #include "utilities/sizes.hpp" 43 44 #ifdef PRODUCT 45 #define BLOCK_COMMENT(str) /* nothing */ 46 #define STOP(error) stop(error) 47 #else 48 #define BLOCK_COMMENT(str) block_comment(str) 49 #define STOP(error) block_comment(error); stop(error) 50 #endif 51 52 // C2 compiled method's prolog code. 53 void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) { 54 55 // WARNING: Initial instruction MUST be 5 bytes or longer so that 56 // NativeJump::patch_verified_entry will be able to patch out the entry 57 // code safely. The push to verify stack depth is ok at 5 bytes, 58 // the frame allocation can be either 3 or 6 bytes. So if we don't do 59 // stack bang then we must use the 6 byte frame allocation even if 60 // we have no frame. :-( 61 assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect"); 62 63 assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned"); 64 // Remove word for return addr 65 framesize -= wordSize; 66 stack_bang_size -= wordSize; 67 68 // Calls to C2R adapters often do not accept exceptional returns. 69 // We require that their callers must bang for them. But be careful, because 70 // some VM calls (such as call site linkage) can use several kilobytes of 71 // stack. But the stack safety zone should account for that. 72 // See bugs 4446381, 4468289, 4497237. 73 if (stack_bang_size > 0) { 74 generate_stack_overflow_check(stack_bang_size); 75 76 // We always push rbp, so that on return to interpreter rbp, will be 77 // restored correctly and we can correct the stack. 78 push(rbp); 79 // Save caller's stack pointer into RBP if the frame pointer is preserved. 
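  // (Illustrative sketch, not part of the original code.) In both branches of
  // this prolog the resulting frame looks roughly like:
  //
  //   [ return address ]
  //   [ saved rbp      ]  <- rbp, but only when PreserveFramePointer is enabled
  //   [ C2 spill slots, outgoing args, ... ]
  //   [                ]  <- rsp == rbp - framesize (after the wordSize adjustments)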
80 if (PreserveFramePointer) { 81 mov(rbp, rsp); 82 } 83 // Remove word for ebp 84 framesize -= wordSize; 85 86 // Create frame 87 if (framesize) { 88 subptr(rsp, framesize); 89 } 90 } else { 91 // Create frame (force generation of a 4 byte immediate value) 92 subptr_imm32(rsp, framesize); 93 94 // Save RBP register now. 95 framesize -= wordSize; 96 movptr(Address(rsp, framesize), rbp); 97 // Save caller's stack pointer into RBP if the frame pointer is preserved. 98 if (PreserveFramePointer) { 99 movptr(rbp, rsp); 100 if (framesize > 0) { 101 addptr(rbp, framesize); 102 } 103 } 104 } 105 106 if (VerifyStackAtCalls) { // Majik cookie to verify stack depth 107 framesize -= wordSize; 108 movptr(Address(rsp, framesize), (int32_t)0xbadb100d); 109 } 110 111 #ifndef _LP64 112 // If method sets FPU control word do it now 113 if (fp_mode_24b) { 114 fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24())); 115 } 116 if (UseSSE >= 2 && VerifyFPU) { 117 verify_FPU(0, "FPU stack must be clean on entry"); 118 } 119 #endif 120 121 #ifdef ASSERT 122 if (VerifyStackAtCalls) { 123 Label L; 124 push(rax); 125 mov(rax, rsp); 126 andptr(rax, StackAlignmentInBytes-1); 127 cmpptr(rax, StackAlignmentInBytes-wordSize); 128 pop(rax); 129 jcc(Assembler::equal, L); 130 STOP("Stack is not properly aligned!"); 131 bind(L); 132 } 133 #endif 134 135 if (!is_stub) { 136 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 137 #ifdef _LP64 138 if (BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) { 139 // We put the non-hot code of the nmethod entry barrier out-of-line in a stub. 140 Label dummy_slow_path; 141 Label dummy_continuation; 142 Label* slow_path = &dummy_slow_path; 143 Label* continuation = &dummy_continuation; 144 if (!Compile::current()->output()->in_scratch_emit_size()) { 145 // Use real labels from actual stub when not emitting code for the purpose of measuring its size 146 C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub(); 147 Compile::current()->output()->add_stub(stub); 148 slow_path = &stub->entry(); 149 continuation = &stub->continuation(); 150 } 151 bs->nmethod_entry_barrier(this, slow_path, continuation); 152 } 153 #else 154 // Don't bother with out-of-line nmethod entry barrier stub for x86_32. 155 bs->nmethod_entry_barrier(this, nullptr /* slow_path */, nullptr /* continuation */); 156 #endif 157 } 158 } 159 160 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) { 161 switch (vlen_in_bytes) { 162 case 4: // fall-through 163 case 8: // fall-through 164 case 16: return Assembler::AVX_128bit; 165 case 32: return Assembler::AVX_256bit; 166 case 64: return Assembler::AVX_512bit; 167 168 default: { 169 ShouldNotReachHere(); 170 return Assembler::AVX_NoVec; 171 } 172 } 173 } 174 175 // fast_lock and fast_unlock used by C2 176 177 // Because the transitions from emitted code to the runtime 178 // monitorenter/exit helper stubs are so slow it's critical that 179 // we inline both the stack-locking fast path and the inflated fast path. 180 // 181 // See also: cmpFastLock and cmpFastUnlock. 182 // 183 // What follows is a specialized inline transliteration of the code 184 // in enter() and exit(). If we're concerned about I$ bloat another 185 // option would be to emit TrySlowEnter and TrySlowExit methods 186 // at startup-time. These methods would accept arguments as 187 // (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure 188 // indications in the icc.ZFlag. 
fast_lock and fast_unlock would simply 189 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit. 190 // In practice, however, the # of lock sites is bounded and is usually small. 191 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer 192 // if the processor uses simple bimodal branch predictors keyed by EIP 193 // Since the helper routines would be called from multiple synchronization 194 // sites. 195 // 196 // An even better approach would be write "MonitorEnter()" and "MonitorExit()" 197 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites 198 // to those specialized methods. That'd give us a mostly platform-independent 199 // implementation that the JITs could optimize and inline at their pleasure. 200 // Done correctly, the only time we'd need to cross to native could would be 201 // to park() or unpark() threads. We'd also need a few more unsafe operators 202 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and 203 // (b) explicit barriers or fence operations. 204 // 205 // TODO: 206 // 207 // * Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr). 208 // This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals. 209 // Given TLAB allocation, Self is usually manifested in a register, so passing it into 210 // the lock operators would typically be faster than reifying Self. 211 // 212 // * Ideally I'd define the primitives as: 213 // fast_lock (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED. 214 // fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED 215 // Unfortunately ADLC bugs prevent us from expressing the ideal form. 216 // Instead, we're stuck with a rather awkward and brittle register assignments below. 217 // Furthermore the register assignments are overconstrained, possibly resulting in 218 // sub-optimal code near the synchronization site. 219 // 220 // * Eliminate the sp-proximity tests and just use "== Self" tests instead. 221 // Alternately, use a better sp-proximity test. 222 // 223 // * Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value. 224 // Either one is sufficient to uniquely identify a thread. 225 // TODO: eliminate use of sp in _owner and use get_thread(tr) instead. 226 // 227 // * Intrinsify notify() and notifyAll() for the common cases where the 228 // object is locked by the calling thread but the waitlist is empty. 229 // avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll(). 230 // 231 // * use jccb and jmpb instead of jcc and jmp to improve code density. 232 // But beware of excessive branch density on AMD Opterons. 233 // 234 // * Both fast_lock and fast_unlock set the ICC.ZF to indicate success 235 // or failure of the fast path. If the fast path fails then we pass 236 // control to the slow path, typically in C. In fast_lock and 237 // fast_unlock we often branch to DONE_LABEL, just to find that C2 238 // will emit a conditional branch immediately after the node. 239 // So we have branches to branches and lots of ICC.ZF games. 240 // Instead, it might be better to have C2 pass a "FailureLabel" 241 // into fast_lock and fast_unlock. In the case of success, control 242 // will drop through the node. ICC.ZF is undefined at exit. 
243 // In the case of failure, the node will branch directly to the 244 // FailureLabel 245 246 247 // obj: object to lock 248 // box: on-stack box address (displaced header location) - KILLED 249 // rax,: tmp -- KILLED 250 // scr: tmp -- KILLED 251 void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg, 252 Register scrReg, Register cx1Reg, Register cx2Reg, Register thread, 253 Metadata* method_data) { 254 assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight"); 255 // Ensure the register assignments are disjoint 256 assert(tmpReg == rax, ""); 257 assert(cx1Reg == noreg, ""); 258 assert(cx2Reg == noreg, ""); 259 assert_different_registers(objReg, boxReg, tmpReg, scrReg); 260 261 // Possible cases that we'll encounter in fast_lock 262 // ------------------------------------------------ 263 // * Inflated 264 // -- unlocked 265 // -- Locked 266 // = by self 267 // = by other 268 // * neutral 269 // * stack-locked 270 // -- by self 271 // = sp-proximity test hits 272 // = sp-proximity test generates false-negative 273 // -- by other 274 // 275 276 Label IsInflated, DONE_LABEL, NO_COUNT, COUNT; 277 278 if (DiagnoseSyncOnValueBasedClasses != 0) { 279 load_klass(tmpReg, objReg, scrReg); 280 testb(Address(tmpReg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class); 281 jcc(Assembler::notZero, DONE_LABEL); 282 } 283 284 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // [FETCH] 285 testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral 286 jcc(Assembler::notZero, IsInflated); 287 288 if (LockingMode == LM_MONITOR) { 289 // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0. 290 testptr(objReg, objReg); 291 } else { 292 assert(LockingMode == LM_LEGACY, "must be"); 293 // Attempt stack-locking ... 294 orptr (tmpReg, markWord::unlocked_value); 295 movptr(Address(boxReg, 0), tmpReg); // Anticipate successful CAS 296 lock(); 297 cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Updates tmpReg 298 jcc(Assembler::equal, COUNT); // Success 299 300 // Recursive locking. 301 // The object is stack-locked: markword contains stack pointer to BasicLock. 302 // Locked by current thread if difference with current SP is less than one page. 303 subptr(tmpReg, rsp); 304 // Next instruction set ZFlag == 1 (Success) if difference is less then one page. 305 andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - (int)os::vm_page_size())) ); 306 movptr(Address(boxReg, 0), tmpReg); 307 } 308 jmp(DONE_LABEL); 309 310 bind(IsInflated); 311 // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value 312 313 #ifndef _LP64 314 // The object is inflated. 315 316 // boxReg refers to the on-stack BasicLock in the current frame. 317 // We'd like to write: 318 // set box->_displaced_header = markWord::unused_mark(). Any non-0 value suffices. 319 // This is convenient but results a ST-before-CAS penalty. The following CAS suffers 320 // additional latency as we have another ST in the store buffer that must drain. 321 322 // avoid ST-before-CAS 323 // register juggle because we need tmpReg for cmpxchgptr below 324 movptr(scrReg, boxReg); 325 movptr(boxReg, tmpReg); // consider: LEA box, [tmp-2] 326 327 // Optimistic form: consider XORL tmpReg,tmpReg 328 movptr(tmpReg, NULL_WORD); 329 330 // Appears unlocked - try to swing _owner from null to non-null. 
331 // Ideally, I'd manifest "Self" with get_thread and then attempt 332 // to CAS the register containing Self into m->Owner. 333 // But we don't have enough registers, so instead we can either try to CAS 334 // rsp or the address of the box (in scr) into &m->owner. If the CAS succeeds 335 // we later store "Self" into m->Owner. Transiently storing a stack address 336 // (rsp or the address of the box) into m->owner is harmless. 337 // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand. 338 lock(); 339 cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner))); 340 movptr(Address(scrReg, 0), 3); // box->_displaced_header = 3 341 // If we weren't able to swing _owner from null to the BasicLock 342 // then take the slow path. 343 jccb (Assembler::notZero, NO_COUNT); 344 // update _owner from BasicLock to thread 345 get_thread (scrReg); // beware: clobbers ICCs 346 movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg); 347 xorptr(boxReg, boxReg); // set icc.ZFlag = 1 to indicate success 348 349 // If the CAS fails we can either retry or pass control to the slow path. 350 // We use the latter tactic. 351 // Pass the CAS result in the icc.ZFlag into DONE_LABEL 352 // If the CAS was successful ... 353 // Self has acquired the lock 354 // Invariant: m->_recursions should already be 0, so we don't need to explicitly set it. 355 // Intentional fall-through into DONE_LABEL ... 356 #else // _LP64 357 // It's inflated and we use scrReg for ObjectMonitor* in this section. 358 movq(scrReg, tmpReg); 359 xorq(tmpReg, tmpReg); 360 lock(); 361 cmpxchgptr(thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner))); 362 // Unconditionally set box->_displaced_header = markWord::unused_mark(). 363 // Without cast to int32_t this style of movptr will destroy r10 which is typically obj. 364 movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value())); 365 // Propagate ICC.ZF from CAS above into DONE_LABEL. 366 jccb(Assembler::equal, COUNT); // CAS above succeeded; propagate ZF = 1 (success) 367 368 cmpptr(thread, rax); // Check if we are already the owner (recursive lock) 369 jccb(Assembler::notEqual, NO_COUNT); // If not recursive, ZF = 0 at this point (fail) 370 incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions))); 371 xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success 372 #endif // _LP64 373 bind(DONE_LABEL); 374 375 // ZFlag == 1 count in fast path 376 // ZFlag == 0 count in slow path 377 jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0 378 379 bind(COUNT); 380 // Count monitors in fast path 381 increment(Address(thread, JavaThread::held_monitor_count_offset())); 382 383 xorl(tmpReg, tmpReg); // Set ZF == 1 384 385 bind(NO_COUNT); 386 387 // At NO_COUNT the icc ZFlag is set as follows ... 388 // fast_unlock uses the same protocol. 389 // ZFlag == 1 -> Success 390 // ZFlag == 0 -> Failure - force control through the slow path 391 } 392 393 // obj: object to unlock 394 // box: box address (displaced header location), killed. Must be EAX. 395 // tmp: killed, cannot be obj nor box. 396 // 397 // Some commentary on balanced locking: 398 // 399 // fast_lock and fast_unlock are emitted only for provably balanced lock sites. 400 // Methods that don't have provably balanced locking are forced to run in the 401 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock. 
402 // The interpreter provides two properties: 403 // I1: At return-time the interpreter automatically and quietly unlocks any 404 // objects acquired the current activation (frame). Recall that the 405 // interpreter maintains an on-stack list of locks currently held by 406 // a frame. 407 // I2: If a method attempts to unlock an object that is not held by the 408 // the frame the interpreter throws IMSX. 409 // 410 // Lets say A(), which has provably balanced locking, acquires O and then calls B(). 411 // B() doesn't have provably balanced locking so it runs in the interpreter. 412 // Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O 413 // is still locked by A(). 414 // 415 // The only other source of unbalanced locking would be JNI. The "Java Native Interface: 416 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter 417 // should not be unlocked by "normal" java-level locking and vice-versa. The specification 418 // doesn't specify what will occur if a program engages in such mixed-mode locking, however. 419 // Arguably given that the spec legislates the JNI case as undefined our implementation 420 // could reasonably *avoid* checking owner in fast_unlock(). 421 // In the interest of performance we elide m->Owner==Self check in unlock. 422 // A perfectly viable alternative is to elide the owner check except when 423 // Xcheck:jni is enabled. 424 425 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg) { 426 assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight"); 427 assert(boxReg == rax, ""); 428 assert_different_registers(objReg, boxReg, tmpReg); 429 430 Label DONE_LABEL, Stacked, COUNT, NO_COUNT; 431 432 if (LockingMode == LM_LEGACY) { 433 cmpptr(Address(boxReg, 0), NULL_WORD); // Examine the displaced header 434 jcc (Assembler::zero, COUNT); // 0 indicates recursive stack-lock 435 } 436 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Examine the object's markword 437 if (LockingMode != LM_MONITOR) { 438 testptr(tmpReg, markWord::monitor_value); // Inflated? 439 jcc(Assembler::zero, Stacked); 440 } 441 442 // It's inflated. 443 444 // Despite our balanced locking property we still check that m->_owner == Self 445 // as java routines or native JNI code called by this thread might 446 // have released the lock. 447 // Refer to the comments in synchronizer.cpp for how we might encode extra 448 // state in _succ so we can avoid fetching EntryList|cxq. 449 // 450 // If there's no contention try a 1-0 exit. That is, exit without 451 // a costly MEMBAR or CAS. See synchronizer.cpp for details on how 452 // we detect and recover from the race that the 1-0 exit admits. 453 // 454 // Conceptually fast_unlock() must execute a STST|LDST "release" barrier 455 // before it STs null into _owner, releasing the lock. Updates 456 // to data protected by the critical section must be visible before 457 // we drop the lock (and thus before any other thread could acquire 458 // the lock and observe the fields protected by the lock). 459 // IA32's memory-model is SPO, so STs are ordered with respect to 460 // each other and there's no need for an explicit barrier (fence). 461 // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html. 
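  // (Illustrative sketch, not part of the original code.) The inflated-unlock
  // sequence below corresponds roughly to the following logic, using the
  // ObjectMonitor/JavaThread fields referenced by the assembly:
  //
  //   if (m->_recursions != 0) { m->_recursions--; return true; }  // recursive exit
  //   m->_owner = nullptr;                                // release the lock
  //   fence();                                            // StoreLoad: publish before re-checking waiters
  //   if ((m->_cxq | m->_EntryList) == 0) return true;    // no waiters: 1-0 exit done
  //   if (m->_succ != nullptr)            return true;    // a successor will re-check
  //   thread->_unlocked_inflated_monitor = m;             // let the slow path try to reacquire
  //   return false;                                       // ZF == 0 -> slow path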
462 Label LSuccess, LNotRecursive; 463 464 cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0); 465 jccb(Assembler::equal, LNotRecursive); 466 467 // Recursive inflated unlock 468 decrement(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions))); 469 jmpb(LSuccess); 470 471 bind(LNotRecursive); 472 473 // Set owner to null. 474 // Release to satisfy the JMM 475 movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD); 476 // We need a full fence after clearing owner to avoid stranding. 477 // StoreLoad achieves this. 478 membar(StoreLoad); 479 480 // Check if the entry lists are empty. 481 movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq))); 482 orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList))); 483 jccb(Assembler::zero, LSuccess); // If so we are done. 484 485 // Check if there is a successor. 486 cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD); 487 jccb(Assembler::notZero, LSuccess); // If so we are done. 488 489 // Save the monitor pointer in the current thread, so we can try to 490 // reacquire the lock in SharedRuntime::monitor_exit_helper(). 491 andptr(tmpReg, ~(int32_t)markWord::monitor_value); 492 #ifndef _LP64 493 get_thread(boxReg); 494 movptr(Address(boxReg, JavaThread::unlocked_inflated_monitor_offset()), tmpReg); 495 #else // _LP64 496 movptr(Address(r15_thread, JavaThread::unlocked_inflated_monitor_offset()), tmpReg); 497 #endif 498 499 orl (boxReg, 1); // set ICC.ZF=0 to indicate failure 500 jmpb (DONE_LABEL); 501 502 bind (LSuccess); 503 testl (boxReg, 0); // set ICC.ZF=1 to indicate success 504 jmpb (DONE_LABEL); 505 506 if (LockingMode == LM_LEGACY) { 507 bind (Stacked); 508 movptr(tmpReg, Address (boxReg, 0)); // re-fetch 509 lock(); 510 cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box 511 // Intentional fall-thru into DONE_LABEL 512 } 513 514 bind(DONE_LABEL); 515 516 // ZFlag == 1 count in fast path 517 // ZFlag == 0 count in slow path 518 jccb(Assembler::notZero, NO_COUNT); 519 520 bind(COUNT); 521 // Count monitors in fast path 522 #ifndef _LP64 523 get_thread(tmpReg); 524 decrementl(Address(tmpReg, JavaThread::held_monitor_count_offset())); 525 #else // _LP64 526 decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset())); 527 #endif 528 529 xorl(tmpReg, tmpReg); // Set ZF == 1 530 531 bind(NO_COUNT); 532 } 533 534 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register rax_reg, 535 Register t, Register thread) { 536 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 537 assert(rax_reg == rax, "Used for CAS"); 538 assert_different_registers(obj, box, rax_reg, t, thread); 539 540 // Handle inflated monitor. 541 Label inflated; 542 // Finish fast lock successfully. ZF value is irrelevant. 543 Label locked; 544 // Finish fast lock unsuccessfully. MUST jump with ZF == 0 545 Label slow_path; 546 547 if (UseObjectMonitorTable) { 548 // Clear cache in case fast locking succeeds. 549 movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0); 550 } 551 552 if (DiagnoseSyncOnValueBasedClasses != 0) { 553 load_klass(rax_reg, obj, t); 554 testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class); 555 jcc(Assembler::notZero, slow_path); 556 } 557 558 const Register mark = t; 559 560 { // Lightweight Lock 561 562 Label push; 563 564 const Register top = UseObjectMonitorTable ? rax_reg : box; 565 566 // Load the mark. 
567 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 568 569 // Prefetch top. 570 movl(top, Address(thread, JavaThread::lock_stack_top_offset())); 571 572 // Check for monitor (0b10). 573 testptr(mark, markWord::monitor_value); 574 jcc(Assembler::notZero, inflated); 575 576 // Check if lock-stack is full. 577 cmpl(top, LockStack::end_offset() - 1); 578 jcc(Assembler::greater, slow_path); 579 580 // Check if recursive. 581 cmpptr(obj, Address(thread, top, Address::times_1, -oopSize)); 582 jccb(Assembler::equal, push); 583 584 // Try to lock. Transition lock bits 0b01 => 0b00 585 movptr(rax_reg, mark); 586 orptr(rax_reg, markWord::unlocked_value); 587 andptr(mark, ~(int32_t)markWord::unlocked_value); 588 lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 589 jcc(Assembler::notEqual, slow_path); 590 591 if (UseObjectMonitorTable) { 592 // Need to reload top, clobbered by CAS. 593 movl(top, Address(thread, JavaThread::lock_stack_top_offset())); 594 } 595 bind(push); 596 // After successful lock, push object on lock-stack. 597 movptr(Address(thread, top), obj); 598 addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize); 599 jmpb(locked); 600 } 601 602 { // Handle inflated monitor. 603 bind(inflated); 604 605 const Register monitor = t; 606 607 if (!UseObjectMonitorTable) { 608 assert(mark == monitor, "should be the same here"); 609 } else { 610 // Uses ObjectMonitorTable. Look for the monitor in the om_cache. 611 // Fetch ObjectMonitor* from the cache or take the slow-path. 612 Label monitor_found; 613 614 // Load cache address 615 lea(t, Address(thread, JavaThread::om_cache_oops_offset())); 616 617 const int num_unrolled = 2; 618 for (int i = 0; i < num_unrolled; i++) { 619 cmpptr(obj, Address(t)); 620 jccb(Assembler::equal, monitor_found); 621 increment(t, in_bytes(OMCache::oop_to_oop_difference())); 622 } 623 624 Label loop; 625 626 // Search for obj in cache. 627 bind(loop); 628 629 // Check for match. 630 cmpptr(obj, Address(t)); 631 jccb(Assembler::equal, monitor_found); 632 633 // Search until null encountered, guaranteed _null_sentinel at end. 634 cmpptr(Address(t), 1); 635 jcc(Assembler::below, slow_path); // 0 check, but with ZF=0 when *t == 0 636 increment(t, in_bytes(OMCache::oop_to_oop_difference())); 637 jmpb(loop); 638 639 // Cache hit. 640 bind(monitor_found); 641 movptr(monitor, Address(t, OMCache::oop_to_monitor_difference())); 642 } 643 const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value)); 644 const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag); 645 const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag); 646 647 Label monitor_locked; 648 // Lock the monitor. 649 650 // CAS owner (null => current thread). 651 xorptr(rax_reg, rax_reg); 652 lock(); cmpxchgptr(thread, owner_address); 653 jccb(Assembler::equal, monitor_locked); 654 655 // Check if recursive. 656 cmpptr(thread, rax_reg); 657 jccb(Assembler::notEqual, slow_path); 658 659 // Recursive. 660 increment(recursions_address); 661 662 bind(monitor_locked); 663 if (UseObjectMonitorTable) { 664 // Cache the monitor for unlock 665 movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor); 666 } 667 } 668 669 bind(locked); 670 increment(Address(thread, JavaThread::held_monitor_count_offset())); 671 // Set ZF = 1 672 xorl(rax_reg, rax_reg); 673 674 #ifdef ASSERT 675 // Check that locked label is reached with ZF set. 
676 Label zf_correct; 677 Label zf_bad_zero; 678 jcc(Assembler::zero, zf_correct); 679 jmp(zf_bad_zero); 680 #endif 681 682 bind(slow_path); 683 #ifdef ASSERT 684 // Check that slow_path label is reached with ZF not set. 685 jcc(Assembler::notZero, zf_correct); 686 stop("Fast Lock ZF != 0"); 687 bind(zf_bad_zero); 688 stop("Fast Lock ZF != 1"); 689 bind(zf_correct); 690 #endif 691 // C2 uses the value of ZF to determine the continuation. 692 } 693 694 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register reg_rax, Register t, Register thread) { 695 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 696 assert(reg_rax == rax, "Used for CAS"); 697 assert_different_registers(obj, reg_rax, t); 698 699 // Handle inflated monitor. 700 Label inflated, inflated_check_lock_stack; 701 // Finish fast unlock successfully. MUST jump with ZF == 1 702 Label unlocked, slow_path; 703 704 const Register mark = t; 705 const Register monitor = t; 706 const Register top = UseObjectMonitorTable ? t : reg_rax; 707 const Register box = reg_rax; 708 709 Label dummy; 710 C2FastUnlockLightweightStub* stub = nullptr; 711 712 if (!Compile::current()->output()->in_scratch_emit_size()) { 713 stub = new (Compile::current()->comp_arena()) C2FastUnlockLightweightStub(obj, mark, reg_rax, thread); 714 Compile::current()->output()->add_stub(stub); 715 } 716 717 Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path(); 718 719 { // Lightweight Unlock 720 721 // Load top. 722 movl(top, Address(thread, JavaThread::lock_stack_top_offset())); 723 724 if (!UseObjectMonitorTable) { 725 // Prefetch mark. 726 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 727 } 728 729 // Check if obj is top of lock-stack. 730 cmpptr(obj, Address(thread, top, Address::times_1, -oopSize)); 731 // Top of lock stack was not obj. Must be monitor. 732 jcc(Assembler::notEqual, inflated_check_lock_stack); 733 734 // Pop lock-stack. 735 DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);) 736 subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize); 737 738 // Check if recursive. 739 cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize)); 740 jcc(Assembler::equal, unlocked); 741 742 // We elide the monitor check, let the CAS fail instead. 743 744 if (UseObjectMonitorTable) { 745 // Load mark. 746 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 747 } 748 749 // Try to unlock. Transition lock bits 0b00 => 0b01 750 movptr(reg_rax, mark); 751 andptr(reg_rax, ~(int32_t)markWord::lock_mask); 752 orptr(mark, markWord::unlocked_value); 753 lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 754 jcc(Assembler::notEqual, push_and_slow_path); 755 jmp(unlocked); 756 } 757 758 759 { // Handle inflated monitor. 
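    // (Illustrative sketch, not part of the original code.) With
    // UseObjectMonitorTable the mark word no longer carries the ObjectMonitor*,
    // so this block locates the monitor through the BasicLock cache that
    // fast_lock_lightweight filled in, roughly:
    //
    //   ObjectMonitor* m = box->object_monitor_cache();             // hypothetical accessor
    //   if ((uintptr_t)m < alignof(ObjectMonitor*)) goto slow_path; // no cached monitor
    //
    // Without the table the monitor is simply mark - markWord::monitor_value.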
760 bind(inflated_check_lock_stack); 761 #ifdef ASSERT 762 Label check_done; 763 subl(top, oopSize); 764 cmpl(top, in_bytes(JavaThread::lock_stack_base_offset())); 765 jcc(Assembler::below, check_done); 766 cmpptr(obj, Address(thread, top)); 767 jccb(Assembler::notEqual, inflated_check_lock_stack); 768 stop("Fast Unlock lock on stack"); 769 bind(check_done); 770 if (UseObjectMonitorTable) { 771 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 772 } 773 testptr(mark, markWord::monitor_value); 774 jccb(Assembler::notZero, inflated); 775 stop("Fast Unlock not monitor"); 776 #endif 777 778 bind(inflated); 779 780 if (!UseObjectMonitorTable) { 781 assert(mark == monitor, "should be the same here"); 782 } else { 783 // Uses ObjectMonitorTable. Look for the monitor in our BasicLock on the stack. 784 movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes())); 785 // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*) 786 cmpptr(monitor, alignof(ObjectMonitor*)); 787 jcc(Assembler::below, slow_path); 788 } 789 const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value)); 790 const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag}; 791 const Address cxq_address{monitor, ObjectMonitor::cxq_offset() - monitor_tag}; 792 const Address succ_address{monitor, ObjectMonitor::succ_offset() - monitor_tag}; 793 const Address EntryList_address{monitor, ObjectMonitor::EntryList_offset() - monitor_tag}; 794 const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag}; 795 796 Label recursive; 797 798 // Check if recursive. 799 cmpptr(recursions_address, 0); 800 jccb(Assembler::notZero, recursive); 801 802 // Set owner to null. 803 // Release to satisfy the JMM 804 movptr(owner_address, NULL_WORD); 805 // We need a full fence after clearing owner to avoid stranding. 806 // StoreLoad achieves this. 807 membar(StoreLoad); 808 809 // Check if the entry lists are empty. 810 movptr(reg_rax, cxq_address); 811 orptr(reg_rax, EntryList_address); 812 jccb(Assembler::zero, unlocked); // If so we are done. 813 814 // Check if there is a successor. 815 cmpptr(succ_address, NULL_WORD); 816 jccb(Assembler::notZero, unlocked); // If so we are done. 817 818 // Save the monitor pointer in the current thread, so we can try to 819 // reacquire the lock in SharedRuntime::monitor_exit_helper(). 820 if (!UseObjectMonitorTable) { 821 andptr(monitor, ~(int32_t)markWord::monitor_value); 822 } 823 movptr(Address(thread, JavaThread::unlocked_inflated_monitor_offset()), monitor); 824 825 testl(monitor, monitor); // Fast Unlock ZF = 0 826 jmpb(slow_path); 827 828 // Recursive unlock. 829 bind(recursive); 830 decrement(recursions_address); 831 } 832 833 bind(unlocked); 834 decrement(Address(thread, JavaThread::held_monitor_count_offset())); 835 xorl(t, t); // Fast Unlock ZF = 1 836 837 #ifdef ASSERT 838 // Check that unlocked label is reached with ZF set. 839 Label zf_correct; 840 jcc(Assembler::zero, zf_correct); 841 stop("Fast Unlock ZF != 1"); 842 #endif 843 844 bind(slow_path); 845 if (stub != nullptr) { 846 bind(stub->slow_path_continuation()); 847 } 848 #ifdef ASSERT 849 // Check that stub->continuation() label is reached with ZF not set. 850 jccb(Assembler::notZero, zf_correct); 851 stop("Fast Unlock ZF != 0"); 852 bind(zf_correct); 853 #endif 854 // C2 uses the value of ZF to determine the continuation. 
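  // (Added note.) As with fast_lock/fast_unlock above, ZF == 1 means the fast
  // path succeeded and the matching cmpFastUnlock-style node in the .ad file
  // falls through, while ZF == 0 routes control to the runtime slow path,
  // conceptually:
  //
  //   if (/* ZF == 0 */ !fast_path_succeeded) {
  //     call_runtime_monitor_exit(obj);   // illustrative name for the slow-path call
  //   }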
855 } 856 857 //------------------------------------------------------------------------------------------- 858 // Generic instructions support for use in .ad files C2 code generation 859 860 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) { 861 if (dst != src) { 862 movdqu(dst, src); 863 } 864 if (opcode == Op_AbsVD) { 865 andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg); 866 } else { 867 assert((opcode == Op_NegVD),"opcode should be Op_NegD"); 868 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg); 869 } 870 } 871 872 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) { 873 if (opcode == Op_AbsVD) { 874 vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg); 875 } else { 876 assert((opcode == Op_NegVD),"opcode should be Op_NegD"); 877 vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg); 878 } 879 } 880 881 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) { 882 if (dst != src) { 883 movdqu(dst, src); 884 } 885 if (opcode == Op_AbsVF) { 886 andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg); 887 } else { 888 assert((opcode == Op_NegVF),"opcode should be Op_NegF"); 889 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg); 890 } 891 } 892 893 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) { 894 if (opcode == Op_AbsVF) { 895 vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg); 896 } else { 897 assert((opcode == Op_NegVF),"opcode should be Op_NegF"); 898 vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg); 899 } 900 } 901 902 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) { 903 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity"); 904 assert(tmp == xnoreg || elem_bt == T_LONG, "unused"); 905 906 if (opcode == Op_MinV) { 907 if (elem_bt == T_BYTE) { 908 pminsb(dst, src); 909 } else if (elem_bt == T_SHORT) { 910 pminsw(dst, src); 911 } else if (elem_bt == T_INT) { 912 pminsd(dst, src); 913 } else { 914 assert(elem_bt == T_LONG, "required"); 915 assert(tmp == xmm0, "required"); 916 assert_different_registers(dst, src, tmp); 917 movdqu(xmm0, dst); 918 pcmpgtq(xmm0, src); 919 blendvpd(dst, src); // xmm0 as mask 920 } 921 } else { // opcode == Op_MaxV 922 if (elem_bt == T_BYTE) { 923 pmaxsb(dst, src); 924 } else if (elem_bt == T_SHORT) { 925 pmaxsw(dst, src); 926 } else if (elem_bt == T_INT) { 927 pmaxsd(dst, src); 928 } else { 929 assert(elem_bt == T_LONG, "required"); 930 assert(tmp == xmm0, "required"); 931 assert_different_registers(dst, src, tmp); 932 movdqu(xmm0, src); 933 pcmpgtq(xmm0, dst); 934 blendvpd(dst, src); // xmm0 as mask 935 } 936 } 937 } 938 939 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt, 940 XMMRegister dst, XMMRegister src1, XMMRegister src2, 941 int vlen_enc) { 942 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity"); 943 944 if (opcode == Op_MinV) { 945 if (elem_bt == T_BYTE) { 946 vpminsb(dst, src1, src2, vlen_enc); 947 } else if (elem_bt == T_SHORT) { 948 vpminsw(dst, src1, src2, vlen_enc); 949 } else if (elem_bt == T_INT) { 950 vpminsd(dst, src1, src2, vlen_enc); 951 } else { 952 assert(elem_bt == T_LONG, "required"); 953 if (UseAVX > 2 
&& (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) { 954 vpminsq(dst, src1, src2, vlen_enc); 955 } else { 956 assert_different_registers(dst, src1, src2); 957 vpcmpgtq(dst, src1, src2, vlen_enc); 958 vblendvpd(dst, src1, src2, dst, vlen_enc); 959 } 960 } 961 } else { // opcode == Op_MaxV 962 if (elem_bt == T_BYTE) { 963 vpmaxsb(dst, src1, src2, vlen_enc); 964 } else if (elem_bt == T_SHORT) { 965 vpmaxsw(dst, src1, src2, vlen_enc); 966 } else if (elem_bt == T_INT) { 967 vpmaxsd(dst, src1, src2, vlen_enc); 968 } else { 969 assert(elem_bt == T_LONG, "required"); 970 if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) { 971 vpmaxsq(dst, src1, src2, vlen_enc); 972 } else { 973 assert_different_registers(dst, src1, src2); 974 vpcmpgtq(dst, src1, src2, vlen_enc); 975 vblendvpd(dst, src2, src1, dst, vlen_enc); 976 } 977 } 978 } 979 } 980 981 // Float/Double min max 982 983 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt, 984 XMMRegister dst, XMMRegister a, XMMRegister b, 985 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 986 int vlen_enc) { 987 assert(UseAVX > 0, "required"); 988 assert(opcode == Op_MinV || opcode == Op_MinReductionV || 989 opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity"); 990 assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity"); 991 assert_different_registers(a, tmp, atmp, btmp); 992 assert_different_registers(b, tmp, atmp, btmp); 993 994 bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV); 995 bool is_double_word = is_double_word_type(elem_bt); 996 997 /* Note on 'non-obvious' assembly sequence: 998 * 999 * While there are vminps/vmaxps instructions, there are two important differences between hardware 1000 * and Java on how they handle floats: 1001 * a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal) 1002 * b. NaN is not necesarily propagated (vminps/vmaxps will return second parameter when either input is NaN) 1003 * 1004 * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing: 1005 * a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps) 1006 * (only useful when signs differ, noop otherwise) 1007 * b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q]) 1008 1009 * Following pseudo code describes the algorithm for max[FD] (Min algorithm is on similar lines): 1010 * btmp = (b < +0.0) ? a : b 1011 * atmp = (b < +0.0) ? b : a 1012 * Tmp = Max_Float(atmp , btmp) 1013 * Res = (atmp == NaN) ? 
atmp : Tmp 1014 */ 1015 1016 void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister); 1017 void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int); 1018 void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int); 1019 XMMRegister mask; 1020 1021 if (!is_double_word && is_min) { 1022 mask = a; 1023 vblend = &MacroAssembler::vblendvps; 1024 vmaxmin = &MacroAssembler::vminps; 1025 vcmp = &MacroAssembler::vcmpps; 1026 } else if (!is_double_word && !is_min) { 1027 mask = b; 1028 vblend = &MacroAssembler::vblendvps; 1029 vmaxmin = &MacroAssembler::vmaxps; 1030 vcmp = &MacroAssembler::vcmpps; 1031 } else if (is_double_word && is_min) { 1032 mask = a; 1033 vblend = &MacroAssembler::vblendvpd; 1034 vmaxmin = &MacroAssembler::vminpd; 1035 vcmp = &MacroAssembler::vcmppd; 1036 } else { 1037 assert(is_double_word && !is_min, "sanity"); 1038 mask = b; 1039 vblend = &MacroAssembler::vblendvpd; 1040 vmaxmin = &MacroAssembler::vmaxpd; 1041 vcmp = &MacroAssembler::vcmppd; 1042 } 1043 1044 // Make sure EnableX86ECoreOpts isn't disabled on register overlaps 1045 XMMRegister maxmin, scratch; 1046 if (dst == btmp) { 1047 maxmin = btmp; 1048 scratch = tmp; 1049 } else { 1050 maxmin = tmp; 1051 scratch = btmp; 1052 } 1053 1054 bool precompute_mask = EnableX86ECoreOpts && UseAVX>1; 1055 if (precompute_mask && !is_double_word) { 1056 vpsrad(tmp, mask, 32, vlen_enc); 1057 mask = tmp; 1058 } else if (precompute_mask && is_double_word) { 1059 vpxor(tmp, tmp, tmp, vlen_enc); 1060 vpcmpgtq(tmp, tmp, mask, vlen_enc); 1061 mask = tmp; 1062 } 1063 1064 (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp); 1065 (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp); 1066 (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc); 1067 (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1068 (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch); 1069 } 1070 1071 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt, 1072 XMMRegister dst, XMMRegister a, XMMRegister b, 1073 KRegister ktmp, XMMRegister atmp, XMMRegister btmp, 1074 int vlen_enc) { 1075 assert(UseAVX > 2, "required"); 1076 assert(opcode == Op_MinV || opcode == Op_MinReductionV || 1077 opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity"); 1078 assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity"); 1079 assert_different_registers(dst, a, atmp, btmp); 1080 assert_different_registers(dst, b, atmp, btmp); 1081 1082 bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV); 1083 bool is_double_word = is_double_word_type(elem_bt); 1084 bool merge = true; 1085 1086 if (!is_double_word && is_min) { 1087 evpmovd2m(ktmp, a, vlen_enc); 1088 evblendmps(atmp, ktmp, a, b, merge, vlen_enc); 1089 evblendmps(btmp, ktmp, b, a, merge, vlen_enc); 1090 vminps(dst, atmp, btmp, vlen_enc); 1091 evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1092 evmovdqul(dst, ktmp, atmp, merge, vlen_enc); 1093 } else if (!is_double_word && !is_min) { 1094 evpmovd2m(ktmp, b, vlen_enc); 1095 evblendmps(atmp, ktmp, a, b, merge, vlen_enc); 1096 evblendmps(btmp, ktmp, b, a, merge, vlen_enc); 1097 vmaxps(dst, atmp, btmp, vlen_enc); 1098 evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1099 evmovdqul(dst, ktmp, atmp, merge, vlen_enc); 1100 } else if (is_double_word && is_min) { 1101 evpmovq2m(ktmp, a, vlen_enc); 1102 evblendmpd(atmp, ktmp, a, b, merge, vlen_enc); 1103 evblendmpd(btmp, ktmp, b, a, merge, vlen_enc); 1104 
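  // (Added explanatory comment.) At this point ktmp holds the sign bits of 'a',
  // so atmp == (sign(a) ? b : a) and btmp == (sign(a) ? a : b). Feeding these
  // into vminpd below exploits its "return the second source on ties or NaN"
  // behaviour so that -0.0 wins over +0.0, and the evcmppd/evmovdquq pair that
  // follows forces NaN propagation whenever atmp is NaN. A scalar reference for
  // the intended Java semantics (illustrative only):
  //
  //   double java_min(double a, double b) {
  //     if (a != a) return a;                                      // NaN propagates
  //     if (a == 0.0 && b == 0.0) return (1.0 / a < 0.0) ? a : b;  // -0.0 < +0.0
  //     return (a < b) ? a : b;
  //   }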
vminpd(dst, atmp, btmp, vlen_enc); 1105 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1106 evmovdquq(dst, ktmp, atmp, merge, vlen_enc); 1107 } else { 1108 assert(is_double_word && !is_min, "sanity"); 1109 evpmovq2m(ktmp, b, vlen_enc); 1110 evblendmpd(atmp, ktmp, a, b, merge, vlen_enc); 1111 evblendmpd(btmp, ktmp, b, a, merge, vlen_enc); 1112 vmaxpd(dst, atmp, btmp, vlen_enc); 1113 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1114 evmovdquq(dst, ktmp, atmp, merge, vlen_enc); 1115 } 1116 } 1117 1118 // Float/Double signum 1119 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) { 1120 assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity"); 1121 1122 Label DONE_LABEL; 1123 1124 if (opcode == Op_SignumF) { 1125 assert(UseSSE > 0, "required"); 1126 ucomiss(dst, zero); 1127 jcc(Assembler::equal, DONE_LABEL); // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument 1128 jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN 1129 movflt(dst, one); 1130 jcc(Assembler::above, DONE_LABEL); 1131 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg); 1132 } else if (opcode == Op_SignumD) { 1133 assert(UseSSE > 1, "required"); 1134 ucomisd(dst, zero); 1135 jcc(Assembler::equal, DONE_LABEL); // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument 1136 jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN 1137 movdbl(dst, one); 1138 jcc(Assembler::above, DONE_LABEL); 1139 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg); 1140 } 1141 1142 bind(DONE_LABEL); 1143 } 1144 1145 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) { 1146 if (sign) { 1147 pmovsxbw(dst, src); 1148 } else { 1149 pmovzxbw(dst, src); 1150 } 1151 } 1152 1153 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1154 if (sign) { 1155 vpmovsxbw(dst, src, vector_len); 1156 } else { 1157 vpmovzxbw(dst, src, vector_len); 1158 } 1159 } 1160 1161 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1162 if (sign) { 1163 vpmovsxbd(dst, src, vector_len); 1164 } else { 1165 vpmovzxbd(dst, src, vector_len); 1166 } 1167 } 1168 1169 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1170 if (sign) { 1171 vpmovsxwd(dst, src, vector_len); 1172 } else { 1173 vpmovzxwd(dst, src, vector_len); 1174 } 1175 } 1176 1177 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src, 1178 int shift, int vector_len) { 1179 if (opcode == Op_RotateLeftV) { 1180 if (etype == T_INT) { 1181 evprold(dst, src, shift, vector_len); 1182 } else { 1183 assert(etype == T_LONG, "expected type T_LONG"); 1184 evprolq(dst, src, shift, vector_len); 1185 } 1186 } else { 1187 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV"); 1188 if (etype == T_INT) { 1189 evprord(dst, src, shift, vector_len); 1190 } else { 1191 assert(etype == T_LONG, "expected type T_LONG"); 1192 evprorq(dst, src, shift, vector_len); 1193 } 1194 } 1195 } 1196 1197 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src, 1198 XMMRegister shift, int vector_len) { 1199 if (opcode == Op_RotateLeftV) { 1200 if (etype == T_INT) { 1201 evprolvd(dst, src, shift, vector_len); 1202 } else { 1203 assert(etype == 
T_LONG, "expected type T_LONG"); 1204 evprolvq(dst, src, shift, vector_len); 1205 } 1206 } else { 1207 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV"); 1208 if (etype == T_INT) { 1209 evprorvd(dst, src, shift, vector_len); 1210 } else { 1211 assert(etype == T_LONG, "expected type T_LONG"); 1212 evprorvq(dst, src, shift, vector_len); 1213 } 1214 } 1215 } 1216 1217 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) { 1218 if (opcode == Op_RShiftVI) { 1219 psrad(dst, shift); 1220 } else if (opcode == Op_LShiftVI) { 1221 pslld(dst, shift); 1222 } else { 1223 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI"); 1224 psrld(dst, shift); 1225 } 1226 } 1227 1228 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) { 1229 switch (opcode) { 1230 case Op_RShiftVI: psrad(dst, shift); break; 1231 case Op_LShiftVI: pslld(dst, shift); break; 1232 case Op_URShiftVI: psrld(dst, shift); break; 1233 1234 default: assert(false, "%s", NodeClassNames[opcode]); 1235 } 1236 } 1237 1238 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 1239 if (opcode == Op_RShiftVI) { 1240 vpsrad(dst, nds, shift, vector_len); 1241 } else if (opcode == Op_LShiftVI) { 1242 vpslld(dst, nds, shift, vector_len); 1243 } else { 1244 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI"); 1245 vpsrld(dst, nds, shift, vector_len); 1246 } 1247 } 1248 1249 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1250 switch (opcode) { 1251 case Op_RShiftVI: vpsrad(dst, src, shift, vlen_enc); break; 1252 case Op_LShiftVI: vpslld(dst, src, shift, vlen_enc); break; 1253 case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break; 1254 1255 default: assert(false, "%s", NodeClassNames[opcode]); 1256 } 1257 } 1258 1259 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) { 1260 switch (opcode) { 1261 case Op_RShiftVB: // fall-through 1262 case Op_RShiftVS: psraw(dst, shift); break; 1263 1264 case Op_LShiftVB: // fall-through 1265 case Op_LShiftVS: psllw(dst, shift); break; 1266 1267 case Op_URShiftVS: // fall-through 1268 case Op_URShiftVB: psrlw(dst, shift); break; 1269 1270 default: assert(false, "%s", NodeClassNames[opcode]); 1271 } 1272 } 1273 1274 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1275 switch (opcode) { 1276 case Op_RShiftVB: // fall-through 1277 case Op_RShiftVS: vpsraw(dst, src, shift, vlen_enc); break; 1278 1279 case Op_LShiftVB: // fall-through 1280 case Op_LShiftVS: vpsllw(dst, src, shift, vlen_enc); break; 1281 1282 case Op_URShiftVS: // fall-through 1283 case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break; 1284 1285 default: assert(false, "%s", NodeClassNames[opcode]); 1286 } 1287 } 1288 1289 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) { 1290 switch (opcode) { 1291 case Op_RShiftVL: psrlq(dst, shift); break; // using srl to implement sra on pre-avs512 systems 1292 case Op_LShiftVL: psllq(dst, shift); break; 1293 case Op_URShiftVL: psrlq(dst, shift); break; 1294 1295 default: assert(false, "%s", NodeClassNames[opcode]); 1296 } 1297 } 1298 1299 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) { 1300 if (opcode == Op_RShiftVL) { 1301 psrlq(dst, shift); // using srl to implement sra on pre-avs512 systems 1302 } else if (opcode == Op_LShiftVL) { 1303 
psllq(dst, shift); 1304 } else { 1305 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL"); 1306 psrlq(dst, shift); 1307 } 1308 } 1309 1310 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1311 switch (opcode) { 1312 case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break; 1313 case Op_LShiftVL: vpsllq(dst, src, shift, vlen_enc); break; 1314 case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break; 1315 1316 default: assert(false, "%s", NodeClassNames[opcode]); 1317 } 1318 } 1319 1320 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 1321 if (opcode == Op_RShiftVL) { 1322 evpsraq(dst, nds, shift, vector_len); 1323 } else if (opcode == Op_LShiftVL) { 1324 vpsllq(dst, nds, shift, vector_len); 1325 } else { 1326 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL"); 1327 vpsrlq(dst, nds, shift, vector_len); 1328 } 1329 } 1330 1331 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1332 switch (opcode) { 1333 case Op_RShiftVB: // fall-through 1334 case Op_RShiftVS: // fall-through 1335 case Op_RShiftVI: vpsravd(dst, src, shift, vlen_enc); break; 1336 1337 case Op_LShiftVB: // fall-through 1338 case Op_LShiftVS: // fall-through 1339 case Op_LShiftVI: vpsllvd(dst, src, shift, vlen_enc); break; 1340 1341 case Op_URShiftVB: // fall-through 1342 case Op_URShiftVS: // fall-through 1343 case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break; 1344 1345 default: assert(false, "%s", NodeClassNames[opcode]); 1346 } 1347 } 1348 1349 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1350 switch (opcode) { 1351 case Op_RShiftVB: // fall-through 1352 case Op_RShiftVS: evpsravw(dst, src, shift, vlen_enc); break; 1353 1354 case Op_LShiftVB: // fall-through 1355 case Op_LShiftVS: evpsllvw(dst, src, shift, vlen_enc); break; 1356 1357 case Op_URShiftVB: // fall-through 1358 case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break; 1359 1360 default: assert(false, "%s", NodeClassNames[opcode]); 1361 } 1362 } 1363 1364 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) { 1365 assert(UseAVX >= 2, "required"); 1366 switch (opcode) { 1367 case Op_RShiftVL: { 1368 if (UseAVX > 2) { 1369 assert(tmp == xnoreg, "not used"); 1370 if (!VM_Version::supports_avx512vl()) { 1371 vlen_enc = Assembler::AVX_512bit; 1372 } 1373 evpsravq(dst, src, shift, vlen_enc); 1374 } else { 1375 vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask())); 1376 vpsrlvq(dst, src, shift, vlen_enc); 1377 vpsrlvq(tmp, tmp, shift, vlen_enc); 1378 vpxor(dst, dst, tmp, vlen_enc); 1379 vpsubq(dst, dst, tmp, vlen_enc); 1380 } 1381 break; 1382 } 1383 case Op_LShiftVL: { 1384 assert(tmp == xnoreg, "not used"); 1385 vpsllvq(dst, src, shift, vlen_enc); 1386 break; 1387 } 1388 case Op_URShiftVL: { 1389 assert(tmp == xnoreg, "not used"); 1390 vpsrlvq(dst, src, shift, vlen_enc); 1391 break; 1392 } 1393 default: assert(false, "%s", NodeClassNames[opcode]); 1394 } 1395 } 1396 1397 // Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst 1398 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) { 1399 assert(opcode == Op_LShiftVB || 1400 opcode == Op_RShiftVB || 1401 
opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]); 1402 bool sign = (opcode != Op_URShiftVB); 1403 assert(vector_len == 0, "required"); 1404 vextendbd(sign, dst, src, 1); 1405 vpmovzxbd(vtmp, shift, 1); 1406 varshiftd(opcode, dst, dst, vtmp, 1); 1407 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg); 1408 vextracti128_high(vtmp, dst); 1409 vpackusdw(dst, dst, vtmp, 0); 1410 } 1411 1412 // Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst 1413 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) { 1414 assert(opcode == Op_LShiftVB || 1415 opcode == Op_RShiftVB || 1416 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]); 1417 bool sign = (opcode != Op_URShiftVB); 1418 int ext_vector_len = vector_len + 1; 1419 vextendbw(sign, dst, src, ext_vector_len); 1420 vpmovzxbw(vtmp, shift, ext_vector_len); 1421 varshiftw(opcode, dst, dst, vtmp, ext_vector_len); 1422 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg); 1423 if (vector_len == 0) { 1424 vextracti128_high(vtmp, dst); 1425 vpackuswb(dst, dst, vtmp, vector_len); 1426 } else { 1427 vextracti64x4_high(vtmp, dst); 1428 vpackuswb(dst, dst, vtmp, vector_len); 1429 vpermq(dst, dst, 0xD8, vector_len); 1430 } 1431 } 1432 1433 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) { 1434 switch(typ) { 1435 case T_BYTE: 1436 pinsrb(dst, val, idx); 1437 break; 1438 case T_SHORT: 1439 pinsrw(dst, val, idx); 1440 break; 1441 case T_INT: 1442 pinsrd(dst, val, idx); 1443 break; 1444 case T_LONG: 1445 pinsrq(dst, val, idx); 1446 break; 1447 default: 1448 assert(false,"Should not reach here."); 1449 break; 1450 } 1451 } 1452 1453 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) { 1454 switch(typ) { 1455 case T_BYTE: 1456 vpinsrb(dst, src, val, idx); 1457 break; 1458 case T_SHORT: 1459 vpinsrw(dst, src, val, idx); 1460 break; 1461 case T_INT: 1462 vpinsrd(dst, src, val, idx); 1463 break; 1464 case T_LONG: 1465 vpinsrq(dst, src, val, idx); 1466 break; 1467 default: 1468 assert(false,"Should not reach here."); 1469 break; 1470 } 1471 } 1472 1473 #ifdef _LP64 1474 void C2_MacroAssembler::vgather8b_masked_offset(BasicType elem_bt, 1475 XMMRegister dst, Register base, 1476 Register idx_base, 1477 Register offset, Register mask, 1478 Register mask_idx, Register rtmp, 1479 int vlen_enc) { 1480 vpxor(dst, dst, dst, vlen_enc); 1481 if (elem_bt == T_SHORT) { 1482 for (int i = 0; i < 4; i++) { 1483 // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0 1484 Label skip_load; 1485 btq(mask, mask_idx); 1486 jccb(Assembler::carryClear, skip_load); 1487 movl(rtmp, Address(idx_base, i * 4)); 1488 if (offset != noreg) { 1489 addl(rtmp, offset); 1490 } 1491 pinsrw(dst, Address(base, rtmp, Address::times_2), i); 1492 bind(skip_load); 1493 incq(mask_idx); 1494 } 1495 } else { 1496 assert(elem_bt == T_BYTE, ""); 1497 for (int i = 0; i < 8; i++) { 1498 // dst[i] = mask[i] ? 
src[offset + idx_base[i]] : 0 1499 Label skip_load; 1500 btq(mask, mask_idx); 1501 jccb(Assembler::carryClear, skip_load); 1502 movl(rtmp, Address(idx_base, i * 4)); 1503 if (offset != noreg) { 1504 addl(rtmp, offset); 1505 } 1506 pinsrb(dst, Address(base, rtmp), i); 1507 bind(skip_load); 1508 incq(mask_idx); 1509 } 1510 } 1511 } 1512 #endif // _LP64 1513 1514 void C2_MacroAssembler::vgather8b_offset(BasicType elem_bt, XMMRegister dst, 1515 Register base, Register idx_base, 1516 Register offset, Register rtmp, 1517 int vlen_enc) { 1518 vpxor(dst, dst, dst, vlen_enc); 1519 if (elem_bt == T_SHORT) { 1520 for (int i = 0; i < 4; i++) { 1521 // dst[i] = src[offset + idx_base[i]] 1522 movl(rtmp, Address(idx_base, i * 4)); 1523 if (offset != noreg) { 1524 addl(rtmp, offset); 1525 } 1526 pinsrw(dst, Address(base, rtmp, Address::times_2), i); 1527 } 1528 } else { 1529 assert(elem_bt == T_BYTE, ""); 1530 for (int i = 0; i < 8; i++) { 1531 // dst[i] = src[offset + idx_base[i]] 1532 movl(rtmp, Address(idx_base, i * 4)); 1533 if (offset != noreg) { 1534 addl(rtmp, offset); 1535 } 1536 pinsrb(dst, Address(base, rtmp), i); 1537 } 1538 } 1539 } 1540 1541 /* 1542 * Gather using hybrid algorithm, first partially unroll scalar loop 1543 * to accumulate values from gather indices into a quad-word(64bit) slice. 1544 * A slice may hold 8 bytes or 4 short values. This is followed by a vector 1545 * permutation to place the slice into appropriate vector lane 1546 * locations in destination vector. Following pseudo code describes the 1547 * algorithm in detail: 1548 * 1549 * DST_VEC = ZERO_VEC 1550 * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..} 1551 * TWO_VEC = {2, 2, 2, 2, 2, 2, 2, 2, 2..} 1552 * FOREACH_ITER: 1553 * TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES 1554 * TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX 1555 * DST_VEC = DST_VEC OR TEMP_PERM_VEC 1556 * PERM_INDEX = PERM_INDEX - TWO_VEC 1557 * 1558 * With each iteration, doubleword permute indices (0,1) corresponding 1559 * to gathered quadword gets right shifted by two lane positions. 1560 * 1561 */ 1562 void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst, 1563 Register base, Register idx_base, 1564 Register offset, Register mask, 1565 XMMRegister xtmp1, XMMRegister xtmp2, 1566 XMMRegister temp_dst, Register rtmp, 1567 Register mask_idx, Register length, 1568 int vector_len, int vlen_enc) { 1569 Label GATHER8_LOOP; 1570 assert(is_subword_type(elem_ty), ""); 1571 movl(length, vector_len); 1572 vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...} 1573 vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...} 1574 vallones(xtmp2, vlen_enc); 1575 vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc); 1576 vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...} 1577 load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...} 1578 1579 bind(GATHER8_LOOP); 1580 // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES 1581 if (mask == noreg) { 1582 vgather8b_offset(elem_ty, temp_dst, base, idx_base, offset, rtmp, vlen_enc); 1583 } else { 1584 LP64_ONLY(vgather8b_masked_offset(elem_ty, temp_dst, base, idx_base, offset, mask, mask_idx, rtmp, vlen_enc)); 1585 } 1586 // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1) 1587 vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? 
vlen_enc : Assembler::AVX_256bit); 1588 // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2) 1589 vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc); 1590 // DST_VEC = DST_VEC OR TEMP_PERM_VEC 1591 vpor(dst, dst, temp_dst, vlen_enc); 1592 addptr(idx_base, 32 >> (type2aelembytes(elem_ty) - 1)); 1593 subl(length, 8 >> (type2aelembytes(elem_ty) - 1)); 1594 jcc(Assembler::notEqual, GATHER8_LOOP); 1595 } 1596 1597 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) { 1598 switch(typ) { 1599 case T_INT: 1600 vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len); 1601 break; 1602 case T_FLOAT: 1603 vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len); 1604 break; 1605 case T_LONG: 1606 vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len); 1607 break; 1608 case T_DOUBLE: 1609 vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len); 1610 break; 1611 default: 1612 assert(false,"Should not reach here."); 1613 break; 1614 } 1615 } 1616 1617 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) { 1618 switch(typ) { 1619 case T_INT: 1620 evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len); 1621 break; 1622 case T_FLOAT: 1623 evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len); 1624 break; 1625 case T_LONG: 1626 evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len); 1627 break; 1628 case T_DOUBLE: 1629 evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len); 1630 break; 1631 default: 1632 assert(false,"Should not reach here."); 1633 break; 1634 } 1635 } 1636 1637 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) { 1638 switch(typ) { 1639 case T_INT: 1640 evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len); 1641 break; 1642 case T_FLOAT: 1643 evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len); 1644 break; 1645 case T_LONG: 1646 evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len); 1647 break; 1648 case T_DOUBLE: 1649 evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len); 1650 break; 1651 default: 1652 assert(false,"Should not reach here."); 1653 break; 1654 } 1655 } 1656 1657 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) { 1658 if (vlen_in_bytes <= 16) { 1659 pxor (dst, dst); 1660 psubb(dst, src); 1661 switch (elem_bt) { 1662 case T_BYTE: /* nothing to do */ break; 1663 case T_SHORT: pmovsxbw(dst, dst); break; 1664 case T_INT: pmovsxbd(dst, dst); break; 1665 case T_FLOAT: pmovsxbd(dst, dst); break; 1666 case T_LONG: pmovsxbq(dst, dst); break; 1667 case T_DOUBLE: pmovsxbq(dst, dst); break; 1668 1669 default: assert(false, "%s", type2name(elem_bt)); 1670 } 1671 } else { 1672 assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, ""); 1673 int vlen_enc = vector_length_encoding(vlen_in_bytes); 1674 1675 vpxor (dst, dst, dst, vlen_enc); 1676 vpsubb(dst, dst, src, is_legacy ? 
AVX_256bit : vlen_enc); 1677 1678 switch (elem_bt) { 1679 case T_BYTE: /* nothing to do */ break; 1680 case T_SHORT: vpmovsxbw(dst, dst, vlen_enc); break; 1681 case T_INT: vpmovsxbd(dst, dst, vlen_enc); break; 1682 case T_FLOAT: vpmovsxbd(dst, dst, vlen_enc); break; 1683 case T_LONG: vpmovsxbq(dst, dst, vlen_enc); break; 1684 case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break; 1685 1686 default: assert(false, "%s", type2name(elem_bt)); 1687 } 1688 } 1689 } 1690 1691 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) { 1692 if (novlbwdq) { 1693 vpmovsxbd(xtmp, src, vlen_enc); 1694 evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()), 1695 Assembler::eq, true, vlen_enc, noreg); 1696 } else { 1697 vpxor(xtmp, xtmp, xtmp, vlen_enc); 1698 vpsubb(xtmp, xtmp, src, vlen_enc); 1699 evpmovb2m(dst, xtmp, vlen_enc); 1700 } 1701 } 1702 1703 void C2_MacroAssembler::load_vector(XMMRegister dst, Address src, int vlen_in_bytes) { 1704 switch (vlen_in_bytes) { 1705 case 4: movdl(dst, src); break; 1706 case 8: movq(dst, src); break; 1707 case 16: movdqu(dst, src); break; 1708 case 32: vmovdqu(dst, src); break; 1709 case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break; 1710 default: ShouldNotReachHere(); 1711 } 1712 } 1713 1714 void C2_MacroAssembler::load_vector(XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) { 1715 assert(rscratch != noreg || always_reachable(src), "missing"); 1716 1717 if (reachable(src)) { 1718 load_vector(dst, as_Address(src), vlen_in_bytes); 1719 } else { 1720 lea(rscratch, src); 1721 load_vector(dst, Address(rscratch, 0), vlen_in_bytes); 1722 } 1723 } 1724 1725 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) { 1726 int vlen_enc = vector_length_encoding(vlen); 1727 if (VM_Version::supports_avx()) { 1728 if (bt == T_LONG) { 1729 if (VM_Version::supports_avx2()) { 1730 vpbroadcastq(dst, src, vlen_enc); 1731 } else { 1732 vmovddup(dst, src, vlen_enc); 1733 } 1734 } else if (bt == T_DOUBLE) { 1735 if (vlen_enc != Assembler::AVX_128bit) { 1736 vbroadcastsd(dst, src, vlen_enc, noreg); 1737 } else { 1738 vmovddup(dst, src, vlen_enc); 1739 } 1740 } else { 1741 if (VM_Version::supports_avx2() && is_integral_type(bt)) { 1742 vpbroadcastd(dst, src, vlen_enc); 1743 } else { 1744 vbroadcastss(dst, src, vlen_enc); 1745 } 1746 } 1747 } else if (VM_Version::supports_sse3()) { 1748 movddup(dst, src); 1749 } else { 1750 movq(dst, src); 1751 if (vlen == 16) { 1752 punpcklqdq(dst, dst); 1753 } 1754 } 1755 } 1756 1757 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) { 1758 // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64. 1759 int offset = exact_log2(type2aelembytes(bt)) << 6; 1760 if (is_floating_point_type(bt)) { 1761 offset += 128; 1762 } 1763 ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset); 1764 load_vector(dst, addr, vlen_in_bytes); 1765 } 1766 1767 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles. 
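// Note on the shape of the reducers below (an illustrative sketch only, not
// emitted code): the integral reducers repeatedly fold the upper half of the
// vector onto the lower half until a single lane remains, then combine that
// lane with the incoming scalar src1. Roughly:
//
//   acc = src2
//   while (lanes(acc) > 1) {
//     high = extract_upper_half(acc)
//     acc  = OP(lower_half(acc), high)   // reduce_operation_128/256
//   }
//   dst = OP(acc[0], src1)
//
// (The AddReductionVI paths use phaddd/phaddw horizontal adds instead of the
// extract-and-fold step.) The ordered floating-point reducers
// (reduceF/reduceD) combine the lanes strictly in index order, since FP
// add/mul are not associative; only the unordered* variants use the halving
// scheme above.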
1768 1769 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 1770 int vector_len = Assembler::AVX_128bit; 1771 1772 switch (opcode) { 1773 case Op_AndReductionV: pand(dst, src); break; 1774 case Op_OrReductionV: por (dst, src); break; 1775 case Op_XorReductionV: pxor(dst, src); break; 1776 case Op_MinReductionV: 1777 switch (typ) { 1778 case T_BYTE: pminsb(dst, src); break; 1779 case T_SHORT: pminsw(dst, src); break; 1780 case T_INT: pminsd(dst, src); break; 1781 case T_LONG: assert(UseAVX > 2, "required"); 1782 vpminsq(dst, dst, src, Assembler::AVX_128bit); break; 1783 default: assert(false, "wrong type"); 1784 } 1785 break; 1786 case Op_MaxReductionV: 1787 switch (typ) { 1788 case T_BYTE: pmaxsb(dst, src); break; 1789 case T_SHORT: pmaxsw(dst, src); break; 1790 case T_INT: pmaxsd(dst, src); break; 1791 case T_LONG: assert(UseAVX > 2, "required"); 1792 vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break; 1793 default: assert(false, "wrong type"); 1794 } 1795 break; 1796 case Op_AddReductionVF: addss(dst, src); break; 1797 case Op_AddReductionVD: addsd(dst, src); break; 1798 case Op_AddReductionVI: 1799 switch (typ) { 1800 case T_BYTE: paddb(dst, src); break; 1801 case T_SHORT: paddw(dst, src); break; 1802 case T_INT: paddd(dst, src); break; 1803 default: assert(false, "wrong type"); 1804 } 1805 break; 1806 case Op_AddReductionVL: paddq(dst, src); break; 1807 case Op_MulReductionVF: mulss(dst, src); break; 1808 case Op_MulReductionVD: mulsd(dst, src); break; 1809 case Op_MulReductionVI: 1810 switch (typ) { 1811 case T_SHORT: pmullw(dst, src); break; 1812 case T_INT: pmulld(dst, src); break; 1813 default: assert(false, "wrong type"); 1814 } 1815 break; 1816 case Op_MulReductionVL: assert(UseAVX > 2, "required"); 1817 evpmullq(dst, dst, src, vector_len); break; 1818 default: assert(false, "wrong opcode"); 1819 } 1820 } 1821 1822 void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 1823 switch (opcode) { 1824 case Op_AddReductionVF: addps(dst, src); break; 1825 case Op_AddReductionVD: addpd(dst, src); break; 1826 case Op_MulReductionVF: mulps(dst, src); break; 1827 case Op_MulReductionVD: mulpd(dst, src); break; 1828 default: assert(false, "%s", NodeClassNames[opcode]); 1829 } 1830 } 1831 1832 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 1833 int vector_len = Assembler::AVX_256bit; 1834 1835 switch (opcode) { 1836 case Op_AndReductionV: vpand(dst, src1, src2, vector_len); break; 1837 case Op_OrReductionV: vpor (dst, src1, src2, vector_len); break; 1838 case Op_XorReductionV: vpxor(dst, src1, src2, vector_len); break; 1839 case Op_MinReductionV: 1840 switch (typ) { 1841 case T_BYTE: vpminsb(dst, src1, src2, vector_len); break; 1842 case T_SHORT: vpminsw(dst, src1, src2, vector_len); break; 1843 case T_INT: vpminsd(dst, src1, src2, vector_len); break; 1844 case T_LONG: assert(UseAVX > 2, "required"); 1845 vpminsq(dst, src1, src2, vector_len); break; 1846 default: assert(false, "wrong type"); 1847 } 1848 break; 1849 case Op_MaxReductionV: 1850 switch (typ) { 1851 case T_BYTE: vpmaxsb(dst, src1, src2, vector_len); break; 1852 case T_SHORT: vpmaxsw(dst, src1, src2, vector_len); break; 1853 case T_INT: vpmaxsd(dst, src1, src2, vector_len); break; 1854 case T_LONG: assert(UseAVX > 2, "required"); 1855 vpmaxsq(dst, src1, src2, vector_len); break; 1856 default: assert(false, "wrong type"); 1857 } 
1858 break; 1859 case Op_AddReductionVI: 1860 switch (typ) { 1861 case T_BYTE: vpaddb(dst, src1, src2, vector_len); break; 1862 case T_SHORT: vpaddw(dst, src1, src2, vector_len); break; 1863 case T_INT: vpaddd(dst, src1, src2, vector_len); break; 1864 default: assert(false, "wrong type"); 1865 } 1866 break; 1867 case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break; 1868 case Op_MulReductionVI: 1869 switch (typ) { 1870 case T_SHORT: vpmullw(dst, src1, src2, vector_len); break; 1871 case T_INT: vpmulld(dst, src1, src2, vector_len); break; 1872 default: assert(false, "wrong type"); 1873 } 1874 break; 1875 case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break; 1876 default: assert(false, "wrong opcode"); 1877 } 1878 } 1879 1880 void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 1881 int vector_len = Assembler::AVX_256bit; 1882 1883 switch (opcode) { 1884 case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break; 1885 case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break; 1886 case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break; 1887 case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break; 1888 default: assert(false, "%s", NodeClassNames[opcode]); 1889 } 1890 } 1891 1892 void C2_MacroAssembler::reduce_fp(int opcode, int vlen, 1893 XMMRegister dst, XMMRegister src, 1894 XMMRegister vtmp1, XMMRegister vtmp2) { 1895 switch (opcode) { 1896 case Op_AddReductionVF: 1897 case Op_MulReductionVF: 1898 reduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 1899 break; 1900 1901 case Op_AddReductionVD: 1902 case Op_MulReductionVD: 1903 reduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 1904 break; 1905 1906 default: assert(false, "wrong opcode"); 1907 } 1908 } 1909 1910 void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen, 1911 XMMRegister dst, XMMRegister src, 1912 XMMRegister vtmp1, XMMRegister vtmp2) { 1913 switch (opcode) { 1914 case Op_AddReductionVF: 1915 case Op_MulReductionVF: 1916 unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 1917 break; 1918 1919 case Op_AddReductionVD: 1920 case Op_MulReductionVD: 1921 unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 1922 break; 1923 1924 default: assert(false, "%s", NodeClassNames[opcode]); 1925 } 1926 } 1927 1928 void C2_MacroAssembler::reduceB(int opcode, int vlen, 1929 Register dst, Register src1, XMMRegister src2, 1930 XMMRegister vtmp1, XMMRegister vtmp2) { 1931 switch (vlen) { 1932 case 8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1933 case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1934 case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1935 case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1936 1937 default: assert(false, "wrong vector length"); 1938 } 1939 } 1940 1941 void C2_MacroAssembler::mulreduceB(int opcode, int vlen, 1942 Register dst, Register src1, XMMRegister src2, 1943 XMMRegister vtmp1, XMMRegister vtmp2) { 1944 switch (vlen) { 1945 case 8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1946 case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1947 case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1948 case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1949 1950 default: assert(false, "wrong vector length"); 1951 } 1952 } 1953 1954 void C2_MacroAssembler::reduceS(int opcode, int vlen, 1955 Register dst, Register src1, XMMRegister src2, 
1956 XMMRegister vtmp1, XMMRegister vtmp2) { 1957 switch (vlen) { 1958 case 4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1959 case 8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1960 case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1961 case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1962 1963 default: assert(false, "wrong vector length"); 1964 } 1965 } 1966 1967 void C2_MacroAssembler::reduceI(int opcode, int vlen, 1968 Register dst, Register src1, XMMRegister src2, 1969 XMMRegister vtmp1, XMMRegister vtmp2) { 1970 switch (vlen) { 1971 case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1972 case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1973 case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1974 case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1975 1976 default: assert(false, "wrong vector length"); 1977 } 1978 } 1979 1980 #ifdef _LP64 1981 void C2_MacroAssembler::reduceL(int opcode, int vlen, 1982 Register dst, Register src1, XMMRegister src2, 1983 XMMRegister vtmp1, XMMRegister vtmp2) { 1984 switch (vlen) { 1985 case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1986 case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1987 case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1988 1989 default: assert(false, "wrong vector length"); 1990 } 1991 } 1992 #endif // _LP64 1993 1994 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 1995 switch (vlen) { 1996 case 2: 1997 assert(vtmp2 == xnoreg, ""); 1998 reduce2F(opcode, dst, src, vtmp1); 1999 break; 2000 case 4: 2001 assert(vtmp2 == xnoreg, ""); 2002 reduce4F(opcode, dst, src, vtmp1); 2003 break; 2004 case 8: 2005 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2006 break; 2007 case 16: 2008 reduce16F(opcode, dst, src, vtmp1, vtmp2); 2009 break; 2010 default: assert(false, "wrong vector length"); 2011 } 2012 } 2013 2014 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2015 switch (vlen) { 2016 case 2: 2017 assert(vtmp2 == xnoreg, ""); 2018 reduce2D(opcode, dst, src, vtmp1); 2019 break; 2020 case 4: 2021 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2022 break; 2023 case 8: 2024 reduce8D(opcode, dst, src, vtmp1, vtmp2); 2025 break; 2026 default: assert(false, "wrong vector length"); 2027 } 2028 } 2029 2030 void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2031 switch (vlen) { 2032 case 2: 2033 assert(vtmp1 == xnoreg, ""); 2034 assert(vtmp2 == xnoreg, ""); 2035 unorderedReduce2F(opcode, dst, src); 2036 break; 2037 case 4: 2038 assert(vtmp2 == xnoreg, ""); 2039 unorderedReduce4F(opcode, dst, src, vtmp1); 2040 break; 2041 case 8: 2042 unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2); 2043 break; 2044 case 16: 2045 unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2); 2046 break; 2047 default: assert(false, "wrong vector length"); 2048 } 2049 } 2050 2051 void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2052 switch (vlen) { 2053 case 2: 2054 assert(vtmp1 == xnoreg, ""); 2055 assert(vtmp2 == xnoreg, ""); 2056 unorderedReduce2D(opcode, dst, src); 2057 break; 2058 case 4: 2059 assert(vtmp2 == xnoreg, ""); 2060 unorderedReduce4D(opcode, dst, src, vtmp1); 2061 break; 2062 case 8: 
2063 unorderedReduce8D(opcode, dst, src, vtmp1, vtmp2); 2064 break; 2065 default: assert(false, "wrong vector length"); 2066 } 2067 } 2068 2069 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2070 if (opcode == Op_AddReductionVI) { 2071 if (vtmp1 != src2) { 2072 movdqu(vtmp1, src2); 2073 } 2074 phaddd(vtmp1, vtmp1); 2075 } else { 2076 pshufd(vtmp1, src2, 0x1); 2077 reduce_operation_128(T_INT, opcode, vtmp1, src2); 2078 } 2079 movdl(vtmp2, src1); 2080 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2081 movdl(dst, vtmp1); 2082 } 2083 2084 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2085 if (opcode == Op_AddReductionVI) { 2086 if (vtmp1 != src2) { 2087 movdqu(vtmp1, src2); 2088 } 2089 phaddd(vtmp1, src2); 2090 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2091 } else { 2092 pshufd(vtmp2, src2, 0xE); 2093 reduce_operation_128(T_INT, opcode, vtmp2, src2); 2094 reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2095 } 2096 } 2097 2098 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2099 if (opcode == Op_AddReductionVI) { 2100 vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit); 2101 vextracti128_high(vtmp2, vtmp1); 2102 vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit); 2103 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2104 } else { 2105 vextracti128_high(vtmp1, src2); 2106 reduce_operation_128(T_INT, opcode, vtmp1, src2); 2107 reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2108 } 2109 } 2110 2111 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2112 vextracti64x4_high(vtmp2, src2); 2113 reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2); 2114 reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2115 } 2116 2117 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2118 pshufd(vtmp2, src2, 0x1); 2119 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2120 movdqu(vtmp1, vtmp2); 2121 psrldq(vtmp1, 2); 2122 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2123 movdqu(vtmp2, vtmp1); 2124 psrldq(vtmp2, 1); 2125 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2126 movdl(vtmp2, src1); 2127 pmovsxbd(vtmp1, vtmp1); 2128 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2129 pextrb(dst, vtmp1, 0x0); 2130 movsbl(dst, dst); 2131 } 2132 2133 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2134 pshufd(vtmp1, src2, 0xE); 2135 reduce_operation_128(T_BYTE, opcode, vtmp1, src2); 2136 reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2137 } 2138 2139 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2140 vextracti128_high(vtmp2, src2); 2141 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2142 reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2143 } 2144 2145 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2146 vextracti64x4_high(vtmp1, src2); 2147 reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2); 2148 reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2149 } 2150 2151 void C2_MacroAssembler::mulreduce8B(int opcode, 
Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2152 pmovsxbw(vtmp2, src2); 2153 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2154 } 2155 2156 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2157 if (UseAVX > 1) { 2158 int vector_len = Assembler::AVX_256bit; 2159 vpmovsxbw(vtmp1, src2, vector_len); 2160 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2161 } else { 2162 pmovsxbw(vtmp2, src2); 2163 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2164 pshufd(vtmp2, src2, 0x1); 2165 pmovsxbw(vtmp2, src2); 2166 reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2167 } 2168 } 2169 2170 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2171 if (UseAVX > 2 && VM_Version::supports_avx512bw()) { 2172 int vector_len = Assembler::AVX_512bit; 2173 vpmovsxbw(vtmp1, src2, vector_len); 2174 reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2175 } else { 2176 assert(UseAVX >= 2,"Should not reach here."); 2177 mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); 2178 vextracti128_high(vtmp2, src2); 2179 mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2180 } 2181 } 2182 2183 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2184 mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); 2185 vextracti64x4_high(vtmp2, src2); 2186 mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2187 } 2188 2189 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2190 if (opcode == Op_AddReductionVI) { 2191 if (vtmp1 != src2) { 2192 movdqu(vtmp1, src2); 2193 } 2194 phaddw(vtmp1, vtmp1); 2195 phaddw(vtmp1, vtmp1); 2196 } else { 2197 pshufd(vtmp2, src2, 0x1); 2198 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2199 movdqu(vtmp1, vtmp2); 2200 psrldq(vtmp1, 2); 2201 reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2); 2202 } 2203 movdl(vtmp2, src1); 2204 pmovsxwd(vtmp1, vtmp1); 2205 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2206 pextrw(dst, vtmp1, 0x0); 2207 movswl(dst, dst); 2208 } 2209 2210 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2211 if (opcode == Op_AddReductionVI) { 2212 if (vtmp1 != src2) { 2213 movdqu(vtmp1, src2); 2214 } 2215 phaddw(vtmp1, src2); 2216 } else { 2217 pshufd(vtmp1, src2, 0xE); 2218 reduce_operation_128(T_SHORT, opcode, vtmp1, src2); 2219 } 2220 reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2221 } 2222 2223 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2224 if (opcode == Op_AddReductionVI) { 2225 int vector_len = Assembler::AVX_256bit; 2226 vphaddw(vtmp2, src2, src2, vector_len); 2227 vpermq(vtmp2, vtmp2, 0xD8, vector_len); 2228 } else { 2229 vextracti128_high(vtmp2, src2); 2230 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2231 } 2232 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2233 } 2234 2235 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2236 int vector_len = Assembler::AVX_256bit; 2237 vextracti64x4_high(vtmp1, src2); 2238 reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2); 2239 reduce16S(opcode, dst, src1, vtmp1, vtmp1, 
vtmp2); 2240 } 2241 2242 #ifdef _LP64 2243 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2244 pshufd(vtmp2, src2, 0xE); 2245 reduce_operation_128(T_LONG, opcode, vtmp2, src2); 2246 movdq(vtmp1, src1); 2247 reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2); 2248 movdq(dst, vtmp1); 2249 } 2250 2251 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2252 vextracti128_high(vtmp1, src2); 2253 reduce_operation_128(T_LONG, opcode, vtmp1, src2); 2254 reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2255 } 2256 2257 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2258 vextracti64x4_high(vtmp2, src2); 2259 reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2); 2260 reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2261 } 2262 2263 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) { 2264 mov64(temp, -1L); 2265 bzhiq(temp, temp, len); 2266 kmovql(dst, temp); 2267 } 2268 #endif // _LP64 2269 2270 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2271 reduce_operation_128(T_FLOAT, opcode, dst, src); 2272 pshufd(vtmp, src, 0x1); 2273 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2274 } 2275 2276 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2277 reduce2F(opcode, dst, src, vtmp); 2278 pshufd(vtmp, src, 0x2); 2279 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2280 pshufd(vtmp, src, 0x3); 2281 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2282 } 2283 2284 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2285 reduce4F(opcode, dst, src, vtmp2); 2286 vextractf128_high(vtmp2, src); 2287 reduce4F(opcode, dst, vtmp2, vtmp1); 2288 } 2289 2290 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2291 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2292 vextracti64x4_high(vtmp1, src); 2293 reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2); 2294 } 2295 2296 void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) { 2297 pshufd(dst, src, 0x1); 2298 reduce_operation_128(T_FLOAT, opcode, dst, src); 2299 } 2300 2301 void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2302 pshufd(vtmp, src, 0xE); 2303 unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src); 2304 unorderedReduce2F(opcode, dst, vtmp); 2305 } 2306 2307 void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2308 vextractf128_high(vtmp1, src); 2309 unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src); 2310 unorderedReduce4F(opcode, dst, vtmp1, vtmp2); 2311 } 2312 2313 void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2314 vextractf64x4_high(vtmp2, src); 2315 unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src); 2316 unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2); 2317 } 2318 2319 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2320 reduce_operation_128(T_DOUBLE, opcode, dst, src); 2321 pshufd(vtmp, src, 0xE); 2322 
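// Descriptive note: 0xE is 0b00'00'11'10, so the pshufd above copies dwords
// 2..3 of src (its upper double) into the low 64 bits of vtmp; the following
// reduce_operation_128 then folds that upper lane into dst.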
reduce_operation_128(T_DOUBLE, opcode, dst, vtmp); 2323 } 2324 2325 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2326 reduce2D(opcode, dst, src, vtmp2); 2327 vextractf128_high(vtmp2, src); 2328 reduce2D(opcode, dst, vtmp2, vtmp1); 2329 } 2330 2331 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2332 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2333 vextracti64x4_high(vtmp1, src); 2334 reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2); 2335 } 2336 2337 void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) { 2338 pshufd(dst, src, 0xE); 2339 reduce_operation_128(T_DOUBLE, opcode, dst, src); 2340 } 2341 2342 void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2343 vextractf128_high(vtmp, src); 2344 unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src); 2345 unorderedReduce2D(opcode, dst, vtmp); 2346 } 2347 2348 void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2349 vextractf64x4_high(vtmp2, src); 2350 unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src); 2351 unorderedReduce4D(opcode, dst, vtmp2, vtmp1); 2352 } 2353 2354 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) { 2355 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2356 } 2357 2358 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) { 2359 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2360 } 2361 2362 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask, 2363 int vec_enc) { 2364 switch(elem_bt) { 2365 case T_INT: 2366 case T_FLOAT: 2367 vmaskmovps(dst, src, mask, vec_enc); 2368 break; 2369 case T_LONG: 2370 case T_DOUBLE: 2371 vmaskmovpd(dst, src, mask, vec_enc); 2372 break; 2373 default: 2374 fatal("Unsupported type %s", type2name(elem_bt)); 2375 break; 2376 } 2377 } 2378 2379 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask, 2380 int vec_enc) { 2381 switch(elem_bt) { 2382 case T_INT: 2383 case T_FLOAT: 2384 vmaskmovps(dst, src, mask, vec_enc); 2385 break; 2386 case T_LONG: 2387 case T_DOUBLE: 2388 vmaskmovpd(dst, src, mask, vec_enc); 2389 break; 2390 default: 2391 fatal("Unsupported type %s", type2name(elem_bt)); 2392 break; 2393 } 2394 } 2395 2396 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid, 2397 XMMRegister dst, XMMRegister src, 2398 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2399 XMMRegister xmm_0, XMMRegister xmm_1) { 2400 const int permconst[] = {1, 14}; 2401 XMMRegister wsrc = src; 2402 XMMRegister wdst = xmm_0; 2403 XMMRegister wtmp = (xmm_1 == xnoreg) ? 
xmm_0: xmm_1; 2404 2405 int vlen_enc = Assembler::AVX_128bit; 2406 if (vlen == 16) { 2407 vlen_enc = Assembler::AVX_256bit; 2408 } 2409 2410 for (int i = log2(vlen) - 1; i >=0; i--) { 2411 if (i == 0 && !is_dst_valid) { 2412 wdst = dst; 2413 } 2414 if (i == 3) { 2415 vextracti64x4_high(wtmp, wsrc); 2416 } else if (i == 2) { 2417 vextracti128_high(wtmp, wsrc); 2418 } else { // i = [0,1] 2419 vpermilps(wtmp, wsrc, permconst[i], vlen_enc); 2420 } 2421 vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2422 wsrc = wdst; 2423 vlen_enc = Assembler::AVX_128bit; 2424 } 2425 if (is_dst_valid) { 2426 vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2427 } 2428 } 2429 2430 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src, 2431 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2432 XMMRegister xmm_0, XMMRegister xmm_1) { 2433 XMMRegister wsrc = src; 2434 XMMRegister wdst = xmm_0; 2435 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1; 2436 int vlen_enc = Assembler::AVX_128bit; 2437 if (vlen == 8) { 2438 vlen_enc = Assembler::AVX_256bit; 2439 } 2440 for (int i = log2(vlen) - 1; i >=0; i--) { 2441 if (i == 0 && !is_dst_valid) { 2442 wdst = dst; 2443 } 2444 if (i == 1) { 2445 vextracti128_high(wtmp, wsrc); 2446 } else if (i == 2) { 2447 vextracti64x4_high(wtmp, wsrc); 2448 } else { 2449 assert(i == 0, "%d", i); 2450 vpermilpd(wtmp, wsrc, 1, vlen_enc); 2451 } 2452 vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2453 wsrc = wdst; 2454 vlen_enc = Assembler::AVX_128bit; 2455 } 2456 if (is_dst_valid) { 2457 vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2458 } 2459 } 2460 2461 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) { 2462 switch (bt) { 2463 case T_BYTE: pextrb(dst, src, idx); break; 2464 case T_SHORT: pextrw(dst, src, idx); break; 2465 case T_INT: pextrd(dst, src, idx); break; 2466 case T_LONG: pextrq(dst, src, idx); break; 2467 2468 default: 2469 assert(false,"Should not reach here."); 2470 break; 2471 } 2472 } 2473 2474 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) { 2475 int esize = type2aelembytes(typ); 2476 int elem_per_lane = 16/esize; 2477 int lane = elemindex / elem_per_lane; 2478 int eindex = elemindex % elem_per_lane; 2479 2480 if (lane >= 2) { 2481 assert(UseAVX > 2, "required"); 2482 vextractf32x4(dst, src, lane & 3); 2483 return dst; 2484 } else if (lane > 0) { 2485 assert(UseAVX > 0, "required"); 2486 vextractf128(dst, src, lane); 2487 return dst; 2488 } else { 2489 return src; 2490 } 2491 } 2492 2493 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) { 2494 if (typ == T_BYTE) { 2495 movsbl(dst, dst); 2496 } else if (typ == T_SHORT) { 2497 movswl(dst, dst); 2498 } 2499 } 2500 2501 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) { 2502 int esize = type2aelembytes(typ); 2503 int elem_per_lane = 16/esize; 2504 int eindex = elemindex % elem_per_lane; 2505 assert(is_integral_type(typ),"required"); 2506 2507 if (eindex == 0) { 2508 if (typ == T_LONG) { 2509 movq(dst, src); 2510 } else { 2511 movdl(dst, src); 2512 movsxl(typ, dst); 2513 } 2514 } else { 2515 extract(typ, dst, src, eindex); 2516 movsxl(typ, dst); 2517 } 2518 } 2519 2520 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) { 
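// Worked example of the index math below (descriptive comment only): a
// 128-bit lane holds 16 / esize elements, so for T_FLOAT elem_per_lane == 4
// and eindex == elemindex % 4, while for T_DOUBLE elem_per_lane == 2 and
// eindex == elemindex % 2. Only the in-lane index is used here, which
// suggests the caller has already narrowed src to the correct 128-bit lane
// (e.g. with get_lane() above).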
2521 int esize = type2aelembytes(typ); 2522 int elem_per_lane = 16/esize; 2523 int eindex = elemindex % elem_per_lane; 2524 assert((typ == T_FLOAT || typ == T_DOUBLE),"required"); 2525 2526 if (eindex == 0) { 2527 movq(dst, src); 2528 } else { 2529 if (typ == T_FLOAT) { 2530 if (UseAVX == 0) { 2531 movdqu(dst, src); 2532 shufps(dst, dst, eindex); 2533 } else { 2534 vshufps(dst, src, src, eindex, Assembler::AVX_128bit); 2535 } 2536 } else { 2537 if (UseAVX == 0) { 2538 movdqu(dst, src); 2539 psrldq(dst, eindex*esize); 2540 } else { 2541 vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit); 2542 } 2543 movq(dst, dst); 2544 } 2545 } 2546 // Zero upper bits 2547 if (typ == T_FLOAT) { 2548 if (UseAVX == 0) { 2549 assert(vtmp != xnoreg, "required."); 2550 movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg); 2551 pand(dst, vtmp); 2552 } else { 2553 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg); 2554 } 2555 } 2556 } 2557 2558 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) { 2559 switch(typ) { 2560 case T_BYTE: 2561 case T_BOOLEAN: 2562 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2563 break; 2564 case T_SHORT: 2565 case T_CHAR: 2566 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2567 break; 2568 case T_INT: 2569 case T_FLOAT: 2570 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2571 break; 2572 case T_LONG: 2573 case T_DOUBLE: 2574 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2575 break; 2576 default: 2577 assert(false,"Should not reach here."); 2578 break; 2579 } 2580 } 2581 2582 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) { 2583 assert(rscratch != noreg || always_reachable(src2), "missing"); 2584 2585 switch(typ) { 2586 case T_BOOLEAN: 2587 case T_BYTE: 2588 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2589 break; 2590 case T_CHAR: 2591 case T_SHORT: 2592 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2593 break; 2594 case T_INT: 2595 case T_FLOAT: 2596 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2597 break; 2598 case T_LONG: 2599 case T_DOUBLE: 2600 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2601 break; 2602 default: 2603 assert(false,"Should not reach here."); 2604 break; 2605 } 2606 } 2607 2608 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) { 2609 switch(typ) { 2610 case T_BYTE: 2611 evpblendmb(dst, kmask, src1, src2, merge, vector_len); 2612 break; 2613 case T_SHORT: 2614 evpblendmw(dst, kmask, src1, src2, merge, vector_len); 2615 break; 2616 case T_INT: 2617 case T_FLOAT: 2618 evpblendmd(dst, kmask, src1, src2, merge, vector_len); 2619 break; 2620 case T_LONG: 2621 case T_DOUBLE: 2622 evpblendmq(dst, kmask, src1, src2, merge, vector_len); 2623 break; 2624 default: 2625 assert(false,"Should not reach here."); 2626 break; 2627 } 2628 } 2629 2630 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) { 2631 
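// Descriptive sketch of the flag protocol below (based only on the
// instructions used): with AVX and element size >= 4, vtestps derives ZF/CF
// from the lane sign bits only, which is sufficient when the inputs are
// lane-wise 0 / -1 masks; otherwise ptest/vptest tests every bit. For vectors
// shorter than 16 bytes, the valid low part of src1 is first replicated
// across the register so that whatever happens to sit in its upper bytes
// cannot affect the flags.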
assert(vlen_in_bytes <= 32, ""); 2632 int esize = type2aelembytes(bt); 2633 if (vlen_in_bytes == 32) { 2634 assert(vtmp == xnoreg, "required."); 2635 if (esize >= 4) { 2636 vtestps(src1, src2, AVX_256bit); 2637 } else { 2638 vptest(src1, src2, AVX_256bit); 2639 } 2640 return; 2641 } 2642 if (vlen_in_bytes < 16) { 2643 // Duplicate the lower part to fill the whole register, 2644 // Don't need to do so for src2 2645 assert(vtmp != xnoreg, "required"); 2646 int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04; 2647 pshufd(vtmp, src1, shuffle_imm); 2648 } else { 2649 assert(vtmp == xnoreg, "required"); 2650 vtmp = src1; 2651 } 2652 if (esize >= 4 && VM_Version::supports_avx()) { 2653 vtestps(vtmp, src2, AVX_128bit); 2654 } else { 2655 ptest(vtmp, src2); 2656 } 2657 } 2658 2659 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { 2660 assert(UseAVX >= 2, "required"); 2661 #ifdef ASSERT 2662 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2663 bool is_bw_supported = VM_Version::supports_avx512bw(); 2664 if (is_bw && !is_bw_supported) { 2665 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2666 assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16), 2667 "XMM register should be 0-15"); 2668 } 2669 #endif // ASSERT 2670 switch (elem_bt) { 2671 case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return; 2672 case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return; 2673 case T_INT: vpaddd(dst, src1, src2, vlen_enc); return; 2674 case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return; 2675 case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return; 2676 case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return; 2677 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2678 } 2679 } 2680 2681 #ifdef _LP64 2682 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) { 2683 assert(UseAVX >= 2, "required"); 2684 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2685 bool is_vl = vlen_enc != Assembler::AVX_512bit; 2686 if ((UseAVX > 2) && 2687 (!is_bw || VM_Version::supports_avx512bw()) && 2688 (!is_vl || VM_Version::supports_avx512vl())) { 2689 switch (elem_bt) { 2690 case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return; 2691 case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return; 2692 case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return; 2693 case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return; 2694 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2695 } 2696 } else { 2697 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2698 assert((dst->encoding() < 16),"XMM register should be 0-15"); 2699 switch (elem_bt) { 2700 case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return; 2701 case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return; 2702 case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return; 2703 case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return; 2704 case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return; 2705 case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return; 2706 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2707 } 2708 } 2709 } 2710 #endif 2711 2712 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) { 2713 switch (to_elem_bt) { 2714 case T_SHORT: 2715 vpmovsxbw(dst, src, vlen_enc); 2716 
break; 2717 case T_INT: 2718 vpmovsxbd(dst, src, vlen_enc); 2719 break; 2720 case T_FLOAT: 2721 vpmovsxbd(dst, src, vlen_enc); 2722 vcvtdq2ps(dst, dst, vlen_enc); 2723 break; 2724 case T_LONG: 2725 vpmovsxbq(dst, src, vlen_enc); 2726 break; 2727 case T_DOUBLE: { 2728 int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit; 2729 vpmovsxbd(dst, src, mid_vlen_enc); 2730 vcvtdq2pd(dst, dst, vlen_enc); 2731 break; 2732 } 2733 default: 2734 fatal("Unsupported type %s", type2name(to_elem_bt)); 2735 break; 2736 } 2737 } 2738 2739 //------------------------------------------------------------------------------------------- 2740 2741 // IndexOf for constant substrings with size >= 8 chars 2742 // which don't need to be loaded through stack. 2743 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2, 2744 Register cnt1, Register cnt2, 2745 int int_cnt2, Register result, 2746 XMMRegister vec, Register tmp, 2747 int ae) { 2748 ShortBranchVerifier sbv(this); 2749 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2750 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2751 2752 // This method uses the pcmpestri instruction with bound registers 2753 // inputs: 2754 // xmm - substring 2755 // rax - substring length (elements count) 2756 // mem - scanned string 2757 // rdx - string length (elements count) 2758 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2759 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2760 // outputs: 2761 // rcx - matched index in string 2762 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2763 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2764 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2765 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2766 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1; 2767 2768 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, 2769 RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR, 2770 MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE; 2771 2772 // Note, inline_string_indexOf() generates checks: 2773 // if (substr.count > string.count) return -1; 2774 // if (substr.count == 0) return 0; 2775 assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars"); 2776 2777 // Load substring. 2778 if (ae == StrIntrinsicNode::UL) { 2779 pmovzxbw(vec, Address(str2, 0)); 2780 } else { 2781 movdqu(vec, Address(str2, 0)); 2782 } 2783 movl(cnt2, int_cnt2); 2784 movptr(result, str1); // string addr 2785 2786 if (int_cnt2 > stride) { 2787 jmpb(SCAN_TO_SUBSTR); 2788 2789 // Reload substr for rescan, this code 2790 // is executed only for large substrings (> 8 chars) 2791 bind(RELOAD_SUBSTR); 2792 if (ae == StrIntrinsicNode::UL) { 2793 pmovzxbw(vec, Address(str2, 0)); 2794 } else { 2795 movdqu(vec, Address(str2, 0)); 2796 } 2797 negptr(cnt2); // Jumped here with negative cnt2, convert to positive 2798 2799 bind(RELOAD_STR); 2800 // We came here after the beginning of the substring was 2801 // matched but the rest of it was not so we need to search 2802 // again. Start from the next element after the previous match. 2803 2804 // cnt2 is number of substring reminding elements and 2805 // cnt1 is number of string reminding elements when cmp failed. 
2806 // Restored cnt1 = cnt1 - cnt2 + int_cnt2 2807 subl(cnt1, cnt2); 2808 addl(cnt1, int_cnt2); 2809 movl(cnt2, int_cnt2); // Now restore cnt2 2810 2811 decrementl(cnt1); // Shift to next element 2812 cmpl(cnt1, cnt2); 2813 jcc(Assembler::negative, RET_NOT_FOUND); // Left less than substring 2814 2815 addptr(result, (1<<scale1)); 2816 2817 } // (int_cnt2 > 8) 2818 2819 // Scan string for start of substr in 16-byte vectors 2820 bind(SCAN_TO_SUBSTR); 2821 pcmpestri(vec, Address(result, 0), mode); 2822 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 2823 subl(cnt1, stride); 2824 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 2825 cmpl(cnt1, cnt2); 2826 jccb(Assembler::negative, RET_NOT_FOUND); // Left less than substring 2827 addptr(result, 16); 2828 jmpb(SCAN_TO_SUBSTR); 2829 2830 // Found a potential substr 2831 bind(FOUND_CANDIDATE); 2832 // Matched whole vector if first element matched (tmp(rcx) == 0). 2833 if (int_cnt2 == stride) { 2834 jccb(Assembler::overflow, RET_FOUND); // OF == 1 2835 } else { // int_cnt2 > 8 2836 jccb(Assembler::overflow, FOUND_SUBSTR); 2837 } 2838 // After pcmpestri tmp(rcx) contains matched element index 2839 // Compute start addr of substr 2840 lea(result, Address(result, tmp, scale1)); 2841 2842 // Make sure string is still long enough 2843 subl(cnt1, tmp); 2844 cmpl(cnt1, cnt2); 2845 if (int_cnt2 == stride) { 2846 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 2847 } else { // int_cnt2 > 8 2848 jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD); 2849 } 2850 // Left less than substring. 2851 2852 bind(RET_NOT_FOUND); 2853 movl(result, -1); 2854 jmp(EXIT); 2855 2856 if (int_cnt2 > stride) { 2857 // This code is optimized for the case when whole substring 2858 // is matched if its head is matched. 2859 bind(MATCH_SUBSTR_HEAD); 2860 pcmpestri(vec, Address(result, 0), mode); 2861 // Reload only string if it does not match 2862 jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0 2863 2864 Label CONT_SCAN_SUBSTR; 2865 // Compare the rest of substring (> 8 chars). 2866 bind(FOUND_SUBSTR); 2867 // First 8 chars are already matched.
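// Explanatory note on the counter trick that follows (worked example,
// assuming stride == 8): cnt2 is turned into the negative offset
// stride - cnt2, and each SCAN_SUBSTR iteration compares the next 8 elements
// at substring position int_cnt2 + cnt2, stepping cnt2 toward zero. For a
// 20-char substring cnt2 starts at -12: the loop compares chars [8,16), then
// backs up to cnt2 = -8 so the final probe reads exactly the last 8 chars
// [12,20) without running past the end of the substring.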
2868 negptr(cnt2); 2869 addptr(cnt2, stride); 2870 2871 bind(SCAN_SUBSTR); 2872 subl(cnt1, stride); 2873 cmpl(cnt2, -stride); // Do not read beyond substring 2874 jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR); 2875 // Back-up strings to avoid reading beyond substring: 2876 // cnt1 = cnt1 - cnt2 + 8 2877 addl(cnt1, cnt2); // cnt2 is negative 2878 addl(cnt1, stride); 2879 movl(cnt2, stride); negptr(cnt2); 2880 bind(CONT_SCAN_SUBSTR); 2881 if (int_cnt2 < (int)G) { 2882 int tail_off1 = int_cnt2<<scale1; 2883 int tail_off2 = int_cnt2<<scale2; 2884 if (ae == StrIntrinsicNode::UL) { 2885 pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2)); 2886 } else { 2887 movdqu(vec, Address(str2, cnt2, scale2, tail_off2)); 2888 } 2889 pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode); 2890 } else { 2891 // calculate index in register to avoid integer overflow (int_cnt2*2) 2892 movl(tmp, int_cnt2); 2893 addptr(tmp, cnt2); 2894 if (ae == StrIntrinsicNode::UL) { 2895 pmovzxbw(vec, Address(str2, tmp, scale2, 0)); 2896 } else { 2897 movdqu(vec, Address(str2, tmp, scale2, 0)); 2898 } 2899 pcmpestri(vec, Address(result, tmp, scale1, 0), mode); 2900 } 2901 // Need to reload strings pointers if not matched whole vector 2902 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 2903 addptr(cnt2, stride); 2904 jcc(Assembler::negative, SCAN_SUBSTR); 2905 // Fall through if found full substring 2906 2907 } // (int_cnt2 > 8) 2908 2909 bind(RET_FOUND); 2910 // Found result if we matched full small substring. 2911 // Compute substr offset 2912 subptr(result, str1); 2913 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 2914 shrl(result, 1); // index 2915 } 2916 bind(EXIT); 2917 2918 } // string_indexofC8 2919 2920 // Small strings are loaded through stack if they cross page boundary. 2921 void C2_MacroAssembler::string_indexof(Register str1, Register str2, 2922 Register cnt1, Register cnt2, 2923 int int_cnt2, Register result, 2924 XMMRegister vec, Register tmp, 2925 int ae) { 2926 ShortBranchVerifier sbv(this); 2927 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2928 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2929 2930 // 2931 // int_cnt2 is length of small (< 8 chars) constant substring 2932 // or (-1) for non constant substring in which case its length 2933 // is in cnt2 register. 2934 // 2935 // Note, inline_string_indexOf() generates checks: 2936 // if (substr.count > string.count) return -1; 2937 // if (substr.count == 0) return 0; 2938 // 2939 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2940 assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0"); 2941 // This method uses the pcmpestri instruction with bound registers 2942 // inputs: 2943 // xmm - substring 2944 // rax - substring length (elements count) 2945 // mem - scanned string 2946 // rdx - string length (elements count) 2947 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2948 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2949 // outputs: 2950 // rcx - matched index in string 2951 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2952 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2953 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2954 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? 
Address::times_1 : scale1; 2955 2956 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR, 2957 RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR, 2958 FOUND_CANDIDATE; 2959 2960 { //======================================================== 2961 // We don't know where these strings are located 2962 // and we can't read beyond them. Load them through stack. 2963 Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR; 2964 2965 movptr(tmp, rsp); // save old SP 2966 2967 if (int_cnt2 > 0) { // small (< 8 chars) constant substring 2968 if (int_cnt2 == (1>>scale2)) { // One byte 2969 assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding"); 2970 load_unsigned_byte(result, Address(str2, 0)); 2971 movdl(vec, result); // move 32 bits 2972 } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes 2973 // Not enough header space in 32-bit VM: 12+3 = 15. 2974 movl(result, Address(str2, -1)); 2975 shrl(result, 8); 2976 movdl(vec, result); // move 32 bits 2977 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char 2978 load_unsigned_short(result, Address(str2, 0)); 2979 movdl(vec, result); // move 32 bits 2980 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars 2981 movdl(vec, Address(str2, 0)); // move 32 bits 2982 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars 2983 movq(vec, Address(str2, 0)); // move 64 bits 2984 } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7}) 2985 // Array header size is 12 bytes in 32-bit VM 2986 // + 6 bytes for 3 chars == 18 bytes, 2987 // enough space to load vec and shift. 2988 assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity"); 2989 if (ae == StrIntrinsicNode::UL) { 2990 int tail_off = int_cnt2-8; 2991 pmovzxbw(vec, Address(str2, tail_off)); 2992 psrldq(vec, -2*tail_off); 2993 } 2994 else { 2995 int tail_off = int_cnt2*(1<<scale2); 2996 movdqu(vec, Address(str2, tail_off-16)); 2997 psrldq(vec, 16-tail_off); 2998 } 2999 } 3000 } else { // not constant substring 3001 cmpl(cnt2, stride); 3002 jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough 3003 3004 // We can read beyond string if srt+16 does not cross page boundary 3005 // since heaps are aligned and mapped by pages. 3006 assert(os::vm_page_size() < (int)G, "default page should be small"); 3007 movl(result, str2); // We need only low 32 bits 3008 andl(result, ((int)os::vm_page_size()-1)); 3009 cmpl(result, ((int)os::vm_page_size()-16)); 3010 jccb(Assembler::belowEqual, CHECK_STR); 3011 3012 // Move small strings to stack to allow load 16 bytes into vec. 3013 subptr(rsp, 16); 3014 int stk_offset = wordSize-(1<<scale2); 3015 push(cnt2); 3016 3017 bind(COPY_SUBSTR); 3018 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) { 3019 load_unsigned_byte(result, Address(str2, cnt2, scale2, -1)); 3020 movb(Address(rsp, cnt2, scale2, stk_offset), result); 3021 } else if (ae == StrIntrinsicNode::UU) { 3022 load_unsigned_short(result, Address(str2, cnt2, scale2, -2)); 3023 movw(Address(rsp, cnt2, scale2, stk_offset), result); 3024 } 3025 decrement(cnt2); 3026 jccb(Assembler::notZero, COPY_SUBSTR); 3027 3028 pop(cnt2); 3029 movptr(str2, rsp); // New substring address 3030 } // non constant 3031 3032 bind(CHECK_STR); 3033 cmpl(cnt1, stride); 3034 jccb(Assembler::aboveEqual, BIG_STRINGS); 3035 3036 // Check cross page boundary. 
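// Arithmetic behind the page check below (using a 4K page as an example):
// loading 16 bytes starting at str1 stays inside one page exactly when
// (str1 & (page_size - 1)) <= page_size - 16, i.e. when the in-page offset is
// at most 0xFF0 for a 4096-byte page. Anything above that would spill the
// 16-byte load into the next, possibly unmapped, page, so the string is
// copied to the stack instead.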
3037 movl(result, str1); // We need only low 32 bits 3038 andl(result, ((int)os::vm_page_size()-1)); 3039 cmpl(result, ((int)os::vm_page_size()-16)); 3040 jccb(Assembler::belowEqual, BIG_STRINGS); 3041 3042 subptr(rsp, 16); 3043 int stk_offset = -(1<<scale1); 3044 if (int_cnt2 < 0) { // not constant 3045 push(cnt2); 3046 stk_offset += wordSize; 3047 } 3048 movl(cnt2, cnt1); 3049 3050 bind(COPY_STR); 3051 if (ae == StrIntrinsicNode::LL) { 3052 load_unsigned_byte(result, Address(str1, cnt2, scale1, -1)); 3053 movb(Address(rsp, cnt2, scale1, stk_offset), result); 3054 } else { 3055 load_unsigned_short(result, Address(str1, cnt2, scale1, -2)); 3056 movw(Address(rsp, cnt2, scale1, stk_offset), result); 3057 } 3058 decrement(cnt2); 3059 jccb(Assembler::notZero, COPY_STR); 3060 3061 if (int_cnt2 < 0) { // not constant 3062 pop(cnt2); 3063 } 3064 movptr(str1, rsp); // New string address 3065 3066 bind(BIG_STRINGS); 3067 // Load substring. 3068 if (int_cnt2 < 0) { // -1 3069 if (ae == StrIntrinsicNode::UL) { 3070 pmovzxbw(vec, Address(str2, 0)); 3071 } else { 3072 movdqu(vec, Address(str2, 0)); 3073 } 3074 push(cnt2); // substr count 3075 push(str2); // substr addr 3076 push(str1); // string addr 3077 } else { 3078 // Small (< 8 chars) constant substrings are loaded already. 3079 movl(cnt2, int_cnt2); 3080 } 3081 push(tmp); // original SP 3082 3083 } // Finished loading 3084 3085 //======================================================== 3086 // Start search 3087 // 3088 3089 movptr(result, str1); // string addr 3090 3091 if (int_cnt2 < 0) { // Only for non constant substring 3092 jmpb(SCAN_TO_SUBSTR); 3093 3094 // SP saved at sp+0 3095 // String saved at sp+1*wordSize 3096 // Substr saved at sp+2*wordSize 3097 // Substr count saved at sp+3*wordSize 3098 3099 // Reload substr for rescan, this code 3100 // is executed only for large substrings (> 8 chars) 3101 bind(RELOAD_SUBSTR); 3102 movptr(str2, Address(rsp, 2*wordSize)); 3103 movl(cnt2, Address(rsp, 3*wordSize)); 3104 if (ae == StrIntrinsicNode::UL) { 3105 pmovzxbw(vec, Address(str2, 0)); 3106 } else { 3107 movdqu(vec, Address(str2, 0)); 3108 } 3109 // We came here after the beginning of the substring was 3110 // matched but the rest of it was not so we need to search 3111 // again. Start from the next element after the previous match. 3112 subptr(str1, result); // Restore counter 3113 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3114 shrl(str1, 1); 3115 } 3116 addl(cnt1, str1); 3117 decrementl(cnt1); // Shift to next element 3118 cmpl(cnt1, cnt2); 3119 jcc(Assembler::negative, RET_NOT_FOUND); // Left less then substring 3120 3121 addptr(result, (1<<scale1)); 3122 } // non constant 3123 3124 // Scan string for start of substr in 16-byte vectors 3125 bind(SCAN_TO_SUBSTR); 3126 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 3127 pcmpestri(vec, Address(result, 0), mode); 3128 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 3129 subl(cnt1, stride); 3130 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 3131 cmpl(cnt1, cnt2); 3132 jccb(Assembler::negative, RET_NOT_FOUND); // Left less then substring 3133 addptr(result, 16); 3134 3135 bind(ADJUST_STR); 3136 cmpl(cnt1, stride); // Do not read beyond string 3137 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 3138 // Back-up string to avoid reading beyond string. 
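    // Re-position result so that the final 16-byte load ends exactly at the end
    // of the string, then rescan that last full chunk with cnt1 = stride.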
3139 lea(result, Address(result, cnt1, scale1, -16)); 3140 movl(cnt1, stride); 3141 jmpb(SCAN_TO_SUBSTR); 3142 3143 // Found a potential substr 3144 bind(FOUND_CANDIDATE); 3145 // After pcmpestri tmp(rcx) contains matched element index 3146 3147 // Make sure string is still long enough 3148 subl(cnt1, tmp); 3149 cmpl(cnt1, cnt2); 3150 jccb(Assembler::greaterEqual, FOUND_SUBSTR); 3151 // Left less then substring. 3152 3153 bind(RET_NOT_FOUND); 3154 movl(result, -1); 3155 jmp(CLEANUP); 3156 3157 bind(FOUND_SUBSTR); 3158 // Compute start addr of substr 3159 lea(result, Address(result, tmp, scale1)); 3160 if (int_cnt2 > 0) { // Constant substring 3161 // Repeat search for small substring (< 8 chars) 3162 // from new point without reloading substring. 3163 // Have to check that we don't read beyond string. 3164 cmpl(tmp, stride-int_cnt2); 3165 jccb(Assembler::greater, ADJUST_STR); 3166 // Fall through if matched whole substring. 3167 } else { // non constant 3168 assert(int_cnt2 == -1, "should be != 0"); 3169 3170 addl(tmp, cnt2); 3171 // Found result if we matched whole substring. 3172 cmpl(tmp, stride); 3173 jcc(Assembler::lessEqual, RET_FOUND); 3174 3175 // Repeat search for small substring (<= 8 chars) 3176 // from new point 'str1' without reloading substring. 3177 cmpl(cnt2, stride); 3178 // Have to check that we don't read beyond string. 3179 jccb(Assembler::lessEqual, ADJUST_STR); 3180 3181 Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG; 3182 // Compare the rest of substring (> 8 chars). 3183 movptr(str1, result); 3184 3185 cmpl(tmp, cnt2); 3186 // First 8 chars are already matched. 3187 jccb(Assembler::equal, CHECK_NEXT); 3188 3189 bind(SCAN_SUBSTR); 3190 pcmpestri(vec, Address(str1, 0), mode); 3191 // Need to reload strings pointers if not matched whole vector 3192 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 3193 3194 bind(CHECK_NEXT); 3195 subl(cnt2, stride); 3196 jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring 3197 addptr(str1, 16); 3198 if (ae == StrIntrinsicNode::UL) { 3199 addptr(str2, 8); 3200 } else { 3201 addptr(str2, 16); 3202 } 3203 subl(cnt1, stride); 3204 cmpl(cnt2, stride); // Do not read beyond substring 3205 jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR); 3206 // Back-up strings to avoid reading beyond substring. 
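    // Re-position str1 and str2 so the next vector load ends exactly at the end
    // of the substring (an 8-byte load of str2 for UL, 16 bytes otherwise) and
    // adjust the counters to match.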
3207 3208 if (ae == StrIntrinsicNode::UL) { 3209 lea(str2, Address(str2, cnt2, scale2, -8)); 3210 lea(str1, Address(str1, cnt2, scale1, -16)); 3211 } else { 3212 lea(str2, Address(str2, cnt2, scale2, -16)); 3213 lea(str1, Address(str1, cnt2, scale1, -16)); 3214 } 3215 subl(cnt1, cnt2); 3216 movl(cnt2, stride); 3217 addl(cnt1, stride); 3218 bind(CONT_SCAN_SUBSTR); 3219 if (ae == StrIntrinsicNode::UL) { 3220 pmovzxbw(vec, Address(str2, 0)); 3221 } else { 3222 movdqu(vec, Address(str2, 0)); 3223 } 3224 jmp(SCAN_SUBSTR); 3225 3226 bind(RET_FOUND_LONG); 3227 movptr(str1, Address(rsp, wordSize)); 3228 } // non constant 3229 3230 bind(RET_FOUND); 3231 // Compute substr offset 3232 subptr(result, str1); 3233 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3234 shrl(result, 1); // index 3235 } 3236 bind(CLEANUP); 3237 pop(rsp); // restore SP 3238 3239 } // string_indexof 3240 3241 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3242 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3243 ShortBranchVerifier sbv(this); 3244 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3245 3246 int stride = 8; 3247 3248 Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP, 3249 SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP, 3250 RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT, 3251 FOUND_SEQ_CHAR, DONE_LABEL; 3252 3253 movptr(result, str1); 3254 if (UseAVX >= 2) { 3255 cmpl(cnt1, stride); 3256 jcc(Assembler::less, SCAN_TO_CHAR); 3257 cmpl(cnt1, 2*stride); 3258 jcc(Assembler::less, SCAN_TO_8_CHAR_INIT); 3259 movdl(vec1, ch); 3260 vpbroadcastw(vec1, vec1, Assembler::AVX_256bit); 3261 vpxor(vec2, vec2); 3262 movl(tmp, cnt1); 3263 andl(tmp, 0xFFFFFFF0); //vector count (in chars) 3264 andl(cnt1,0x0000000F); //tail count (in chars) 3265 3266 bind(SCAN_TO_16_CHAR_LOOP); 3267 vmovdqu(vec3, Address(result, 0)); 3268 vpcmpeqw(vec3, vec3, vec1, 1); 3269 vptest(vec2, vec3); 3270 jcc(Assembler::carryClear, FOUND_CHAR); 3271 addptr(result, 32); 3272 subl(tmp, 2*stride); 3273 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); 3274 jmp(SCAN_TO_8_CHAR); 3275 bind(SCAN_TO_8_CHAR_INIT); 3276 movdl(vec1, ch); 3277 pshuflw(vec1, vec1, 0x00); 3278 pshufd(vec1, vec1, 0); 3279 pxor(vec2, vec2); 3280 } 3281 bind(SCAN_TO_8_CHAR); 3282 cmpl(cnt1, stride); 3283 jcc(Assembler::less, SCAN_TO_CHAR); 3284 if (UseAVX < 2) { 3285 movdl(vec1, ch); 3286 pshuflw(vec1, vec1, 0x00); 3287 pshufd(vec1, vec1, 0); 3288 pxor(vec2, vec2); 3289 } 3290 movl(tmp, cnt1); 3291 andl(tmp, 0xFFFFFFF8); //vector count (in chars) 3292 andl(cnt1,0x00000007); //tail count (in chars) 3293 3294 bind(SCAN_TO_8_CHAR_LOOP); 3295 movdqu(vec3, Address(result, 0)); 3296 pcmpeqw(vec3, vec1); 3297 ptest(vec2, vec3); 3298 jcc(Assembler::carryClear, FOUND_CHAR); 3299 addptr(result, 16); 3300 subl(tmp, stride); 3301 jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP); 3302 bind(SCAN_TO_CHAR); 3303 testl(cnt1, cnt1); 3304 jcc(Assembler::zero, RET_NOT_FOUND); 3305 bind(SCAN_TO_CHAR_LOOP); 3306 load_unsigned_short(tmp, Address(result, 0)); 3307 cmpl(ch, tmp); 3308 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3309 addptr(result, 2); 3310 subl(cnt1, 1); 3311 jccb(Assembler::zero, RET_NOT_FOUND); 3312 jmp(SCAN_TO_CHAR_LOOP); 3313 3314 bind(RET_NOT_FOUND); 3315 movl(result, -1); 3316 jmpb(DONE_LABEL); 3317 3318 bind(FOUND_CHAR); 3319 if (UseAVX >= 2) { 3320 vpmovmskb(tmp, vec3); 3321 } else { 3322 pmovmskb(tmp, vec3); 3323 } 3324 bsfl(ch, tmp); 3325 addptr(result, ch); 3326 3327 bind(FOUND_SEQ_CHAR); 3328 
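  // Convert the pointer to the matched character into a char index:
  // byte offset from str1, halved because the elements are 2-byte UTF-16 chars.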
subptr(result, str1); 3329 shrl(result, 1); 3330 3331 bind(DONE_LABEL); 3332 } // string_indexof_char 3333 3334 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3335 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3336 ShortBranchVerifier sbv(this); 3337 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3338 3339 int stride = 16; 3340 3341 Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP, 3342 SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP, 3343 RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT, 3344 FOUND_SEQ_CHAR, DONE_LABEL; 3345 3346 movptr(result, str1); 3347 if (UseAVX >= 2) { 3348 cmpl(cnt1, stride); 3349 jcc(Assembler::less, SCAN_TO_CHAR_INIT); 3350 cmpl(cnt1, stride*2); 3351 jcc(Assembler::less, SCAN_TO_16_CHAR_INIT); 3352 movdl(vec1, ch); 3353 vpbroadcastb(vec1, vec1, Assembler::AVX_256bit); 3354 vpxor(vec2, vec2); 3355 movl(tmp, cnt1); 3356 andl(tmp, 0xFFFFFFE0); //vector count (in chars) 3357 andl(cnt1,0x0000001F); //tail count (in chars) 3358 3359 bind(SCAN_TO_32_CHAR_LOOP); 3360 vmovdqu(vec3, Address(result, 0)); 3361 vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit); 3362 vptest(vec2, vec3); 3363 jcc(Assembler::carryClear, FOUND_CHAR); 3364 addptr(result, 32); 3365 subl(tmp, stride*2); 3366 jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP); 3367 jmp(SCAN_TO_16_CHAR); 3368 3369 bind(SCAN_TO_16_CHAR_INIT); 3370 movdl(vec1, ch); 3371 pxor(vec2, vec2); 3372 pshufb(vec1, vec2); 3373 } 3374 3375 bind(SCAN_TO_16_CHAR); 3376 cmpl(cnt1, stride); 3377 jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left 3378 if (UseAVX < 2) { 3379 movdl(vec1, ch); 3380 pxor(vec2, vec2); 3381 pshufb(vec1, vec2); 3382 } 3383 movl(tmp, cnt1); 3384 andl(tmp, 0xFFFFFFF0); //vector count (in bytes) 3385 andl(cnt1,0x0000000F); //tail count (in bytes) 3386 3387 bind(SCAN_TO_16_CHAR_LOOP); 3388 movdqu(vec3, Address(result, 0)); 3389 pcmpeqb(vec3, vec1); 3390 ptest(vec2, vec3); 3391 jcc(Assembler::carryClear, FOUND_CHAR); 3392 addptr(result, 16); 3393 subl(tmp, stride); 3394 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items... 
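  // Scalar tail: fewer than 16 bytes remain (count in cnt1); compare them one
  // byte at a time below.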
3395 3396 bind(SCAN_TO_CHAR_INIT); 3397 testl(cnt1, cnt1); 3398 jcc(Assembler::zero, RET_NOT_FOUND); 3399 bind(SCAN_TO_CHAR_LOOP); 3400 load_unsigned_byte(tmp, Address(result, 0)); 3401 cmpl(ch, tmp); 3402 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3403 addptr(result, 1); 3404 subl(cnt1, 1); 3405 jccb(Assembler::zero, RET_NOT_FOUND); 3406 jmp(SCAN_TO_CHAR_LOOP); 3407 3408 bind(RET_NOT_FOUND); 3409 movl(result, -1); 3410 jmpb(DONE_LABEL); 3411 3412 bind(FOUND_CHAR); 3413 if (UseAVX >= 2) { 3414 vpmovmskb(tmp, vec3); 3415 } else { 3416 pmovmskb(tmp, vec3); 3417 } 3418 bsfl(ch, tmp); 3419 addptr(result, ch); 3420 3421 bind(FOUND_SEQ_CHAR); 3422 subptr(result, str1); 3423 3424 bind(DONE_LABEL); 3425 } // stringL_indexof_char 3426 3427 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) { 3428 switch (eltype) { 3429 case T_BOOLEAN: return sizeof(jboolean); 3430 case T_BYTE: return sizeof(jbyte); 3431 case T_SHORT: return sizeof(jshort); 3432 case T_CHAR: return sizeof(jchar); 3433 case T_INT: return sizeof(jint); 3434 default: 3435 ShouldNotReachHere(); 3436 return -1; 3437 } 3438 } 3439 3440 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) { 3441 switch (eltype) { 3442 // T_BOOLEAN used as surrogate for unsigned byte 3443 case T_BOOLEAN: movzbl(dst, src); break; 3444 case T_BYTE: movsbl(dst, src); break; 3445 case T_SHORT: movswl(dst, src); break; 3446 case T_CHAR: movzwl(dst, src); break; 3447 case T_INT: movl(dst, src); break; 3448 default: 3449 ShouldNotReachHere(); 3450 } 3451 } 3452 3453 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) { 3454 load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8); 3455 } 3456 3457 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) { 3458 load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8); 3459 } 3460 3461 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) { 3462 const int vlen = Assembler::AVX_256bit; 3463 switch (eltype) { 3464 case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3465 case T_BYTE: vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3466 case T_SHORT: vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3467 case T_CHAR: vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3468 case T_INT: 3469 // do nothing 3470 break; 3471 default: 3472 ShouldNotReachHere(); 3473 } 3474 } 3475 3476 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result, 3477 Register index, Register tmp2, Register tmp3, XMMRegister vnext, 3478 XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3, 3479 XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3, 3480 XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3, 3481 BasicType eltype) { 3482 ShortBranchVerifier sbv(this); 3483 assert(UseAVX >= 2, "AVX2 intrinsics are required"); 3484 assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3); 3485 assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3); 3486 3487 Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN, 3488 SHORT_UNROLLED_LOOP_EXIT, 3489 UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME, 3490 UNROLLED_VECTOR_LOOP_BEGIN, 3491 END; 3492 switch (eltype) { 3493 case T_BOOLEAN: 
BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break; 3494 case T_CHAR: BLOCK_COMMENT("arrays_hashcode(char) {"); break; 3495 case T_BYTE: BLOCK_COMMENT("arrays_hashcode(byte) {"); break; 3496 case T_SHORT: BLOCK_COMMENT("arrays_hashcode(short) {"); break; 3497 case T_INT: BLOCK_COMMENT("arrays_hashcode(int) {"); break; 3498 default: BLOCK_COMMENT("arrays_hashcode {"); break; 3499 } 3500 3501 // For "renaming" for readibility of the code 3502 const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 }, 3503 vresult[] = { vresult0, vresult1, vresult2, vresult3 }, 3504 vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 }; 3505 3506 const int elsize = arrays_hashcode_elsize(eltype); 3507 3508 /* 3509 if (cnt1 >= 2) { 3510 if (cnt1 >= 32) { 3511 UNROLLED VECTOR LOOP 3512 } 3513 UNROLLED SCALAR LOOP 3514 } 3515 SINGLE SCALAR 3516 */ 3517 3518 cmpl(cnt1, 32); 3519 jcc(Assembler::less, SHORT_UNROLLED_BEGIN); 3520 3521 // cnt1 >= 32 && generate_vectorized_loop 3522 xorl(index, index); 3523 3524 // vresult = IntVector.zero(I256); 3525 for (int idx = 0; idx < 4; idx++) { 3526 vpxor(vresult[idx], vresult[idx]); 3527 } 3528 // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]); 3529 Register bound = tmp2; 3530 Register next = tmp3; 3531 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint)))); 3532 movl(next, Address(tmp2, 0)); 3533 movdl(vnext, next); 3534 vpbroadcastd(vnext, vnext, Assembler::AVX_256bit); 3535 3536 // index = 0; 3537 // bound = cnt1 & ~(32 - 1); 3538 movl(bound, cnt1); 3539 andl(bound, ~(32 - 1)); 3540 // for (; index < bound; index += 32) { 3541 bind(UNROLLED_VECTOR_LOOP_BEGIN); 3542 // result *= next; 3543 imull(result, next); 3544 // loop fission to upfront the cost of fetching from memory, OOO execution 3545 // can then hopefully do a better job of prefetching 3546 for (int idx = 0; idx < 4; idx++) { 3547 arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype); 3548 } 3549 // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7]; 3550 for (int idx = 0; idx < 4; idx++) { 3551 vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit); 3552 arrays_hashcode_elvcast(vtmp[idx], eltype); 3553 vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit); 3554 } 3555 // index += 32; 3556 addl(index, 32); 3557 // index < bound; 3558 cmpl(index, bound); 3559 jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN); 3560 // } 3561 3562 lea(ary1, Address(ary1, bound, Address::times(elsize))); 3563 subl(cnt1, bound); 3564 // release bound 3565 3566 // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1); 3567 for (int idx = 0; idx < 4; idx++) { 3568 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint)))); 3569 arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT); 3570 vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit); 3571 } 3572 // result += vresult.reduceLanes(ADD); 3573 for (int idx = 0; idx < 4; idx++) { 3574 reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]); 3575 } 3576 3577 // } else if (cnt1 < 32) { 3578 3579 bind(SHORT_UNROLLED_BEGIN); 3580 // int i = 1; 3581 movl(index, 1); 3582 cmpl(index, cnt1); 3583 jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT); 3584 3585 // for (; i < cnt1 ; i += 2) { 3586 bind(SHORT_UNROLLED_LOOP_BEGIN); 3587 movl(tmp3, 961); 3588 imull(result, tmp3); 3589 
arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3590 movl(tmp3, tmp2); 3591 shll(tmp3, 5); 3592 subl(tmp3, tmp2); 3593 addl(result, tmp3); 3594 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype); 3595 addl(result, tmp3); 3596 addl(index, 2); 3597 cmpl(index, cnt1); 3598 jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN); 3599 3600 // } 3601 // if (i >= cnt1) { 3602 bind(SHORT_UNROLLED_LOOP_EXIT); 3603 jccb(Assembler::greater, END); 3604 movl(tmp2, result); 3605 shll(result, 5); 3606 subl(result, tmp2); 3607 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3608 addl(result, tmp3); 3609 // } 3610 bind(END); 3611 3612 BLOCK_COMMENT("} // arrays_hashcode"); 3613 3614 } // arrays_hashcode 3615 3616 // helper function for string_compare 3617 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2, 3618 Address::ScaleFactor scale, Address::ScaleFactor scale1, 3619 Address::ScaleFactor scale2, Register index, int ae) { 3620 if (ae == StrIntrinsicNode::LL) { 3621 load_unsigned_byte(elem1, Address(str1, index, scale, 0)); 3622 load_unsigned_byte(elem2, Address(str2, index, scale, 0)); 3623 } else if (ae == StrIntrinsicNode::UU) { 3624 load_unsigned_short(elem1, Address(str1, index, scale, 0)); 3625 load_unsigned_short(elem2, Address(str2, index, scale, 0)); 3626 } else { 3627 load_unsigned_byte(elem1, Address(str1, index, scale1, 0)); 3628 load_unsigned_short(elem2, Address(str2, index, scale2, 0)); 3629 } 3630 } 3631 3632 // Compare strings, used for char[] and byte[]. 3633 void C2_MacroAssembler::string_compare(Register str1, Register str2, 3634 Register cnt1, Register cnt2, Register result, 3635 XMMRegister vec1, int ae, KRegister mask) { 3636 ShortBranchVerifier sbv(this); 3637 Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL; 3638 Label COMPARE_WIDE_VECTORS_LOOP_FAILED; // used only _LP64 && AVX3 3639 int stride, stride2, adr_stride, adr_stride1, adr_stride2; 3640 int stride2x2 = 0x40; 3641 Address::ScaleFactor scale = Address::no_scale; 3642 Address::ScaleFactor scale1 = Address::no_scale; 3643 Address::ScaleFactor scale2 = Address::no_scale; 3644 3645 if (ae != StrIntrinsicNode::LL) { 3646 stride2x2 = 0x20; 3647 } 3648 3649 if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) { 3650 shrl(cnt2, 1); 3651 } 3652 // Compute the minimum of the string lengths and the 3653 // difference of the string lengths (stack). 3654 // Do the conditional move stuff 3655 movl(result, cnt1); 3656 subl(cnt1, cnt2); 3657 push(cnt1); 3658 cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2) 3659 3660 // Is the minimum length zero? 
3661 testl(cnt2, cnt2); 3662 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3663 if (ae == StrIntrinsicNode::LL) { 3664 // Load first bytes 3665 load_unsigned_byte(result, Address(str1, 0)); // result = str1[0] 3666 load_unsigned_byte(cnt1, Address(str2, 0)); // cnt1 = str2[0] 3667 } else if (ae == StrIntrinsicNode::UU) { 3668 // Load first characters 3669 load_unsigned_short(result, Address(str1, 0)); 3670 load_unsigned_short(cnt1, Address(str2, 0)); 3671 } else { 3672 load_unsigned_byte(result, Address(str1, 0)); 3673 load_unsigned_short(cnt1, Address(str2, 0)); 3674 } 3675 subl(result, cnt1); 3676 jcc(Assembler::notZero, POP_LABEL); 3677 3678 if (ae == StrIntrinsicNode::UU) { 3679 // Divide length by 2 to get number of chars 3680 shrl(cnt2, 1); 3681 } 3682 cmpl(cnt2, 1); 3683 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3684 3685 // Check if the strings start at the same location and setup scale and stride 3686 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3687 cmpptr(str1, str2); 3688 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3689 if (ae == StrIntrinsicNode::LL) { 3690 scale = Address::times_1; 3691 stride = 16; 3692 } else { 3693 scale = Address::times_2; 3694 stride = 8; 3695 } 3696 } else { 3697 scale1 = Address::times_1; 3698 scale2 = Address::times_2; 3699 // scale not used 3700 stride = 8; 3701 } 3702 3703 if (UseAVX >= 2 && UseSSE42Intrinsics) { 3704 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR; 3705 Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR; 3706 Label COMPARE_WIDE_VECTORS_LOOP_AVX2; 3707 Label COMPARE_TAIL_LONG; 3708 Label COMPARE_WIDE_VECTORS_LOOP_AVX3; // used only _LP64 && AVX3 3709 3710 int pcmpmask = 0x19; 3711 if (ae == StrIntrinsicNode::LL) { 3712 pcmpmask &= ~0x01; 3713 } 3714 3715 // Setup to compare 16-chars (32-bytes) vectors, 3716 // start from first character again because it has aligned address. 3717 if (ae == StrIntrinsicNode::LL) { 3718 stride2 = 32; 3719 } else { 3720 stride2 = 16; 3721 } 3722 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3723 adr_stride = stride << scale; 3724 } else { 3725 adr_stride1 = 8; //stride << scale1; 3726 adr_stride2 = 16; //stride << scale2; 3727 } 3728 3729 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3730 // rax and rdx are used by pcmpestri as elements counters 3731 movl(result, cnt2); 3732 andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count 3733 jcc(Assembler::zero, COMPARE_TAIL_LONG); 3734 3735 // fast path : compare first 2 8-char vectors. 
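    // pcmpestri in "equal each, negated" mode sets CF when some element differs
    // and leaves the index of the first difference in rcx (cnt1).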
3736 bind(COMPARE_16_CHARS); 3737 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3738 movdqu(vec1, Address(str1, 0)); 3739 } else { 3740 pmovzxbw(vec1, Address(str1, 0)); 3741 } 3742 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3743 jccb(Assembler::below, COMPARE_INDEX_CHAR); 3744 3745 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3746 movdqu(vec1, Address(str1, adr_stride)); 3747 pcmpestri(vec1, Address(str2, adr_stride), pcmpmask); 3748 } else { 3749 pmovzxbw(vec1, Address(str1, adr_stride1)); 3750 pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask); 3751 } 3752 jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS); 3753 addl(cnt1, stride); 3754 3755 // Compare the characters at index in cnt1 3756 bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character 3757 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3758 subl(result, cnt2); 3759 jmp(POP_LABEL); 3760 3761 // Setup the registers to start vector comparison loop 3762 bind(COMPARE_WIDE_VECTORS); 3763 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3764 lea(str1, Address(str1, result, scale)); 3765 lea(str2, Address(str2, result, scale)); 3766 } else { 3767 lea(str1, Address(str1, result, scale1)); 3768 lea(str2, Address(str2, result, scale2)); 3769 } 3770 subl(result, stride2); 3771 subl(cnt2, stride2); 3772 jcc(Assembler::zero, COMPARE_WIDE_TAIL); 3773 negptr(result); 3774 3775 // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest) 3776 bind(COMPARE_WIDE_VECTORS_LOOP); 3777 3778 #ifdef _LP64 3779 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 3780 cmpl(cnt2, stride2x2); 3781 jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2); 3782 testl(cnt2, stride2x2-1); // cnt2 holds the vector count 3783 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // means we cannot subtract by 0x40 3784 3785 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 3786 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3787 evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit); 3788 evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3789 } else { 3790 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit); 3791 evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3792 } 3793 kortestql(mask, mask); 3794 jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare 3795 addptr(result, stride2x2); // update since we already compared at this addr 3796 subl(cnt2, stride2x2); // and sub the size too 3797 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3); 3798 3799 vpxor(vec1, vec1); 3800 jmpb(COMPARE_WIDE_TAIL); 3801 }//if (VM_Version::supports_avx512vlbw()) 3802 #endif // _LP64 3803 3804 3805 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3806 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3807 vmovdqu(vec1, Address(str1, result, scale)); 3808 vpxor(vec1, Address(str2, result, scale)); 3809 } else { 3810 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit); 3811 vpxor(vec1, Address(str2, result, scale2)); 3812 } 3813 vptest(vec1, vec1); 3814 jcc(Assembler::notZero, VECTOR_NOT_EQUAL); 3815 addptr(result, stride2); 3816 subl(cnt2, stride2); 3817 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP); 3818 // clean upper bits of YMM registers 
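    // (zeroing vec1 with a VEX-encoded vpxor keeps the upper YMM bits clean,
    // presumably to avoid AVX/SSE transition penalties in legacy-encoded code
    // that may run afterwards)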
3819 vpxor(vec1, vec1); 3820 3821 // compare wide vectors tail 3822 bind(COMPARE_WIDE_TAIL); 3823 testptr(result, result); 3824 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3825 3826 movl(result, stride2); 3827 movl(cnt2, result); 3828 negptr(result); 3829 jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3830 3831 // Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors. 3832 bind(VECTOR_NOT_EQUAL); 3833 // clean upper bits of YMM registers 3834 vpxor(vec1, vec1); 3835 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3836 lea(str1, Address(str1, result, scale)); 3837 lea(str2, Address(str2, result, scale)); 3838 } else { 3839 lea(str1, Address(str1, result, scale1)); 3840 lea(str2, Address(str2, result, scale2)); 3841 } 3842 jmp(COMPARE_16_CHARS); 3843 3844 // Compare tail chars, length between 1 to 15 chars 3845 bind(COMPARE_TAIL_LONG); 3846 movl(cnt2, result); 3847 cmpl(cnt2, stride); 3848 jcc(Assembler::less, COMPARE_SMALL_STR); 3849 3850 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3851 movdqu(vec1, Address(str1, 0)); 3852 } else { 3853 pmovzxbw(vec1, Address(str1, 0)); 3854 } 3855 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3856 jcc(Assembler::below, COMPARE_INDEX_CHAR); 3857 subptr(cnt2, stride); 3858 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3859 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3860 lea(str1, Address(str1, result, scale)); 3861 lea(str2, Address(str2, result, scale)); 3862 } else { 3863 lea(str1, Address(str1, result, scale1)); 3864 lea(str2, Address(str2, result, scale2)); 3865 } 3866 negptr(cnt2); 3867 jmpb(WHILE_HEAD_LABEL); 3868 3869 bind(COMPARE_SMALL_STR); 3870 } else if (UseSSE42Intrinsics) { 3871 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL; 3872 int pcmpmask = 0x19; 3873 // Setup to compare 8-char (16-byte) vectors, 3874 // start from first character again because it has aligned address. 
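    // result keeps the full minimum length; cnt2 is rounded down to a whole
    // number of 8-element chunks for the pcmpestri loop below.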
3875 movl(result, cnt2); 3876 andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count 3877 if (ae == StrIntrinsicNode::LL) { 3878 pcmpmask &= ~0x01; 3879 } 3880 jcc(Assembler::zero, COMPARE_TAIL); 3881 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3882 lea(str1, Address(str1, result, scale)); 3883 lea(str2, Address(str2, result, scale)); 3884 } else { 3885 lea(str1, Address(str1, result, scale1)); 3886 lea(str2, Address(str2, result, scale2)); 3887 } 3888 negptr(result); 3889 3890 // pcmpestri 3891 // inputs: 3892 // vec1- substring 3893 // rax - negative string length (elements count) 3894 // mem - scanned string 3895 // rdx - string length (elements count) 3896 // pcmpmask - cmp mode: 11000 (string compare with negated result) 3897 // + 00 (unsigned bytes) or + 01 (unsigned shorts) 3898 // outputs: 3899 // rcx - first mismatched element index 3900 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3901 3902 bind(COMPARE_WIDE_VECTORS); 3903 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3904 movdqu(vec1, Address(str1, result, scale)); 3905 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 3906 } else { 3907 pmovzxbw(vec1, Address(str1, result, scale1)); 3908 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 3909 } 3910 // After pcmpestri cnt1(rcx) contains mismatched element index 3911 3912 jccb(Assembler::below, VECTOR_NOT_EQUAL); // CF==1 3913 addptr(result, stride); 3914 subptr(cnt2, stride); 3915 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 3916 3917 // compare wide vectors tail 3918 testptr(result, result); 3919 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3920 3921 movl(cnt2, stride); 3922 movl(result, stride); 3923 negptr(result); 3924 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3925 movdqu(vec1, Address(str1, result, scale)); 3926 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 3927 } else { 3928 pmovzxbw(vec1, Address(str1, result, scale1)); 3929 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 3930 } 3931 jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL); 3932 3933 // Mismatched characters in the vectors 3934 bind(VECTOR_NOT_EQUAL); 3935 addptr(cnt1, result); 3936 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3937 subl(result, cnt2); 3938 jmpb(POP_LABEL); 3939 3940 bind(COMPARE_TAIL); // limit is zero 3941 movl(cnt2, result); 3942 // Fallthru to tail compare 3943 } 3944 // Shift str2 and str1 to the end of the arrays, negate min 3945 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3946 lea(str1, Address(str1, cnt2, scale)); 3947 lea(str2, Address(str2, cnt2, scale)); 3948 } else { 3949 lea(str1, Address(str1, cnt2, scale1)); 3950 lea(str2, Address(str2, cnt2, scale2)); 3951 } 3952 decrementl(cnt2); // first character was compared already 3953 negptr(cnt2); 3954 3955 // Compare the rest of the elements 3956 bind(WHILE_HEAD_LABEL); 3957 load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae); 3958 subl(result, cnt1); 3959 jccb(Assembler::notZero, POP_LABEL); 3960 increment(cnt2); 3961 jccb(Assembler::notZero, WHILE_HEAD_LABEL); 3962 3963 // Strings are equal up to min length. Return the length difference. 
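    // The length difference saved at entry is popped here; for UU it is still a
    // byte difference, so it is halved below to yield a char count.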
3964 bind(LENGTH_DIFF_LABEL); 3965 pop(result); 3966 if (ae == StrIntrinsicNode::UU) { 3967 // Divide diff by 2 to get number of chars 3968 sarl(result, 1); 3969 } 3970 jmpb(DONE_LABEL); 3971 3972 #ifdef _LP64 3973 if (VM_Version::supports_avx512vlbw()) { 3974 3975 bind(COMPARE_WIDE_VECTORS_LOOP_FAILED); 3976 3977 kmovql(cnt1, mask); 3978 notq(cnt1); 3979 bsfq(cnt2, cnt1); 3980 if (ae != StrIntrinsicNode::LL) { 3981 // Divide diff by 2 to get number of chars 3982 sarl(cnt2, 1); 3983 } 3984 addq(result, cnt2); 3985 if (ae == StrIntrinsicNode::LL) { 3986 load_unsigned_byte(cnt1, Address(str2, result)); 3987 load_unsigned_byte(result, Address(str1, result)); 3988 } else if (ae == StrIntrinsicNode::UU) { 3989 load_unsigned_short(cnt1, Address(str2, result, scale)); 3990 load_unsigned_short(result, Address(str1, result, scale)); 3991 } else { 3992 load_unsigned_short(cnt1, Address(str2, result, scale2)); 3993 load_unsigned_byte(result, Address(str1, result, scale1)); 3994 } 3995 subl(result, cnt1); 3996 jmpb(POP_LABEL); 3997 }//if (VM_Version::supports_avx512vlbw()) 3998 #endif // _LP64 3999 4000 // Discard the stored length difference 4001 bind(POP_LABEL); 4002 pop(cnt1); 4003 4004 // That's it 4005 bind(DONE_LABEL); 4006 if(ae == StrIntrinsicNode::UL) { 4007 negl(result); 4008 } 4009 4010 } 4011 4012 // Search for Non-ASCII character (Negative byte value) in a byte array, 4013 // return the index of the first such character, otherwise the length 4014 // of the array segment searched. 4015 // ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java 4016 // @IntrinsicCandidate 4017 // public static int countPositives(byte[] ba, int off, int len) { 4018 // for (int i = off; i < off + len; i++) { 4019 // if (ba[i] < 0) { 4020 // return i - off; 4021 // } 4022 // } 4023 // return len; 4024 // } 4025 void C2_MacroAssembler::count_positives(Register ary1, Register len, 4026 Register result, Register tmp1, 4027 XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) { 4028 // rsi: byte array 4029 // rcx: len 4030 // rax: result 4031 ShortBranchVerifier sbv(this); 4032 assert_different_registers(ary1, len, result, tmp1); 4033 assert_different_registers(vec1, vec2); 4034 Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE; 4035 4036 movl(result, len); // copy 4037 // len == 0 4038 testl(len, len); 4039 jcc(Assembler::zero, DONE); 4040 4041 if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512 4042 VM_Version::supports_avx512vlbw() && 4043 VM_Version::supports_bmi2()) { 4044 4045 Label test_64_loop, test_tail, BREAK_LOOP; 4046 movl(tmp1, len); 4047 vpxor(vec2, vec2, vec2, Assembler::AVX_512bit); 4048 4049 andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F 4050 andl(len, 0xffffffc0); // vector count (in chars) 4051 jccb(Assembler::zero, test_tail); 4052 4053 lea(ary1, Address(ary1, len, Address::times_1)); 4054 negptr(len); 4055 4056 bind(test_64_loop); 4057 // Check whether our 64 elements of size byte contain negatives 4058 evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit); 4059 kortestql(mask1, mask1); 4060 jcc(Assembler::notZero, BREAK_LOOP); 4061 4062 addptr(len, 64); 4063 jccb(Assembler::notZero, test_64_loop); 4064 4065 bind(test_tail); 4066 // bail out when there is nothing to be done 4067 testl(tmp1, -1); 4068 jcc(Assembler::zero, DONE); 4069 4070 4071 // check the tail for absense of negatives 4072 // ~(~0 << len) applied up to two times (for 32-bit scenario) 4073 #ifdef _LP64 4074 { 4075 
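      // Build a k-register mask with the low tmp1 bits set, i.e. ~(~0 << tmp1),
      // using shlx and not.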
Register tmp3_aliased = len; 4076 mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF); 4077 shlxq(tmp3_aliased, tmp3_aliased, tmp1); 4078 notq(tmp3_aliased); 4079 kmovql(mask2, tmp3_aliased); 4080 } 4081 #else 4082 Label k_init; 4083 jmp(k_init); 4084 4085 // We could not read 64-bits from a general purpose register thus we move 4086 // data required to compose 64 1's to the instruction stream 4087 // We emit 64 byte wide series of elements from 0..63 which later on would 4088 // be used as a compare targets with tail count contained in tmp1 register. 4089 // Result would be a k register having tmp1 consecutive number or 1 4090 // counting from least significant bit. 4091 address tmp = pc(); 4092 emit_int64(0x0706050403020100); 4093 emit_int64(0x0F0E0D0C0B0A0908); 4094 emit_int64(0x1716151413121110); 4095 emit_int64(0x1F1E1D1C1B1A1918); 4096 emit_int64(0x2726252423222120); 4097 emit_int64(0x2F2E2D2C2B2A2928); 4098 emit_int64(0x3736353433323130); 4099 emit_int64(0x3F3E3D3C3B3A3938); 4100 4101 bind(k_init); 4102 lea(len, InternalAddress(tmp)); 4103 // create mask to test for negative byte inside a vector 4104 evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit); 4105 evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit); 4106 4107 #endif 4108 evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit); 4109 ktestq(mask1, mask2); 4110 jcc(Assembler::zero, DONE); 4111 4112 // do a full check for negative registers in the tail 4113 movl(len, tmp1); // tmp1 holds low 6-bit from original len; 4114 // ary1 already pointing to the right place 4115 jmpb(TAIL_START); 4116 4117 bind(BREAK_LOOP); 4118 // At least one byte in the last 64 byte block was negative. 4119 // Set up to look at the last 64 bytes as if they were a tail 4120 lea(ary1, Address(ary1, len, Address::times_1)); 4121 addptr(result, len); 4122 // Ignore the very last byte: if all others are positive, 4123 // it must be negative, so we can skip right to the 2+1 byte 4124 // end comparison at this point 4125 orl(result, 63); 4126 movl(len, 63); 4127 // Fallthru to tail compare 4128 } else { 4129 4130 if (UseAVX >= 2 && UseSSE >= 2) { 4131 // With AVX2, use 32-byte vector compare 4132 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 4133 4134 // Compare 32-byte vectors 4135 testl(len, 0xffffffe0); // vector count (in bytes) 4136 jccb(Assembler::zero, TAIL_START); 4137 4138 andl(len, 0xffffffe0); 4139 lea(ary1, Address(ary1, len, Address::times_1)); 4140 negptr(len); 4141 4142 movl(tmp1, 0x80808080); // create mask to test for Unicode chars in vector 4143 movdl(vec2, tmp1); 4144 vpbroadcastd(vec2, vec2, Assembler::AVX_256bit); 4145 4146 bind(COMPARE_WIDE_VECTORS); 4147 vmovdqu(vec1, Address(ary1, len, Address::times_1)); 4148 vptest(vec1, vec2); 4149 jccb(Assembler::notZero, BREAK_LOOP); 4150 addptr(len, 32); 4151 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4152 4153 testl(result, 0x0000001f); // any bytes remaining? 4154 jcc(Assembler::zero, DONE); 4155 4156 // Quick test using the already prepared vector mask 4157 movl(len, result); 4158 andl(len, 0x0000001f); 4159 vmovdqu(vec1, Address(ary1, len, Address::times_1, -32)); 4160 vptest(vec1, vec2); 4161 jcc(Assembler::zero, DONE); 4162 // There are zeros, jump to the tail to determine exactly where 4163 jmpb(TAIL_START); 4164 4165 bind(BREAK_LOOP); 4166 // At least one byte in the last 32-byte vector is negative. 
4167 // Set up to look at the last 32 bytes as if they were a tail 4168 lea(ary1, Address(ary1, len, Address::times_1)); 4169 addptr(result, len); 4170 // Ignore the very last byte: if all others are positive, 4171 // it must be negative, so we can skip right to the 2+1 byte 4172 // end comparison at this point 4173 orl(result, 31); 4174 movl(len, 31); 4175 // Fallthru to tail compare 4176 } else if (UseSSE42Intrinsics) { 4177 // With SSE4.2, use double quad vector compare 4178 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 4179 4180 // Compare 16-byte vectors 4181 testl(len, 0xfffffff0); // vector count (in bytes) 4182 jcc(Assembler::zero, TAIL_START); 4183 4184 andl(len, 0xfffffff0); 4185 lea(ary1, Address(ary1, len, Address::times_1)); 4186 negptr(len); 4187 4188 movl(tmp1, 0x80808080); 4189 movdl(vec2, tmp1); 4190 pshufd(vec2, vec2, 0); 4191 4192 bind(COMPARE_WIDE_VECTORS); 4193 movdqu(vec1, Address(ary1, len, Address::times_1)); 4194 ptest(vec1, vec2); 4195 jccb(Assembler::notZero, BREAK_LOOP); 4196 addptr(len, 16); 4197 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4198 4199 testl(result, 0x0000000f); // len is zero, any bytes remaining? 4200 jcc(Assembler::zero, DONE); 4201 4202 // Quick test using the already prepared vector mask 4203 movl(len, result); 4204 andl(len, 0x0000000f); // tail count (in bytes) 4205 movdqu(vec1, Address(ary1, len, Address::times_1, -16)); 4206 ptest(vec1, vec2); 4207 jcc(Assembler::zero, DONE); 4208 jmpb(TAIL_START); 4209 4210 bind(BREAK_LOOP); 4211 // At least one byte in the last 16-byte vector is negative. 4212 // Set up and look at the last 16 bytes as if they were a tail 4213 lea(ary1, Address(ary1, len, Address::times_1)); 4214 addptr(result, len); 4215 // Ignore the very last byte: if all others are positive, 4216 // it must be negative, so we can skip right to the 2+1 byte 4217 // end comparison at this point 4218 orl(result, 15); 4219 movl(len, 15); 4220 // Fallthru to tail compare 4221 } 4222 } 4223 4224 bind(TAIL_START); 4225 // Compare 4-byte vectors 4226 andl(len, 0xfffffffc); // vector count (in bytes) 4227 jccb(Assembler::zero, COMPARE_CHAR); 4228 4229 lea(ary1, Address(ary1, len, Address::times_1)); 4230 negptr(len); 4231 4232 bind(COMPARE_VECTORS); 4233 movl(tmp1, Address(ary1, len, Address::times_1)); 4234 andl(tmp1, 0x80808080); 4235 jccb(Assembler::notZero, TAIL_ADJUST); 4236 addptr(len, 4); 4237 jccb(Assembler::notZero, COMPARE_VECTORS); 4238 4239 // Compare trailing char (final 2-3 bytes), if any 4240 bind(COMPARE_CHAR); 4241 4242 testl(result, 0x2); // tail char 4243 jccb(Assembler::zero, COMPARE_BYTE); 4244 load_unsigned_short(tmp1, Address(ary1, 0)); 4245 andl(tmp1, 0x00008080); 4246 jccb(Assembler::notZero, CHAR_ADJUST); 4247 lea(ary1, Address(ary1, 2)); 4248 4249 bind(COMPARE_BYTE); 4250 testl(result, 0x1); // tail byte 4251 jccb(Assembler::zero, DONE); 4252 load_unsigned_byte(tmp1, Address(ary1, 0)); 4253 testl(tmp1, 0x00000080); 4254 jccb(Assembler::zero, DONE); 4255 subptr(result, 1); 4256 jmpb(DONE); 4257 4258 bind(TAIL_ADJUST); 4259 // there are negative bits in the last 4 byte block. 4260 // Adjust result and check the next three bytes 4261 addptr(result, len); 4262 orl(result, 3); 4263 lea(ary1, Address(ary1, len, Address::times_1)); 4264 jmpb(COMPARE_CHAR); 4265 4266 bind(CHAR_ADJUST); 4267 // We are looking at a char + optional byte tail, and found that one 4268 // of the bytes in the char is negative. Adjust the result, check the 4269 // first byte and readjust if needed. 
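  // Rewind result to the index of the char's first byte (the byte ary1 points
  // at); if that byte's sign bit is clear, the negative one is the second byte
  // and result is advanced by one below.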
4270 andl(result, 0xfffffffc); 4271 testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first 4272 jccb(Assembler::notZero, DONE); 4273 addptr(result, 1); 4274 4275 // That's it 4276 bind(DONE); 4277 if (UseAVX >= 2 && UseSSE >= 2) { 4278 // clean upper bits of YMM registers 4279 vpxor(vec1, vec1); 4280 vpxor(vec2, vec2); 4281 } 4282 } 4283 4284 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings. 4285 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2, 4286 Register limit, Register result, Register chr, 4287 XMMRegister vec1, XMMRegister vec2, bool is_char, 4288 KRegister mask, bool expand_ary2) { 4289 // for expand_ary2, limit is the (smaller) size of the second array. 4290 ShortBranchVerifier sbv(this); 4291 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE; 4292 4293 assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)), 4294 "Expansion only implemented for AVX2"); 4295 4296 int length_offset = arrayOopDesc::length_offset_in_bytes(); 4297 int base_offset = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE); 4298 4299 Address::ScaleFactor scaleFactor = expand_ary2 ? Address::times_2 : Address::times_1; 4300 int scaleIncr = expand_ary2 ? 8 : 16; 4301 4302 if (is_array_equ) { 4303 // Check the input args 4304 cmpoop(ary1, ary2); 4305 jcc(Assembler::equal, TRUE_LABEL); 4306 4307 // Need additional checks for arrays_equals. 4308 testptr(ary1, ary1); 4309 jcc(Assembler::zero, FALSE_LABEL); 4310 testptr(ary2, ary2); 4311 jcc(Assembler::zero, FALSE_LABEL); 4312 4313 // Check the lengths 4314 movl(limit, Address(ary1, length_offset)); 4315 cmpl(limit, Address(ary2, length_offset)); 4316 jcc(Assembler::notEqual, FALSE_LABEL); 4317 } 4318 4319 // count == 0 4320 testl(limit, limit); 4321 jcc(Assembler::zero, TRUE_LABEL); 4322 4323 if (is_array_equ) { 4324 // Load array address 4325 lea(ary1, Address(ary1, base_offset)); 4326 lea(ary2, Address(ary2, base_offset)); 4327 } 4328 4329 if (is_array_equ && is_char) { 4330 // arrays_equals when used for char[]. 
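    // limit currently holds the element count; for char[] double it so the
    // rest of the code can work uniformly in bytes.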
4331 shll(limit, 1); // byte count != 0 4332 } 4333 movl(result, limit); // copy 4334 4335 if (UseAVX >= 2) { 4336 // With AVX2, use 32-byte vector compare 4337 Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16; 4338 4339 // Compare 32-byte vectors 4340 if (expand_ary2) { 4341 andl(result, 0x0000000f); // tail count (in bytes) 4342 andl(limit, 0xfffffff0); // vector count (in bytes) 4343 jcc(Assembler::zero, COMPARE_TAIL); 4344 } else { 4345 andl(result, 0x0000001f); // tail count (in bytes) 4346 andl(limit, 0xffffffe0); // vector count (in bytes) 4347 jcc(Assembler::zero, COMPARE_TAIL_16); 4348 } 4349 4350 lea(ary1, Address(ary1, limit, scaleFactor)); 4351 lea(ary2, Address(ary2, limit, Address::times_1)); 4352 negptr(limit); 4353 4354 #ifdef _LP64 4355 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 4356 Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3; 4357 4358 cmpl(limit, -64); 4359 jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2); 4360 4361 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 4362 4363 evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit); 4364 evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit); 4365 kortestql(mask, mask); 4366 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4367 addptr(limit, 64); // update since we already compared at this addr 4368 cmpl(limit, -64); 4369 jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3); 4370 4371 // At this point we may still need to compare -limit+result bytes. 4372 // We could execute the next two instruction and just continue via non-wide path: 4373 // cmpl(limit, 0); 4374 // jcc(Assembler::equal, COMPARE_TAIL); // true 4375 // But since we stopped at the points ary{1,2}+limit which are 4376 // not farther than 64 bytes from the ends of arrays ary{1,2}+result 4377 // (|limit| <= 32 and result < 32), 4378 // we may just compare the last 64 bytes. 
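        // Re-checking bytes the loop has already verified is harmless for an
        // equality test.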
4379 // 4380 addptr(result, -64); // it is safe, bc we just came from this area 4381 evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit); 4382 evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit); 4383 kortestql(mask, mask); 4384 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4385 4386 jmp(TRUE_LABEL); 4387 4388 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 4389 4390 }//if (VM_Version::supports_avx512vlbw()) 4391 #endif //_LP64 4392 bind(COMPARE_WIDE_VECTORS); 4393 vmovdqu(vec1, Address(ary1, limit, scaleFactor)); 4394 if (expand_ary2) { 4395 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit); 4396 } else { 4397 vmovdqu(vec2, Address(ary2, limit, Address::times_1)); 4398 } 4399 vpxor(vec1, vec2); 4400 4401 vptest(vec1, vec1); 4402 jcc(Assembler::notZero, FALSE_LABEL); 4403 addptr(limit, scaleIncr * 2); 4404 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4405 4406 testl(result, result); 4407 jcc(Assembler::zero, TRUE_LABEL); 4408 4409 vmovdqu(vec1, Address(ary1, result, scaleFactor, -32)); 4410 if (expand_ary2) { 4411 vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit); 4412 } else { 4413 vmovdqu(vec2, Address(ary2, result, Address::times_1, -32)); 4414 } 4415 vpxor(vec1, vec2); 4416 4417 vptest(vec1, vec1); 4418 jcc(Assembler::notZero, FALSE_LABEL); 4419 jmp(TRUE_LABEL); 4420 4421 bind(COMPARE_TAIL_16); // limit is zero 4422 movl(limit, result); 4423 4424 // Compare 16-byte chunks 4425 andl(result, 0x0000000f); // tail count (in bytes) 4426 andl(limit, 0xfffffff0); // vector count (in bytes) 4427 jcc(Assembler::zero, COMPARE_TAIL); 4428 4429 lea(ary1, Address(ary1, limit, scaleFactor)); 4430 lea(ary2, Address(ary2, limit, Address::times_1)); 4431 negptr(limit); 4432 4433 bind(COMPARE_WIDE_VECTORS_16); 4434 movdqu(vec1, Address(ary1, limit, scaleFactor)); 4435 if (expand_ary2) { 4436 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit); 4437 } else { 4438 movdqu(vec2, Address(ary2, limit, Address::times_1)); 4439 } 4440 pxor(vec1, vec2); 4441 4442 ptest(vec1, vec1); 4443 jcc(Assembler::notZero, FALSE_LABEL); 4444 addptr(limit, scaleIncr); 4445 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16); 4446 4447 bind(COMPARE_TAIL); // limit is zero 4448 movl(limit, result); 4449 // Fallthru to tail compare 4450 } else if (UseSSE42Intrinsics) { 4451 // With SSE4.2, use double quad vector compare 4452 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 4453 4454 // Compare 16-byte vectors 4455 andl(result, 0x0000000f); // tail count (in bytes) 4456 andl(limit, 0xfffffff0); // vector count (in bytes) 4457 jcc(Assembler::zero, COMPARE_TAIL); 4458 4459 lea(ary1, Address(ary1, limit, Address::times_1)); 4460 lea(ary2, Address(ary2, limit, Address::times_1)); 4461 negptr(limit); 4462 4463 bind(COMPARE_WIDE_VECTORS); 4464 movdqu(vec1, Address(ary1, limit, Address::times_1)); 4465 movdqu(vec2, Address(ary2, limit, Address::times_1)); 4466 pxor(vec1, vec2); 4467 4468 ptest(vec1, vec1); 4469 jcc(Assembler::notZero, FALSE_LABEL); 4470 addptr(limit, 16); 4471 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4472 4473 testl(result, result); 4474 jcc(Assembler::zero, TRUE_LABEL); 4475 4476 movdqu(vec1, Address(ary1, result, Address::times_1, -16)); 4477 movdqu(vec2, Address(ary2, result, Address::times_1, -16)); 4478 pxor(vec1, vec2); 4479 4480 ptest(vec1, vec1); 4481 jccb(Assembler::notZero, FALSE_LABEL); 4482 jmpb(TRUE_LABEL); 4483 4484 bind(COMPARE_TAIL); // limit is zero 4485 
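      // Reload limit with the tail byte count (kept in result) and fall
      // through to the 4-byte / char / byte tail compare below.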
movl(limit, result); 4486 // Fallthru to tail compare 4487 } 4488 4489 // Compare 4-byte vectors 4490 if (expand_ary2) { 4491 testl(result, result); 4492 jccb(Assembler::zero, TRUE_LABEL); 4493 } else { 4494 andl(limit, 0xfffffffc); // vector count (in bytes) 4495 jccb(Assembler::zero, COMPARE_CHAR); 4496 } 4497 4498 lea(ary1, Address(ary1, limit, scaleFactor)); 4499 lea(ary2, Address(ary2, limit, Address::times_1)); 4500 negptr(limit); 4501 4502 bind(COMPARE_VECTORS); 4503 if (expand_ary2) { 4504 // There are no "vector" operations for bytes to shorts 4505 movzbl(chr, Address(ary2, limit, Address::times_1)); 4506 cmpw(Address(ary1, limit, Address::times_2), chr); 4507 jccb(Assembler::notEqual, FALSE_LABEL); 4508 addptr(limit, 1); 4509 jcc(Assembler::notZero, COMPARE_VECTORS); 4510 jmp(TRUE_LABEL); 4511 } else { 4512 movl(chr, Address(ary1, limit, Address::times_1)); 4513 cmpl(chr, Address(ary2, limit, Address::times_1)); 4514 jccb(Assembler::notEqual, FALSE_LABEL); 4515 addptr(limit, 4); 4516 jcc(Assembler::notZero, COMPARE_VECTORS); 4517 } 4518 4519 // Compare trailing char (final 2 bytes), if any 4520 bind(COMPARE_CHAR); 4521 testl(result, 0x2); // tail char 4522 jccb(Assembler::zero, COMPARE_BYTE); 4523 load_unsigned_short(chr, Address(ary1, 0)); 4524 load_unsigned_short(limit, Address(ary2, 0)); 4525 cmpl(chr, limit); 4526 jccb(Assembler::notEqual, FALSE_LABEL); 4527 4528 if (is_array_equ && is_char) { 4529 bind(COMPARE_BYTE); 4530 } else { 4531 lea(ary1, Address(ary1, 2)); 4532 lea(ary2, Address(ary2, 2)); 4533 4534 bind(COMPARE_BYTE); 4535 testl(result, 0x1); // tail byte 4536 jccb(Assembler::zero, TRUE_LABEL); 4537 load_unsigned_byte(chr, Address(ary1, 0)); 4538 load_unsigned_byte(limit, Address(ary2, 0)); 4539 cmpl(chr, limit); 4540 jccb(Assembler::notEqual, FALSE_LABEL); 4541 } 4542 bind(TRUE_LABEL); 4543 movl(result, 1); // return true 4544 jmpb(DONE); 4545 4546 bind(FALSE_LABEL); 4547 xorl(result, result); // return false 4548 4549 // That's it 4550 bind(DONE); 4551 if (UseAVX >= 2) { 4552 // clean upper bits of YMM registers 4553 vpxor(vec1, vec1); 4554 vpxor(vec2, vec2); 4555 } 4556 } 4557 4558 #ifdef _LP64 4559 4560 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) { 4561 #define __ masm. 
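  // Out-of-line fix-up path: the original floating-point value is spilled to
  // the stack, the matching fix-up stub is called, and the value it leaves in
  // that stack slot is popped into dst as the corrected result.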
4562 Register dst = stub.data<0>(); 4563 XMMRegister src = stub.data<1>(); 4564 address target = stub.data<2>(); 4565 __ bind(stub.entry()); 4566 __ subptr(rsp, 8); 4567 __ movdbl(Address(rsp), src); 4568 __ call(RuntimeAddress(target)); 4569 __ pop(dst); 4570 __ jmp(stub.continuation()); 4571 #undef __ 4572 } 4573 4574 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) { 4575 assert(dst_bt == T_INT || dst_bt == T_LONG, ""); 4576 assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, ""); 4577 4578 address slowpath_target; 4579 if (dst_bt == T_INT) { 4580 if (src_bt == T_FLOAT) { 4581 cvttss2sil(dst, src); 4582 cmpl(dst, 0x80000000); 4583 slowpath_target = StubRoutines::x86::f2i_fixup(); 4584 } else { 4585 cvttsd2sil(dst, src); 4586 cmpl(dst, 0x80000000); 4587 slowpath_target = StubRoutines::x86::d2i_fixup(); 4588 } 4589 } else { 4590 if (src_bt == T_FLOAT) { 4591 cvttss2siq(dst, src); 4592 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4593 slowpath_target = StubRoutines::x86::f2l_fixup(); 4594 } else { 4595 cvttsd2siq(dst, src); 4596 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4597 slowpath_target = StubRoutines::x86::d2l_fixup(); 4598 } 4599 } 4600 4601 auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, 23, convertF2I_slowpath); 4602 jcc(Assembler::equal, stub->entry()); 4603 bind(stub->continuation()); 4604 } 4605 4606 #endif // _LP64 4607 4608 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4609 XMMRegister src1, int imm8, bool merge, int vlen_enc) { 4610 switch(ideal_opc) { 4611 case Op_LShiftVS: 4612 Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break; 4613 case Op_LShiftVI: 4614 Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break; 4615 case Op_LShiftVL: 4616 Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break; 4617 case Op_RShiftVS: 4618 Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break; 4619 case Op_RShiftVI: 4620 Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break; 4621 case Op_RShiftVL: 4622 Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break; 4623 case Op_URShiftVS: 4624 Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break; 4625 case Op_URShiftVI: 4626 Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break; 4627 case Op_URShiftVL: 4628 Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break; 4629 case Op_RotateRightV: 4630 evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4631 case Op_RotateLeftV: 4632 evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4633 default: 4634 fatal("Unsupported masked operation"); break; 4635 } 4636 } 4637 4638 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4639 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc, 4640 bool is_varshift) { 4641 switch (ideal_opc) { 4642 case Op_AddVB: 4643 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4644 case Op_AddVS: 4645 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4646 case Op_AddVI: 4647 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4648 case Op_AddVL: 4649 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4650 case Op_AddVF: 4651 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4652 case Op_AddVD: 4653 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4654 case Op_SubVB: 4655 
evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4656 case Op_SubVS: 4657 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4658 case Op_SubVI: 4659 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4660 case Op_SubVL: 4661 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4662 case Op_SubVF: 4663 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4664 case Op_SubVD: 4665 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4666 case Op_MulVS: 4667 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4668 case Op_MulVI: 4669 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4670 case Op_MulVL: 4671 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4672 case Op_MulVF: 4673 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4674 case Op_MulVD: 4675 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4676 case Op_DivVF: 4677 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4678 case Op_DivVD: 4679 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4680 case Op_SqrtVF: 4681 evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break; 4682 case Op_SqrtVD: 4683 evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break; 4684 case Op_AbsVB: 4685 evpabsb(dst, mask, src2, merge, vlen_enc); break; 4686 case Op_AbsVS: 4687 evpabsw(dst, mask, src2, merge, vlen_enc); break; 4688 case Op_AbsVI: 4689 evpabsd(dst, mask, src2, merge, vlen_enc); break; 4690 case Op_AbsVL: 4691 evpabsq(dst, mask, src2, merge, vlen_enc); break; 4692 case Op_FmaVF: 4693 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4694 case Op_FmaVD: 4695 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4696 case Op_VectorRearrange: 4697 evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break; 4698 case Op_LShiftVS: 4699 evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4700 case Op_LShiftVI: 4701 evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4702 case Op_LShiftVL: 4703 evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4704 case Op_RShiftVS: 4705 evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4706 case Op_RShiftVI: 4707 evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4708 case Op_RShiftVL: 4709 evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4710 case Op_URShiftVS: 4711 evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4712 case Op_URShiftVI: 4713 evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4714 case Op_URShiftVL: 4715 evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4716 case Op_RotateLeftV: 4717 evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4718 case Op_RotateRightV: 4719 evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4720 case Op_MaxV: 4721 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4722 case Op_MinV: 4723 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4724 case Op_XorV: 4725 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4726 case Op_OrV: 4727 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4728 case Op_AndV: 4729 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4730 default: 4731 fatal("Unsupported masked operation"); break; 4732 } 4733 } 4734 4735 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4736 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4737 switch (ideal_opc) { 4738 case Op_AddVB: 
4739 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4740 case Op_AddVS: 4741 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4742 case Op_AddVI: 4743 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4744 case Op_AddVL: 4745 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4746 case Op_AddVF: 4747 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4748 case Op_AddVD: 4749 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4750 case Op_SubVB: 4751 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4752 case Op_SubVS: 4753 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4754 case Op_SubVI: 4755 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4756 case Op_SubVL: 4757 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4758 case Op_SubVF: 4759 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4760 case Op_SubVD: 4761 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4762 case Op_MulVS: 4763 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4764 case Op_MulVI: 4765 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4766 case Op_MulVL: 4767 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4768 case Op_MulVF: 4769 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4770 case Op_MulVD: 4771 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4772 case Op_DivVF: 4773 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4774 case Op_DivVD: 4775 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4776 case Op_FmaVF: 4777 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4778 case Op_FmaVD: 4779 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4780 case Op_MaxV: 4781 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4782 case Op_MinV: 4783 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4784 case Op_XorV: 4785 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4786 case Op_OrV: 4787 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4788 case Op_AndV: 4789 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4790 default: 4791 fatal("Unsupported masked operation"); break; 4792 } 4793 } 4794 4795 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst, 4796 KRegister src1, KRegister src2) { 4797 BasicType etype = T_ILLEGAL; 4798 switch(mask_len) { 4799 case 2: 4800 case 4: 4801 case 8: etype = T_BYTE; break; 4802 case 16: etype = T_SHORT; break; 4803 case 32: etype = T_INT; break; 4804 case 64: etype = T_LONG; break; 4805 default: fatal("Unsupported type"); break; 4806 } 4807 assert(etype != T_ILLEGAL, ""); 4808 switch(ideal_opc) { 4809 case Op_AndVMask: 4810 kand(etype, dst, src1, src2); break; 4811 case Op_OrVMask: 4812 kor(etype, dst, src1, src2); break; 4813 case Op_XorVMask: 4814 kxor(etype, dst, src1, src2); break; 4815 default: 4816 fatal("Unsupported masked operation"); break; 4817 } 4818 } 4819 4820 /* 4821 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 4822 * If src is NaN, the result is 0. 4823 * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE, 4824 * the result is equal to the value of Integer.MIN_VALUE. 4825 * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE, 4826 * the result is equal to the value of Integer.MAX_VALUE. 
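 *
 * For illustration, the same rules as a scalar sketch (helper name assumed, using the
 * jint/jfloat typedefs and the min_jint/max_jint constants; not part of the generated code):
 *
 *   jint scalar_f2i(jfloat f) {
 *     if (f != f)                return 0;         // NaN
 *     if (f <= (jfloat)min_jint) return min_jint;  // -Inf or too small
 *     if (f >= (jfloat)max_jint) return max_jint;  // +Inf or too large
 *     return (jint)f;                              // ordinary truncating cast
 *   }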
4827 */ 4828 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4829 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 4830 Register rscratch, AddressLiteral float_sign_flip, 4831 int vec_enc) { 4832 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4833 Label done; 4834 vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch); 4835 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 4836 vptest(xtmp2, xtmp2, vec_enc); 4837 jccb(Assembler::equal, done); 4838 4839 vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc); 4840 vpxor(xtmp1, xtmp1, xtmp4, vec_enc); 4841 4842 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 4843 vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc); 4844 vblendvps(dst, dst, xtmp4, xtmp3, vec_enc); 4845 4846 // Recompute the mask for remaining special value. 4847 vpxor(xtmp2, xtmp2, xtmp3, vec_enc); 4848 // Extract SRC values corresponding to TRUE mask lanes. 4849 vpand(xtmp4, xtmp2, src, vec_enc); 4850 // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special 4851 // values are set. 4852 vpxor(xtmp3, xtmp2, xtmp4, vec_enc); 4853 4854 vblendvps(dst, dst, xtmp1, xtmp3, vec_enc); 4855 bind(done); 4856 } 4857 4858 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4859 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4860 Register rscratch, AddressLiteral float_sign_flip, 4861 int vec_enc) { 4862 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4863 Label done; 4864 evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 4865 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 4866 kortestwl(ktmp1, ktmp1); 4867 jccb(Assembler::equal, done); 4868 4869 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4870 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4871 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 4872 4873 kxorwl(ktmp1, ktmp1, ktmp2); 4874 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4875 vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4876 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 4877 bind(done); 4878 } 4879 4880 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4881 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4882 Register rscratch, AddressLiteral double_sign_flip, 4883 int vec_enc) { 4884 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 4885 4886 Label done; 4887 evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 4888 Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc); 4889 kortestwl(ktmp1, ktmp1); 4890 jccb(Assembler::equal, done); 4891 4892 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4893 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4894 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 4895 4896 kxorwl(ktmp1, ktmp1, ktmp2); 4897 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4898 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4899 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); 4900 bind(done); 4901 } 4902 4903 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4904 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4905 Register rscratch, AddressLiteral float_sign_flip, 4906 int vec_enc) { 4907 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4908 Label done; 4909 evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, 
rscratch); 4910 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 4911 kortestwl(ktmp1, ktmp1); 4912 jccb(Assembler::equal, done); 4913 4914 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4915 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4916 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 4917 4918 kxorwl(ktmp1, ktmp1, ktmp2); 4919 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4920 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4921 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 4922 bind(done); 4923 } 4924 4925 /* 4926 * Following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation. 4927 * If src is NaN, the result is 0. 4928 * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE, 4929 * the result is equal to the value of Long.MIN_VALUE. 4930 * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE, 4931 * the result is equal to the value of Long.MAX_VALUE. 4932 */ 4933 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4934 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4935 Register rscratch, AddressLiteral double_sign_flip, 4936 int vec_enc) { 4937 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 4938 4939 Label done; 4940 evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 4941 evpcmpeqq(ktmp1, xtmp1, dst, vec_enc); 4942 kortestwl(ktmp1, ktmp1); 4943 jccb(Assembler::equal, done); 4944 4945 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4946 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4947 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 4948 4949 kxorwl(ktmp1, ktmp1, ktmp2); 4950 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4951 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4952 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); 4953 bind(done); 4954 } 4955 4956 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero, 4957 XMMRegister xtmp, int index, int vec_enc) { 4958 assert(vec_enc < Assembler::AVX_512bit, ""); 4959 if (vec_enc == Assembler::AVX_256bit) { 4960 vextractf128_high(xtmp, src); 4961 vshufps(dst, src, xtmp, index, vec_enc); 4962 } else { 4963 vshufps(dst, src, zero, index, vec_enc); 4964 } 4965 } 4966 4967 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 4968 XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch, 4969 AddressLiteral float_sign_flip, int src_vec_enc) { 4970 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4971 4972 Label done; 4973 // Compare the destination lanes with the float_sign_flip 4974 // value to get a mask of all special values. 4975 movdqu(xtmp1, float_sign_flip, rscratch); 4976 vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit); 4977 ptest(xtmp2, xtmp2); 4978 jccb(Assembler::equal, done); 4979 4980 // Flip float_sign_flip to get the max integer value. 4981 vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit); 4982 pxor(xtmp1, xtmp4); 4983 4984 // Set destination lanes corresponding to unordered source lanes to zero. 4985 vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc); 4986 vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc); 4987 4988 // Shuffle the mask vector and pack the lower doubleword from each quadword lane.
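  // For illustration: selector 0x88 picks doublewords {0, 2} of each input per 128-bit
  // lane, so after the high 128 bits are extracted into xtmp5 the low half of the packed
  // result carries one doubleword of mask per original quadword lane.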
4989 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc); 4990 vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit); 4991 4992 // Recompute the mask for remaining special value. 4993 pxor(xtmp2, xtmp3); 4994 // Extract mask corresponding to non-negative source lanes. 4995 vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc); 4996 4997 // Shuffle mask vector and pack lower doubles word from each quadword lane. 4998 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc); 4999 pand(xtmp3, xtmp2); 5000 5001 // Replace destination lanes holding special value(0x80000000) with max int 5002 // if corresponding source lane holds a +ve value. 5003 vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit); 5004 bind(done); 5005 } 5006 5007 5008 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero, 5009 XMMRegister xtmp, Register rscratch, int vec_enc) { 5010 switch(to_elem_bt) { 5011 case T_SHORT: 5012 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing"); 5013 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch); 5014 vpackusdw(dst, dst, zero, vec_enc); 5015 if (vec_enc == Assembler::AVX_256bit) { 5016 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc); 5017 } 5018 break; 5019 case T_BYTE: 5020 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing"); 5021 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch); 5022 vpackusdw(dst, dst, zero, vec_enc); 5023 if (vec_enc == Assembler::AVX_256bit) { 5024 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc); 5025 } 5026 vpackuswb(dst, dst, zero, vec_enc); 5027 break; 5028 default: assert(false, "%s", type2name(to_elem_bt)); 5029 } 5030 } 5031 5032 /* 5033 * Algorithm for vector D2L and F2I conversions:- 5034 * a) Perform vector D2L/F2I cast. 5035 * b) Choose fast path if none of the result vector lane contains 0x80000000 value. 5036 * It signifies that source value could be any of the special floating point 5037 * values(NaN,-Inf,Inf,Max,-Min). 5038 * c) Set destination to zero if source is NaN value. 5039 * d) Replace 0x80000000 with MaxInt if source lane contains a +ve value. 
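 *
 * For illustration, a scalar sketch of the F2I fix-up (helper name assumed, not part of
 * the generated code; D2L is analogous with min_jlong/max_jlong):
 *
 *   jint fixup_f2i(jfloat f, jint raw) {          // raw = lane produced by CVTTPS2DQ
 *     if (raw != min_jint) return raw;            // step b: lane holds no special value
 *     if (f != f)          return 0;              // step c: NaN
 *     return (f >= 0.0f) ? max_jint : min_jint;   // step d: +ve -> MaxInt, else MinInt
 *   }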
5040 */ 5041 5042 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5043 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 5044 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 5045 int to_elem_sz = type2aelembytes(to_elem_bt); 5046 assert(to_elem_sz <= 4, ""); 5047 vcvttps2dq(dst, src, vec_enc); 5048 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc); 5049 if (to_elem_sz < 4) { 5050 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 5051 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc); 5052 } 5053 } 5054 5055 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5056 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip, 5057 Register rscratch, int vec_enc) { 5058 int to_elem_sz = type2aelembytes(to_elem_bt); 5059 assert(to_elem_sz <= 4, ""); 5060 vcvttps2dq(dst, src, vec_enc); 5061 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc); 5062 switch(to_elem_bt) { 5063 case T_INT: 5064 break; 5065 case T_SHORT: 5066 evpmovdw(dst, dst, vec_enc); 5067 break; 5068 case T_BYTE: 5069 evpmovdb(dst, dst, vec_enc); 5070 break; 5071 default: assert(false, "%s", type2name(to_elem_bt)); 5072 } 5073 } 5074 5075 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 5076 KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip, 5077 Register rscratch, int vec_enc) { 5078 evcvttps2qq(dst, src, vec_enc); 5079 vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc); 5080 } 5081 5082 // Handling for downcasting from double to integer or sub-word types on AVX2. 5083 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5084 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, 5085 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 5086 int to_elem_sz = type2aelembytes(to_elem_bt); 5087 assert(to_elem_sz < 8, ""); 5088 vcvttpd2dq(dst, src, vec_enc); 5089 vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch, 5090 float_sign_flip, vec_enc); 5091 if (to_elem_sz < 4) { 5092 // xtmp4 holds all zero lanes. 
5093 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit); 5094 } 5095 } 5096 5097 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, 5098 XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, 5099 KRegister ktmp2, AddressLiteral sign_flip, 5100 Register rscratch, int vec_enc) { 5101 if (VM_Version::supports_avx512dq()) { 5102 evcvttpd2qq(dst, src, vec_enc); 5103 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 5104 switch(to_elem_bt) { 5105 case T_LONG: 5106 break; 5107 case T_INT: 5108 evpmovsqd(dst, dst, vec_enc); 5109 break; 5110 case T_SHORT: 5111 evpmovsqd(dst, dst, vec_enc); 5112 evpmovdw(dst, dst, vec_enc); 5113 break; 5114 case T_BYTE: 5115 evpmovsqd(dst, dst, vec_enc); 5116 evpmovdb(dst, dst, vec_enc); 5117 break; 5118 default: assert(false, "%s", type2name(to_elem_bt)); 5119 } 5120 } else { 5121 assert(type2aelembytes(to_elem_bt) <= 4, ""); 5122 vcvttpd2dq(dst, src, vec_enc); 5123 vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 5124 switch(to_elem_bt) { 5125 case T_INT: 5126 break; 5127 case T_SHORT: 5128 evpmovdw(dst, dst, vec_enc); 5129 break; 5130 case T_BYTE: 5131 evpmovdb(dst, dst, vec_enc); 5132 break; 5133 default: assert(false, "%s", type2name(to_elem_bt)); 5134 } 5135 } 5136 } 5137 5138 #ifdef _LP64 5139 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src, 5140 AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5141 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 5142 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5143 // and re-instantiate original MXCSR.RC mode after that. 5144 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5145 5146 mov64(tmp, julong_cast(0.5L)); 5147 evpbroadcastq(xtmp1, tmp, vec_enc); 5148 vaddpd(xtmp1, src , xtmp1, vec_enc); 5149 evcvtpd2qq(dst, xtmp1, vec_enc); 5150 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 5151 double_sign_flip, vec_enc);; 5152 5153 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5154 } 5155 5156 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src, 5157 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5158 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 5159 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5160 // and re-instantiate original MXCSR.RC mode after that. 
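  // For illustration: with MXCSR.RC forced to round-towards -inf the conversion acts as
  // floor(), so floor(x + 0.5) yields round-half-up semantics, e.g.
  // floor(2.5 + 0.5) = 3 and floor(-2.5 + 0.5) = -2.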
5161 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5162 5163 movl(tmp, jint_cast(0.5)); 5164 movq(xtmp1, tmp); 5165 vbroadcastss(xtmp1, xtmp1, vec_enc); 5166 vaddps(xtmp1, src , xtmp1, vec_enc); 5167 vcvtps2dq(dst, xtmp1, vec_enc); 5168 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 5169 float_sign_flip, vec_enc); 5170 5171 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5172 } 5173 5174 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src, 5175 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5176 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) { 5177 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5178 // and re-instantiate original MXCSR.RC mode after that. 5179 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5180 5181 movl(tmp, jint_cast(0.5)); 5182 movq(xtmp1, tmp); 5183 vbroadcastss(xtmp1, xtmp1, vec_enc); 5184 vaddps(xtmp1, src , xtmp1, vec_enc); 5185 vcvtps2dq(dst, xtmp1, vec_enc); 5186 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc); 5187 5188 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5189 } 5190 #endif // _LP64 5191 5192 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 5193 BasicType from_elem_bt, BasicType to_elem_bt) { 5194 switch (from_elem_bt) { 5195 case T_BYTE: 5196 switch (to_elem_bt) { 5197 case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break; 5198 case T_INT: vpmovzxbd(dst, src, vlen_enc); break; 5199 case T_LONG: vpmovzxbq(dst, src, vlen_enc); break; 5200 default: ShouldNotReachHere(); 5201 } 5202 break; 5203 case T_SHORT: 5204 switch (to_elem_bt) { 5205 case T_INT: vpmovzxwd(dst, src, vlen_enc); break; 5206 case T_LONG: vpmovzxwq(dst, src, vlen_enc); break; 5207 default: ShouldNotReachHere(); 5208 } 5209 break; 5210 case T_INT: 5211 assert(to_elem_bt == T_LONG, ""); 5212 vpmovzxdq(dst, src, vlen_enc); 5213 break; 5214 default: 5215 ShouldNotReachHere(); 5216 } 5217 } 5218 5219 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 5220 BasicType from_elem_bt, BasicType to_elem_bt) { 5221 switch (from_elem_bt) { 5222 case T_BYTE: 5223 switch (to_elem_bt) { 5224 case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break; 5225 case T_INT: vpmovsxbd(dst, src, vlen_enc); break; 5226 case T_LONG: vpmovsxbq(dst, src, vlen_enc); break; 5227 default: ShouldNotReachHere(); 5228 } 5229 break; 5230 case T_SHORT: 5231 switch (to_elem_bt) { 5232 case T_INT: vpmovsxwd(dst, src, vlen_enc); break; 5233 case T_LONG: vpmovsxwq(dst, src, vlen_enc); break; 5234 default: ShouldNotReachHere(); 5235 } 5236 break; 5237 case T_INT: 5238 assert(to_elem_bt == T_LONG, ""); 5239 vpmovsxdq(dst, src, vlen_enc); 5240 break; 5241 default: 5242 ShouldNotReachHere(); 5243 } 5244 } 5245 5246 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src, 5247 BasicType dst_bt, BasicType src_bt, int vlen) { 5248 int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen); 5249 assert(vlen_enc != AVX_512bit, ""); 5250 5251 int dst_bt_size = type2aelembytes(dst_bt); 5252 int src_bt_size = type2aelembytes(src_bt); 5253 if (dst_bt_size > src_bt_size) { 5254 switch (dst_bt_size / src_bt_size) { 5255 case 2: vpmovsxbw(dst, src, vlen_enc); break; 5256 case 4: vpmovsxbd(dst, src, 
vlen_enc); break; 5257 case 8: vpmovsxbq(dst, src, vlen_enc); break; 5258 default: ShouldNotReachHere(); 5259 } 5260 } else { 5261 assert(dst_bt_size < src_bt_size, ""); 5262 switch (src_bt_size / dst_bt_size) { 5263 case 2: { 5264 if (vlen_enc == AVX_128bit) { 5265 vpacksswb(dst, src, src, vlen_enc); 5266 } else { 5267 vpacksswb(dst, src, src, vlen_enc); 5268 vpermq(dst, dst, 0x08, vlen_enc); 5269 } 5270 break; 5271 } 5272 case 4: { 5273 if (vlen_enc == AVX_128bit) { 5274 vpackssdw(dst, src, src, vlen_enc); 5275 vpacksswb(dst, dst, dst, vlen_enc); 5276 } else { 5277 vpackssdw(dst, src, src, vlen_enc); 5278 vpermq(dst, dst, 0x08, vlen_enc); 5279 vpacksswb(dst, dst, dst, AVX_128bit); 5280 } 5281 break; 5282 } 5283 case 8: { 5284 if (vlen_enc == AVX_128bit) { 5285 vpshufd(dst, src, 0x08, vlen_enc); 5286 vpackssdw(dst, dst, dst, vlen_enc); 5287 vpacksswb(dst, dst, dst, vlen_enc); 5288 } else { 5289 vpshufd(dst, src, 0x08, vlen_enc); 5290 vpermq(dst, dst, 0x08, vlen_enc); 5291 vpackssdw(dst, dst, dst, AVX_128bit); 5292 vpacksswb(dst, dst, dst, AVX_128bit); 5293 } 5294 break; 5295 } 5296 default: ShouldNotReachHere(); 5297 } 5298 } 5299 } 5300 5301 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3, 5302 bool merge, BasicType bt, int vlen_enc) { 5303 if (bt == T_INT) { 5304 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5305 } else { 5306 assert(bt == T_LONG, ""); 5307 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5308 } 5309 } 5310 5311 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3, 5312 bool merge, BasicType bt, int vlen_enc) { 5313 if (bt == T_INT) { 5314 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5315 } else { 5316 assert(bt == T_LONG, ""); 5317 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5318 } 5319 } 5320 5321 #ifdef _LP64 5322 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1, 5323 Register rtmp2, XMMRegister xtmp, int mask_len, 5324 int vec_enc) { 5325 int index = 0; 5326 int vindex = 0; 5327 mov64(rtmp1, 0x0101010101010101L); 5328 pdepq(rtmp1, src, rtmp1); 5329 if (mask_len > 8) { 5330 movq(rtmp2, src); 5331 vpxor(xtmp, xtmp, xtmp, vec_enc); 5332 movq(xtmp, rtmp1); 5333 } 5334 movq(dst, rtmp1); 5335 5336 mask_len -= 8; 5337 while (mask_len > 0) { 5338 assert ((mask_len & 0x7) == 0, "mask must be multiple of 8"); 5339 index++; 5340 if ((index % 2) == 0) { 5341 pxor(xtmp, xtmp); 5342 } 5343 mov64(rtmp1, 0x0101010101010101L); 5344 shrq(rtmp2, 8); 5345 pdepq(rtmp1, rtmp2, rtmp1); 5346 pinsrq(xtmp, rtmp1, index % 2); 5347 vindex = index / 2; 5348 if (vindex) { 5349 // Write entire 16 byte vector when both 64 bit 5350 // lanes are update to save redundant instructions. 
5351 if (index % 2) { 5352 vinsertf128(dst, dst, xtmp, vindex); 5353 } 5354 } else { 5355 vmovdqu(dst, xtmp); 5356 } 5357 mask_len -= 8; 5358 } 5359 } 5360 5361 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) { 5362 switch(opc) { 5363 case Op_VectorMaskTrueCount: 5364 popcntq(dst, tmp); 5365 break; 5366 case Op_VectorMaskLastTrue: 5367 if (VM_Version::supports_lzcnt()) { 5368 lzcntq(tmp, tmp); 5369 movl(dst, 63); 5370 subl(dst, tmp); 5371 } else { 5372 movl(dst, -1); 5373 bsrq(tmp, tmp); 5374 cmov32(Assembler::notZero, dst, tmp); 5375 } 5376 break; 5377 case Op_VectorMaskFirstTrue: 5378 if (VM_Version::supports_bmi1()) { 5379 if (masklen < 32) { 5380 orl(tmp, 1 << masklen); 5381 tzcntl(dst, tmp); 5382 } else if (masklen == 32) { 5383 tzcntl(dst, tmp); 5384 } else { 5385 assert(masklen == 64, ""); 5386 tzcntq(dst, tmp); 5387 } 5388 } else { 5389 if (masklen < 32) { 5390 orl(tmp, 1 << masklen); 5391 bsfl(dst, tmp); 5392 } else { 5393 assert(masklen == 32 || masklen == 64, ""); 5394 movl(dst, masklen); 5395 if (masklen == 32) { 5396 bsfl(tmp, tmp); 5397 } else { 5398 bsfq(tmp, tmp); 5399 } 5400 cmov32(Assembler::notZero, dst, tmp); 5401 } 5402 } 5403 break; 5404 case Op_VectorMaskToLong: 5405 assert(dst == tmp, "Dst and tmp should be the same for toLong operations"); 5406 break; 5407 default: assert(false, "Unhandled mask operation"); 5408 } 5409 } 5410 5411 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp, 5412 int masklen, int masksize, int vec_enc) { 5413 assert(VM_Version::supports_popcnt(), ""); 5414 5415 if(VM_Version::supports_avx512bw()) { 5416 kmovql(tmp, mask); 5417 } else { 5418 assert(masklen <= 16, ""); 5419 kmovwl(tmp, mask); 5420 } 5421 5422 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5423 // operations needs to be clipped. 5424 if (masksize < 16 && opc != Op_VectorMaskFirstTrue) { 5425 andq(tmp, (1 << masklen) - 1); 5426 } 5427 5428 vector_mask_operation_helper(opc, dst, tmp, masklen); 5429 } 5430 5431 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp, 5432 Register tmp, int masklen, BasicType bt, int vec_enc) { 5433 assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) || 5434 (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), ""); 5435 assert(VM_Version::supports_popcnt(), ""); 5436 5437 bool need_clip = false; 5438 switch(bt) { 5439 case T_BOOLEAN: 5440 // While masks of other types contain 0, -1; boolean masks contain lane values of 0, 1 5441 vpxor(xtmp, xtmp, xtmp, vec_enc); 5442 vpsubb(xtmp, xtmp, mask, vec_enc); 5443 vpmovmskb(tmp, xtmp, vec_enc); 5444 need_clip = masklen < 16; 5445 break; 5446 case T_BYTE: 5447 vpmovmskb(tmp, mask, vec_enc); 5448 need_clip = masklen < 16; 5449 break; 5450 case T_SHORT: 5451 vpacksswb(xtmp, mask, mask, vec_enc); 5452 if (masklen >= 16) { 5453 vpermpd(xtmp, xtmp, 8, vec_enc); 5454 } 5455 vpmovmskb(tmp, xtmp, Assembler::AVX_128bit); 5456 need_clip = masklen < 16; 5457 break; 5458 case T_INT: 5459 case T_FLOAT: 5460 vmovmskps(tmp, mask, vec_enc); 5461 need_clip = masklen < 4; 5462 break; 5463 case T_LONG: 5464 case T_DOUBLE: 5465 vmovmskpd(tmp, mask, vec_enc); 5466 need_clip = masklen < 2; 5467 break; 5468 default: assert(false, "Unhandled type, %s", type2name(bt)); 5469 } 5470 5471 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5472 // operations needs to be clipped. 
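  // For illustration: a 2-lane T_INT mask still occupies a full 128-bit register, so
  // vmovmskps reports all 4 lanes and bits 2-3 of tmp may reflect unused lanes; the
  // (1 << masklen) - 1 clip below discards them.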
5473 if (need_clip && opc != Op_VectorMaskFirstTrue) { 5474 // need_clip implies masklen < 32 5475 andq(tmp, (1 << masklen) - 1); 5476 } 5477 5478 vector_mask_operation_helper(opc, dst, tmp, masklen); 5479 } 5480 5481 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1, 5482 Register rtmp2, int mask_len) { 5483 kmov(rtmp1, src); 5484 andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len))); 5485 mov64(rtmp2, -1L); 5486 pextq(rtmp2, rtmp2, rtmp1); 5487 kmov(dst, rtmp2); 5488 } 5489 5490 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src, 5491 XMMRegister mask, Register rtmp, Register rscratch, 5492 XMMRegister permv, XMMRegister xtmp, BasicType bt, 5493 int vec_enc) { 5494 assert(type2aelembytes(bt) >= 4, ""); 5495 assert(opcode == Op_CompressV || opcode == Op_ExpandV, ""); 5496 address compress_perm_table = nullptr; 5497 address expand_perm_table = nullptr; 5498 if (type2aelembytes(bt) == 8) { 5499 compress_perm_table = StubRoutines::x86::compress_perm_table64(); 5500 expand_perm_table = StubRoutines::x86::expand_perm_table64(); 5501 vmovmskpd(rtmp, mask, vec_enc); 5502 } else { 5503 compress_perm_table = StubRoutines::x86::compress_perm_table32(); 5504 expand_perm_table = StubRoutines::x86::expand_perm_table32(); 5505 vmovmskps(rtmp, mask, vec_enc); 5506 } 5507 shlq(rtmp, 5); // for 32 byte permute row. 5508 if (opcode == Op_CompressV) { 5509 lea(rscratch, ExternalAddress(compress_perm_table)); 5510 } else { 5511 lea(rscratch, ExternalAddress(expand_perm_table)); 5512 } 5513 addptr(rtmp, rscratch); 5514 vmovdqu(permv, Address(rtmp)); 5515 vpermps(dst, permv, src, Assembler::AVX_256bit); 5516 vpxor(xtmp, xtmp, xtmp, vec_enc); 5517 // Blend the result with zero vector using permute mask, each column entry 5518 // in a permute table row contains either a valid permute index or a -1 (default) 5519 // value, this can potentially be used as a blending mask after 5520 // compressing/expanding the source vector lanes. 
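  // For illustration (table contents assumed): for an 8 x 32-bit compress with mask
  // 0b00100101 the permute row would be { 0, 2, 5, -1, -1, -1, -1, -1 }; the leading
  // indices gather the selected lanes, while the -1 entries (sign bit set) make the
  // blend below select the zero vector for the padding lanes.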
5521 vblendvps(dst, dst, xtmp, permv, vec_enc, false, permv); 5522 } 5523 5524 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask, 5525 bool merge, BasicType bt, int vec_enc) { 5526 if (opcode == Op_CompressV) { 5527 switch(bt) { 5528 case T_BYTE: 5529 evpcompressb(dst, mask, src, merge, vec_enc); 5530 break; 5531 case T_CHAR: 5532 case T_SHORT: 5533 evpcompressw(dst, mask, src, merge, vec_enc); 5534 break; 5535 case T_INT: 5536 evpcompressd(dst, mask, src, merge, vec_enc); 5537 break; 5538 case T_FLOAT: 5539 evcompressps(dst, mask, src, merge, vec_enc); 5540 break; 5541 case T_LONG: 5542 evpcompressq(dst, mask, src, merge, vec_enc); 5543 break; 5544 case T_DOUBLE: 5545 evcompresspd(dst, mask, src, merge, vec_enc); 5546 break; 5547 default: 5548 fatal("Unsupported type %s", type2name(bt)); 5549 break; 5550 } 5551 } else { 5552 assert(opcode == Op_ExpandV, ""); 5553 switch(bt) { 5554 case T_BYTE: 5555 evpexpandb(dst, mask, src, merge, vec_enc); 5556 break; 5557 case T_CHAR: 5558 case T_SHORT: 5559 evpexpandw(dst, mask, src, merge, vec_enc); 5560 break; 5561 case T_INT: 5562 evpexpandd(dst, mask, src, merge, vec_enc); 5563 break; 5564 case T_FLOAT: 5565 evexpandps(dst, mask, src, merge, vec_enc); 5566 break; 5567 case T_LONG: 5568 evpexpandq(dst, mask, src, merge, vec_enc); 5569 break; 5570 case T_DOUBLE: 5571 evexpandpd(dst, mask, src, merge, vec_enc); 5572 break; 5573 default: 5574 fatal("Unsupported type %s", type2name(bt)); 5575 break; 5576 } 5577 } 5578 } 5579 #endif 5580 5581 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5582 KRegister ktmp1, int vec_enc) { 5583 if (opcode == Op_SignumVD) { 5584 vsubpd(dst, zero, one, vec_enc); 5585 // if src < 0 ? -1 : 1 5586 evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5587 evblendmpd(dst, ktmp1, one, dst, true, vec_enc); 5588 // if src == NaN, -0.0 or 0.0 return src. 5589 evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5590 evblendmpd(dst, ktmp1, dst, src, true, vec_enc); 5591 } else { 5592 assert(opcode == Op_SignumVF, ""); 5593 vsubps(dst, zero, one, vec_enc); 5594 // if src < 0 ? -1 : 1 5595 evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5596 evblendmps(dst, ktmp1, one, dst, true, vec_enc); 5597 // if src == NaN, -0.0 or 0.0 return src. 5598 evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5599 evblendmps(dst, ktmp1, dst, src, true, vec_enc); 5600 } 5601 } 5602 5603 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5604 XMMRegister xtmp1, int vec_enc) { 5605 if (opcode == Op_SignumVD) { 5606 vsubpd(dst, zero, one, vec_enc); 5607 // if src < 0 ? -1 : 1 5608 vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1); 5609 // if src == NaN, -0.0 or 0.0 return src. 5610 vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc); 5611 vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1); 5612 } else { 5613 assert(opcode == Op_SignumVF, ""); 5614 vsubps(dst, zero, one, vec_enc); 5615 // if src < 0 ? -1 : 1 5616 vblendvps(dst, one, dst, src, vec_enc, true, xtmp1); 5617 // if src == NaN, -0.0 or 0.0 return src. 
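  // For illustration, this matches Math.signum semantics: signum(7.5f) = 1.0f,
  // signum(-3.0f) = -1.0f, signum(-0.0f) = -0.0f and signum(NaN) = NaN.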
5618 vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc); 5619 vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1); 5620 } 5621 } 5622 5623 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) { 5624 if (VM_Version::supports_avx512bw()) { 5625 if (mask_len > 32) { 5626 kmovql(dst, src); 5627 } else { 5628 kmovdl(dst, src); 5629 if (mask_len != 32) { 5630 kshiftrdl(dst, dst, 32 - mask_len); 5631 } 5632 } 5633 } else { 5634 assert(mask_len <= 16, ""); 5635 kmovwl(dst, src); 5636 if (mask_len != 16) { 5637 kshiftrwl(dst, dst, 16 - mask_len); 5638 } 5639 } 5640 } 5641 5642 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) { 5643 int lane_size = type2aelembytes(bt); 5644 bool is_LP64 = LP64_ONLY(true) NOT_LP64(false); 5645 if ((is_LP64 || lane_size < 8) && 5646 ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) || 5647 (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) { 5648 movptr(rtmp, imm32); 5649 switch(lane_size) { 5650 case 1 : evpbroadcastb(dst, rtmp, vec_enc); break; 5651 case 2 : evpbroadcastw(dst, rtmp, vec_enc); break; 5652 case 4 : evpbroadcastd(dst, rtmp, vec_enc); break; 5653 case 8 : evpbroadcastq(dst, rtmp, vec_enc); break; 5654 default : fatal("Unsupported lane size %d", lane_size); 5655 break; 5656 } 5657 } else { 5658 movptr(rtmp, imm32); 5659 LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp)); 5660 switch(lane_size) { 5661 case 1 : vpbroadcastb(dst, dst, vec_enc); break; 5662 case 2 : vpbroadcastw(dst, dst, vec_enc); break; 5663 case 4 : vpbroadcastd(dst, dst, vec_enc); break; 5664 case 8 : vpbroadcastq(dst, dst, vec_enc); break; 5665 default : fatal("Unsupported lane size %d", lane_size); 5666 break; 5667 } 5668 } 5669 } 5670 5671 // 5672 // Following is lookup table based popcount computation algorithm:- 5673 // Index Bit set count 5674 // [ 0000 -> 0, 5675 // 0001 -> 1, 5676 // 0010 -> 1, 5677 // 0011 -> 2, 5678 // 0100 -> 1, 5679 // 0101 -> 2, 5680 // 0110 -> 2, 5681 // 0111 -> 3, 5682 // 1000 -> 1, 5683 // 1001 -> 2, 5684 // 1010 -> 2, 5685 // 1011 -> 3, 5686 // 1100 -> 2, 5687 // 1101 -> 3, 5688 // 1110 -> 3, 1111 -> 4 ] 5689 // a. Count the number of 1s in the 4 LSB bits of each byte. These bits are used as 5690 // shuffle indices for lookup table access. 5691 // b. Right shift each byte of vector lane by 4 positions. 5692 // c. Count the number of 1s in the 4 MSB bits of each byte. These bits are used as 5693 // shuffle indices for lookup table access. 5694 // d. Add the bitset count of upper and lower 4 bits of each byte. 5695 // e. Unpack double words to quad words and compute sum of absolute difference of bitset 5696 // count of all the bytes of a quadword. 5697 // f. Perform step e. for upper 128bit vector lane. 5698 // g. Pack the bitset count of quadwords back to double word. 5699 // h. Unpacking and packing operations are not needed for 64bit vector lane.
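//
// For illustration, a scalar model of steps a-d (kept as a comment; the helper name is
// assumed and this is not part of the VM code):
//
//   static int popcount_byte_lut(unsigned char b) {
//     static const unsigned char lut[16] = { 0, 1, 1, 2, 1, 2, 2, 3,
//                                            1, 2, 2, 3, 2, 3, 3, 4 };
//     return lut[b & 0x0F] + lut[b >> 4];   // steps a-d, per byte
//   }
//
// Steps e-h then widen the per-byte counts to the requested element size using the
// unpack/VPSADBW/pack sequences in the routines below.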
5700 5701 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5702 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5703 assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), ""); 5704 vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc); 5705 vpsrlw(dst, src, 4, vec_enc); 5706 vpand(dst, dst, xtmp1, vec_enc); 5707 vpand(xtmp1, src, xtmp1, vec_enc); 5708 vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg); 5709 vpshufb(xtmp1, xtmp2, xtmp1, vec_enc); 5710 vpshufb(dst, xtmp2, dst, vec_enc); 5711 vpaddb(dst, dst, xtmp1, vec_enc); 5712 } 5713 5714 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5715 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5716 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5717 // Following code is as per steps e,f,g and h of above algorithm. 5718 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5719 vpunpckhdq(dst, xtmp1, xtmp2, vec_enc); 5720 vpsadbw(dst, dst, xtmp2, vec_enc); 5721 vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc); 5722 vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc); 5723 vpackuswb(dst, xtmp1, dst, vec_enc); 5724 } 5725 5726 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5727 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5728 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5729 // Add the popcount of upper and lower bytes of word. 5730 vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc); 5731 vpsrlw(dst, xtmp1, 8, vec_enc); 5732 vpand(xtmp1, xtmp1, xtmp2, vec_enc); 5733 vpaddw(dst, dst, xtmp1, vec_enc); 5734 } 5735 5736 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5737 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5738 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5739 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5740 vpsadbw(dst, xtmp1, xtmp2, vec_enc); 5741 } 5742 5743 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5744 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5745 switch(bt) { 5746 case T_LONG: 5747 vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5748 break; 5749 case T_INT: 5750 vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5751 break; 5752 case T_CHAR: 5753 case T_SHORT: 5754 vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5755 break; 5756 case T_BYTE: 5757 case T_BOOLEAN: 5758 vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5759 break; 5760 default: 5761 fatal("Unsupported type %s", type2name(bt)); 5762 break; 5763 } 5764 } 5765 5766 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src, 5767 KRegister mask, bool merge, int vec_enc) { 5768 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 5769 switch(bt) { 5770 case T_LONG: 5771 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5772 evpopcntq(dst, mask, src, merge, vec_enc); 5773 break; 5774 case T_INT: 5775 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5776 evpopcntd(dst, mask, src, merge, vec_enc); 5777 break; 5778 case T_CHAR: 5779 case T_SHORT: 5780 assert(VM_Version::supports_avx512_bitalg(), ""); 5781 evpopcntw(dst, mask, src, merge, vec_enc); 5782 break; 5783 case T_BYTE: 5784 case T_BOOLEAN: 5785 assert(VM_Version::supports_avx512_bitalg(), ""); 5786 evpopcntb(dst, mask, 
src, merge, vec_enc); 5787 break; 5788 default: 5789 fatal("Unsupported type %s", type2name(bt)); 5790 break; 5791 } 5792 } 5793 5794 #ifndef _LP64 5795 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) { 5796 assert(VM_Version::supports_avx512bw(), ""); 5797 kmovdl(tmp, src); 5798 kunpckdql(dst, tmp, tmp); 5799 } 5800 #endif 5801 5802 // Bit reversal algorithm first reverses the bits of each byte followed by 5803 // a byte level reversal for multi-byte primitive types (short/int/long). 5804 // Algorithm performs a lookup table access to get reverse bit sequence 5805 // corresponding to a 4 bit value. Thus a reverse bit sequence for a byte 5806 // is obtained by swapping the reverse bit sequences of upper and lower 5807 // nibble of a byte. 5808 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5809 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5810 if (VM_Version::supports_avx512vlbw()) { 5811 5812 // Get the reverse bit sequence of lower nibble of each byte. 5813 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg); 5814 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 5815 evpandq(dst, xtmp2, src, vec_enc); 5816 vpshufb(dst, xtmp1, dst, vec_enc); 5817 vpsllq(dst, dst, 4, vec_enc); 5818 5819 // Get the reverse bit sequence of upper nibble of each byte. 5820 vpandn(xtmp2, xtmp2, src, vec_enc); 5821 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 5822 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 5823 5824 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 5825 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 5826 evporq(xtmp2, dst, xtmp2, vec_enc); 5827 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 5828 5829 } else if(vec_enc == Assembler::AVX_512bit) { 5830 // Shift based bit reversal. 5831 assert(bt == T_LONG || bt == T_INT, ""); 5832 5833 // Swap lower and upper nibble of each byte. 5834 vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc); 5835 5836 // Swap two least and most significant bits of each nibble. 5837 vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc); 5838 5839 // Swap adjacent pair of bits. 5840 evmovdqul(xtmp1, k0, dst, true, vec_enc); 5841 vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc); 5842 5843 evmovdqul(xtmp1, k0, dst, true, vec_enc); 5844 vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc); 5845 } else { 5846 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp); 5847 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 5848 5849 // Get the reverse bit sequence of lower nibble of each byte. 5850 vpand(dst, xtmp2, src, vec_enc); 5851 vpshufb(dst, xtmp1, dst, vec_enc); 5852 vpsllq(dst, dst, 4, vec_enc); 5853 5854 // Get the reverse bit sequence of upper nibble of each byte. 5855 vpandn(xtmp2, xtmp2, src, vec_enc); 5856 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 5857 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 5858 5859 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 5860 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 
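  // Worked example: for the byte 0b10110010 the reversed low nibble 0010 -> 0100 lands in
  // the high half after the shift (0100_0000), the reversed high nibble 1011 -> 1101 lands
  // in the low half, and the OR below yields 0b01001101, the original byte bit-reversed.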
5861 vpor(xtmp2, dst, xtmp2, vec_enc); 5862 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 5863 } 5864 } 5865 5866 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc, 5867 XMMRegister xtmp, Register rscratch) { 5868 assert(VM_Version::supports_gfni(), ""); 5869 assert(rscratch != noreg || always_reachable(mask), "missing"); 5870 5871 // Galois field instruction based bit reversal based on following algorithm. 5872 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 5873 vpbroadcastq(xtmp, mask, vec_enc, rscratch); 5874 vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc); 5875 vector_reverse_byte(bt, dst, xtmp, vec_enc); 5876 } 5877 5878 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src, 5879 XMMRegister xtmp1, Register rtmp, int vec_enc) { 5880 vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc); 5881 evpandq(dst, xtmp1, src, vec_enc); 5882 vpsllq(dst, dst, nbits, vec_enc); 5883 vpandn(xtmp1, xtmp1, src, vec_enc); 5884 vpsrlq(xtmp1, xtmp1, nbits, vec_enc); 5885 evporq(dst, dst, xtmp1, vec_enc); 5886 } 5887 5888 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5889 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5890 // Shift based bit reversal. 5891 assert(VM_Version::supports_evex(), ""); 5892 switch(bt) { 5893 case T_LONG: 5894 // Swap upper and lower double word of each quad word. 5895 evprorq(xtmp1, k0, src, 32, true, vec_enc); 5896 evprord(xtmp1, k0, xtmp1, 16, true, vec_enc); 5897 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 5898 break; 5899 case T_INT: 5900 // Swap upper and lower word of each double word. 5901 evprord(xtmp1, k0, src, 16, true, vec_enc); 5902 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 5903 break; 5904 case T_CHAR: 5905 case T_SHORT: 5906 // Swap upper and lower byte of each word. 5907 vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc); 5908 break; 5909 case T_BYTE: 5910 evmovdquq(dst, k0, src, true, vec_enc); 5911 break; 5912 default: 5913 fatal("Unsupported type %s", type2name(bt)); 5914 break; 5915 } 5916 } 5917 5918 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) { 5919 if (bt == T_BYTE) { 5920 if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) { 5921 evmovdquq(dst, k0, src, true, vec_enc); 5922 } else { 5923 vmovdqu(dst, src); 5924 } 5925 return; 5926 } 5927 // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using 5928 // pre-computed shuffle indices. 
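  // For illustration (shuffle table contents assumed): for T_INT the per-element byte
  // order becomes { 3, 2, 1, 0 }, so a lane holding 0x01020304 becomes 0x04030201.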
5929 switch(bt) { 5930 case T_LONG: 5931 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg); 5932 break; 5933 case T_INT: 5934 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg); 5935 break; 5936 case T_CHAR: 5937 case T_SHORT: 5938 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg); 5939 break; 5940 default: 5941 fatal("Unsupported type %s", type2name(bt)); 5942 break; 5943 } 5944 vpshufb(dst, src, dst, vec_enc); 5945 } 5946 5947 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src, 5948 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 5949 KRegister ktmp, Register rtmp, bool merge, int vec_enc) { 5950 assert(is_integral_type(bt), ""); 5951 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 5952 assert(VM_Version::supports_avx512cd(), ""); 5953 switch(bt) { 5954 case T_LONG: 5955 evplzcntq(dst, ktmp, src, merge, vec_enc); 5956 break; 5957 case T_INT: 5958 evplzcntd(dst, ktmp, src, merge, vec_enc); 5959 break; 5960 case T_SHORT: 5961 vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc); 5962 vpunpcklwd(xtmp2, xtmp1, src, vec_enc); 5963 evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc); 5964 vpunpckhwd(dst, xtmp1, src, vec_enc); 5965 evplzcntd(dst, ktmp, dst, merge, vec_enc); 5966 vpackusdw(dst, xtmp2, dst, vec_enc); 5967 break; 5968 case T_BYTE: 5969 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 5970 // accessing the lookup table. 5971 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 5972 // accessing the lookup table. 5973 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 5974 assert(VM_Version::supports_avx512bw(), ""); 5975 evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp); 5976 vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc); 5977 vpand(xtmp2, dst, src, vec_enc); 5978 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 5979 vpsrlw(xtmp3, src, 4, vec_enc); 5980 vpand(xtmp3, dst, xtmp3, vec_enc); 5981 vpshufb(dst, xtmp1, xtmp3, vec_enc); 5982 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 5983 evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc); 5984 evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc); 5985 break; 5986 default: 5987 fatal("Unsupported type %s", type2name(bt)); 5988 break; 5989 } 5990 } 5991 5992 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5993 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 5994 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp); 5995 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 5996 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 5997 // accessing the lookup table. 5998 vpand(dst, xtmp2, src, vec_enc); 5999 vpshufb(dst, xtmp1, dst, vec_enc); 6000 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 6001 // accessing the lookup table. 6002 vpsrlw(xtmp3, src, 4, vec_enc); 6003 vpand(xtmp3, xtmp2, xtmp3, vec_enc); 6004 vpshufb(xtmp2, xtmp1, xtmp3, vec_enc); 6005 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 
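  // Worked example: for the byte 0b00010110 the high nibble is non-zero, so the count is
  // lut[0001] = 3; for 0b00000101 the high nibble is zero, so the count is
  // lut[0000] + lut[0101] = 4 + 1 = 5.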
6006 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 6007 vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc); 6008 vpaddb(dst, dst, xtmp2, vec_enc); 6009 vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc); 6010 } 6011 6012 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6013 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6014 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6015 // Add zero counts of lower byte and upper byte of a word if 6016 // upper byte holds a zero value. 6017 vpsrlw(xtmp3, src, 8, vec_enc); 6018 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 6019 vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc); 6020 vpsllw(xtmp2, dst, 8, vec_enc); 6021 vpaddw(xtmp2, xtmp2, dst, vec_enc); 6022 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6023 vpsrlw(dst, dst, 8, vec_enc); 6024 } 6025 6026 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6027 XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) { 6028 // Since IEEE 754 floating point format represents mantissa in 1.0 format 6029 // hence biased exponent can be used to compute leading zero count as per 6030 // following formula:- 6031 // LZCNT = 32 - (biased_exp - 127) 6032 // Special handling has been introduced for Zero, Max_Int and -ve source values. 6033 6034 // Broadcast 0xFF 6035 vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc); 6036 vpsrld(xtmp1, xtmp1, 24, vec_enc); 6037 6038 // Extract biased exponent. 6039 vcvtdq2ps(dst, src, vec_enc); 6040 vpsrld(dst, dst, 23, vec_enc); 6041 vpand(dst, dst, xtmp1, vec_enc); 6042 6043 // Broadcast 127. 6044 vpsrld(xtmp1, xtmp1, 1, vec_enc); 6045 // Exponent = biased_exp - 127 6046 vpsubd(dst, dst, xtmp1, vec_enc); 6047 6048 // Exponent = Exponent + 1 6049 vpsrld(xtmp3, xtmp1, 6, vec_enc); 6050 vpaddd(dst, dst, xtmp3, vec_enc); 6051 6052 // Replace -ve exponent with zero, exponent is -ve when src 6053 // lane contains a zero value. 6054 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 6055 vblendvps(dst, dst, xtmp2, dst, vec_enc); 6056 6057 // Rematerialize broadcast 32. 6058 vpslld(xtmp1, xtmp3, 5, vec_enc); 6059 // Exponent is 32 if corresponding source lane contains max_int value. 6060 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 6061 // LZCNT = 32 - exponent 6062 vpsubd(dst, xtmp1, dst, vec_enc); 6063 6064 // Replace LZCNT with a value 1 if corresponding source lane 6065 // contains max_int value. 6066 vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc); 6067 6068 // Replace biased_exp with 0 if source lane value is less than zero. 6069 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 6070 vblendvps(dst, dst, xtmp2, src, vec_enc); 6071 } 6072 6073 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6074 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6075 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6076 // Add zero counts of lower word and upper word of a double word if 6077 // upper word holds a zero value. 6078 vpsrld(xtmp3, src, 16, vec_enc); 6079 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 6080 vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc); 6081 vpslld(xtmp2, dst, 16, vec_enc); 6082 vpaddd(xtmp2, xtmp2, dst, vec_enc); 6083 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6084 vpsrld(dst, dst, 16, vec_enc); 6085 // Add zero counts of lower doubleword and upper doubleword of a 6086 // quadword if upper doubleword holds a zero value. 
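  // Worked example: for a quadword lane holding 0x0000000000001234 the upper doubleword
  // is zero, so the count folds to 32 + lzcnt32(0x1234) = 32 + 19 = 51.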
6087 vpsrlq(xtmp3, src, 32, vec_enc); 6088 vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc); 6089 vpsllq(xtmp2, dst, 32, vec_enc); 6090 vpaddq(xtmp2, xtmp2, dst, vec_enc); 6091 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6092 vpsrlq(dst, dst, 32, vec_enc); 6093 } 6094 6095 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, 6096 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 6097 Register rtmp, int vec_enc) { 6098 assert(is_integral_type(bt), "unexpected type"); 6099 assert(vec_enc < Assembler::AVX_512bit, ""); 6100 switch(bt) { 6101 case T_LONG: 6102 vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6103 break; 6104 case T_INT: 6105 vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc); 6106 break; 6107 case T_SHORT: 6108 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6109 break; 6110 case T_BYTE: 6111 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6112 break; 6113 default: 6114 fatal("Unsupported type %s", type2name(bt)); 6115 break; 6116 } 6117 } 6118 6119 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) { 6120 switch(bt) { 6121 case T_BYTE: 6122 vpsubb(dst, src1, src2, vec_enc); 6123 break; 6124 case T_SHORT: 6125 vpsubw(dst, src1, src2, vec_enc); 6126 break; 6127 case T_INT: 6128 vpsubd(dst, src1, src2, vec_enc); 6129 break; 6130 case T_LONG: 6131 vpsubq(dst, src1, src2, vec_enc); 6132 break; 6133 default: 6134 fatal("Unsupported type %s", type2name(bt)); 6135 break; 6136 } 6137 } 6138 6139 // Trailing zero count computation is based on leading zero count operation as per 6140 // following equation. All AVX3 targets support AVX512CD feature which offers 6141 // direct vector instruction to compute leading zero count. 
6142 // CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x) 6143 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src, 6144 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 6145 XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) { 6146 assert(is_integral_type(bt), ""); 6147 // xtmp = -1 6148 vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc); 6149 // xtmp = xtmp + src 6150 vpadd(bt, xtmp4, xtmp4, src, vec_enc); 6151 // xtmp = xtmp & ~src 6152 vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc); 6153 vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc); 6154 vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc); 6155 vpsub(bt, dst, xtmp4, dst, vec_enc); 6156 } 6157 6158 // Trailing zero count computation for AVX2 targets is based on popcount operation as per the following equation 6159 // CTZ = PRIM_TYPE_WIDTH - POPC(x | -x) 6160 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6161 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6162 assert(is_integral_type(bt), ""); 6163 // xtmp = 0 6164 vpxor(xtmp3, xtmp3, xtmp3, vec_enc); 6165 // xtmp = 0 - src 6166 vpsub(bt, xtmp3, xtmp3, src, vec_enc); 6167 // xtmp = xtmp | src 6168 vpor(xtmp3, xtmp3, src, vec_enc); 6169 vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc); 6170 vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc); 6171 vpsub(bt, dst, xtmp1, dst, vec_enc); 6172 } 6173 6174 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) { 6175 Label done; 6176 Label neg_divisor_fastpath; 6177 cmpl(divisor, 0); 6178 jccb(Assembler::less, neg_divisor_fastpath); 6179 xorl(rdx, rdx); 6180 divl(divisor); 6181 jmpb(done); 6182 bind(neg_divisor_fastpath); 6183 // Fastpath for divisor < 0: 6184 // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1) 6185 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned() 6186 movl(rdx, rax); 6187 subl(rdx, divisor); 6188 if (VM_Version::supports_bmi1()) { 6189 andnl(rax, rdx, rax); 6190 } else { 6191 notl(rdx); 6192 andl(rax, rdx); 6193 } 6194 shrl(rax, 31); 6195 bind(done); 6196 } 6197 6198 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) { 6199 Label done; 6200 Label neg_divisor_fastpath; 6201 cmpl(divisor, 0); 6202 jccb(Assembler::less, neg_divisor_fastpath); 6203 xorl(rdx, rdx); 6204 divl(divisor); 6205 jmpb(done); 6206 bind(neg_divisor_fastpath); 6207 // Fastpath when divisor < 0: 6208 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor) 6209 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned() 6210 movl(rdx, rax); 6211 subl(rax, divisor); 6212 if (VM_Version::supports_bmi1()) { 6213 andnl(rax, rax, rdx); 6214 } else { 6215 notl(rax); 6216 andl(rax, rdx); 6217 } 6218 sarl(rax, 31); 6219 andl(rax, divisor); 6220 subl(rdx, rax); 6221 bind(done); 6222 } 6223 6224 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) { 6225 Label done; 6226 Label neg_divisor_fastpath; 6227 6228 cmpl(divisor, 0); 6229 jccb(Assembler::less, neg_divisor_fastpath); 6230 xorl(rdx, rdx); 6231 divl(divisor); 6232 jmpb(done); 6233 bind(neg_divisor_fastpath); 6234 // Fastpath for divisor < 0: 6235 // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1) 6236 //
void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
  Label done;
  Label neg_divisor_fastpath;

  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3, which is implemented in
  // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
  movl(rdx, rax);
  subl(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rax, rdx);
  } else {
    notl(rax);
    andl(rax, rdx);
  }
  movl(tmp, rax);
  shrl(rax, 31); // quotient
  sarl(tmp, 31);
  andl(tmp, divisor);
  subl(rdx, tmp); // remainder
  bind(done);
}

#ifdef _LP64
void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
                                 XMMRegister xtmp2, Register rtmp) {
  if (VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal; algorithm described at
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
    mov64(rtmp, 0x8040201008040201L);
    movq(xtmp1, src);
    movq(xtmp2, rtmp);
    gf2p8affineqb(xtmp1, xtmp2, 0);
    movq(dst, xtmp1);
  } else {
    // Swap even and odd numbered bits.
    movl(rtmp, src);
    andl(rtmp, 0x55555555);
    shll(rtmp, 1);
    movl(dst, src);
    andl(dst, 0xAAAAAAAA);
    shrl(dst, 1);
    orl(dst, rtmp);

    // Swap the low and high 2 bits within each nibble.
    movl(rtmp, dst);
    andl(rtmp, 0x33333333);
    shll(rtmp, 2);
    andl(dst, 0xCCCCCCCC);
    shrl(dst, 2);
    orl(dst, rtmp);

    // Swap the low and high nibbles within each byte.
    movl(rtmp, dst);
    andl(rtmp, 0x0F0F0F0F);
    shll(rtmp, 4);
    andl(dst, 0xF0F0F0F0);
    shrl(dst, 4);
    orl(dst, rtmp);
  }
  bswapl(dst);
}
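//
// Scalar sketch of the non-GFNI path above (exposition only): three mask-and-
// shift steps reverse the bits within each byte, then a byte swap reverses the
// byte order. The helper byteswap32 is assumed for illustration (bswap).
//
//   uint32_t reverse_bits32(uint32_t x) {
//     x = ((x & 0x55555555) << 1) | ((x & 0xAAAAAAAA) >> 1);  // swap adjacent bits
//     x = ((x & 0x33333333) << 2) | ((x & 0xCCCCCCCC) >> 2);  // swap 2-bit groups
//     x = ((x & 0x0F0F0F0F) << 4) | ((x & 0xF0F0F0F0) >> 4);  // swap nibbles
//     return byteswap32(x);
//   }
//
// The GFNI path performs the same per-byte bit reversal in a single
// gf2p8affineqb with the matrix 0x8040201008040201, followed by the same bswap.
// reverseL below applies the identical steps to 64-bit values.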
void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
                                 XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
  if (VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal; algorithm described at
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
    mov64(rtmp1, 0x8040201008040201L);
    movq(xtmp1, src);
    movq(xtmp2, rtmp1);
    gf2p8affineqb(xtmp1, xtmp2, 0);
    movq(dst, xtmp1);
  } else {
    // Swap even and odd numbered bits.
    movq(rtmp1, src);
    mov64(rtmp2, 0x5555555555555555L);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 1);
    movq(dst, src);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 1);
    orq(dst, rtmp1);

    // Swap the low and high 2 bits within each nibble.
    movq(rtmp1, dst);
    mov64(rtmp2, 0x3333333333333333L);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 2);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 2);
    orq(dst, rtmp1);

    // Swap the low and high nibbles within each byte.
    movq(rtmp1, dst);
    mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 4);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 4);
    orq(dst, rtmp1);
  }
  bswapq(dst);
}

void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divq(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3, which is implemented in java.lang.Long.divideUnsigned()
  movq(rdx, rax);
  subq(rdx, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rdx, rax);
  } else {
    notq(rdx);
    andq(rax, rdx);
  }
  shrq(rax, 63);
  bind(done);
}

void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorq(rdx, rdx);
  divq(divisor);
  jmp(done);
  bind(neg_divisor_fastpath);
  // Fastpath when divisor < 0:
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3, which is implemented in java.lang.Long.remainderUnsigned()
  movq(rdx, rax);
  subq(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rax, rdx);
  } else {
    notq(rax);
    andq(rax, rdx);
  }
  sarq(rax, 63);
  andq(rax, divisor);
  subq(rdx, rax);
  bind(done);
}

void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorq(rdx, rdx);
  divq(divisor);
  jmp(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3, which is implemented in
  // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
  movq(rdx, rax);
  subq(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rax, rdx);
  } else {
    notq(rax);
    andq(rax, rdx);
  }
  movq(tmp, rax);
  shrq(rax, 63); // quotient
  sarq(tmp, 63);
  andq(tmp, divisor);
  subq(rdx, tmp); // remainder
  bind(done);
}
#endif

void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
                                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
                                        int vlen_enc) {
  assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are in-lane operations and the selection uses only the low
  // 4 bits of each shuffle index, so every index is effectively normalized to
  // the range 0-15. Indices that differ by a multiple of 16 therefore select
  // the same relative byte within a 128-bit lane, e.g. shuffle indices 16, 32
  // and 48 all pick byte 0 of whichever source lane is being shuffled.
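  //
  // Illustrative decomposition of a 6-bit shuffle index (comment only, not
  // emitted code; the names are made up for exposition):
  //
  //   int src_lane = (index >> 4) & 0x3;   // which 128-bit lane of src to read
  //   int in_lane  = index & 0xF;          // byte chosen by vpshufb's low 4 bits
  //
  // The four steps below broadcast source lane 0, 1, 2 and 3 in turn and mask
  // the shuffle with (index in [16*lane, 16*lane + 16)), so each destination
  // byte is produced by exactly one of the four masked shuffles.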
  movl(rtmp, 16);
  evpbroadcastb(xtmp1, rtmp, vlen_enc);

  // Compute a mask for the shuffle vector by comparing indices with the expression INDEX < 16.
  // Broadcast the first 128 bit lane across the entire vector, shuffle the vector lanes using
  // the original shuffle indices and move the shuffled lanes corresponding to a true
  // mask to the destination vector.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
  evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);

  // Perform the above steps with the lane comparison expression INDEX >= 16 && INDEX < 32
  // and broadcasting the second 128 bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
  vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform the above steps with the lane comparison expression INDEX >= 32 && INDEX < 48
  // and broadcasting the third 128 bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp2, Assembler::nlt, true, vlen_enc);
  vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform the above steps with the lane comparison expression INDEX >= 48 && INDEX < 64
  // and broadcasting the fourth 128 bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
  vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
}

void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
                                                   XMMRegister shuffle, XMMRegister src, int vlen_enc) {
  if (vlen_enc == AVX_128bit) {
    vpermilps(dst, src, shuffle, vlen_enc);
  } else if (bt == T_INT) {
    vpermd(dst, shuffle, src, vlen_enc);
  } else {
    assert(bt == T_FLOAT, "");
    vpermps(dst, shuffle, src, vlen_enc);
  }
}