/*
 * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "oops/methodData.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/output.hpp"
#include "opto/opcodes.hpp"
#include "opto/subnode.hpp"
#include "runtime/globals.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/checkedCast.hpp"
#include "utilities/globalDefinitions.hpp"
#include "utilities/powerOfTwo.hpp"
#include "utilities/sizes.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

// C2 compiled method's prolog code.
void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {

  // WARNING: Initial instruction MUST be 5 bytes or longer so that
  // NativeJump::patch_verified_entry will be able to patch out the entry
  // code safely. The push to verify stack depth is ok at 5 bytes,
  // the frame allocation can be either 3 or 6 bytes. So if we don't do
  // stack bang then we must use the 6 byte frame allocation even if
  // we have no frame. :-(
  assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");

  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  // Remove word for return addr
  framesize -= wordSize;
  stack_bang_size -= wordSize;

  // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers must bang for them. But be careful, because
  // some VM calls (such as call site linkage) can use several kilobytes of
  // stack. But the stack safety zone should account for that.
  // See bugs 4446381, 4468289, 4497237.
  if (stack_bang_size > 0) {
    generate_stack_overflow_check(stack_bang_size);

    // We always push rbp, so that on return to interpreter rbp, will be
    // restored correctly and we can correct the stack.
    push(rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      mov(rbp, rsp);
    }
    // Remove word for ebp
    framesize -= wordSize;

    // Create frame
    if (framesize) {
      subptr(rsp, framesize);
    }
  } else {
    // Create frame (force generation of a 4 byte immediate value)
    subptr_imm32(rsp, framesize);

    // Save RBP register now.
    framesize -= wordSize;
    movptr(Address(rsp, framesize), rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      movptr(rbp, rsp);
      if (framesize > 0) {
        addptr(rbp, framesize);
      }
    }
  }

  if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
    framesize -= wordSize;
    movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
  }

#ifndef _LP64
  // If method sets FPU control word do it now
  if (fp_mode_24b) {
    fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24()));
  }
  if (UseSSE >= 2 && VerifyFPU) {
    verify_FPU(0, "FPU stack must be clean on entry");
  }
#endif

#ifdef ASSERT
  if (VerifyStackAtCalls) {
    Label L;
    push(rax);
    mov(rax, rsp);
    andptr(rax, StackAlignmentInBytes-1);
    cmpptr(rax, StackAlignmentInBytes-wordSize);
    pop(rax);
    jcc(Assembler::equal, L);
    STOP("Stack is not properly aligned!");
    bind(L);
  }
#endif

  if (!is_stub) {
    BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
#ifdef _LP64
    if (BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) {
      // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
      Label dummy_slow_path;
      Label dummy_continuation;
      Label* slow_path = &dummy_slow_path;
      Label* continuation = &dummy_continuation;
      if (!Compile::current()->output()->in_scratch_emit_size()) {
        // Use real labels from actual stub when not emitting code for the purpose of measuring its size
        C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
        Compile::current()->output()->add_stub(stub);
        slow_path = &stub->entry();
        continuation = &stub->continuation();
      }
      bs->nmethod_entry_barrier(this, slow_path, continuation);
    }
#else
    // Don't bother with out-of-line nmethod entry barrier stub for x86_32.
    bs->nmethod_entry_barrier(this, nullptr /* slow_path */, nullptr /* continuation */);
#endif
  }
}

inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
  switch (vlen_in_bytes) {
    case 4:  // fall-through
    case 8:  // fall-through
    case 16: return Assembler::AVX_128bit;
    case 32: return Assembler::AVX_256bit;
    case 64: return Assembler::AVX_512bit;

    default: {
      ShouldNotReachHere();
      return Assembler::AVX_NoVec;
    }
  }
}

// fast_lock and fast_unlock used by C2

// Because the transitions from emitted code to the runtime
// monitorenter/exit helper stubs are so slow it's critical that
// we inline both the stack-locking fast path and the inflated fast path.
//
// See also: cmpFastLock and cmpFastUnlock.
//
// What follows is a specialized inline transliteration of the code
// in enter() and exit(). If we're concerned about I$ bloat another
// option would be to emit TrySlowEnter and TrySlowExit methods
// at startup-time. These methods would accept arguments as
// (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
// indications in the icc.ZFlag.
// fast_lock and fast_unlock would simply
// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
// In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
//
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in Java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods. That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads. We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) explicit barriers or fence operations.
//
// TODO:
//
// *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
//    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
//    Given TLAB allocation, Self is usually manifested in a register, so passing it into
//    the lock operators would typically be faster than reifying Self.
//
// *  Ideally I'd define the primitives as:
//       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
//       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
//    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with the rather awkward and brittle register assignments below.
//    Furthermore the register assignments are overconstrained, possibly resulting in
//    sub-optimal code near the synchronization site.
//
// *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
//    Alternately, use a better sp-proximity test.
//
// *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
//    Either one is sufficient to uniquely identify a thread.
//    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
//
// *  Intrinsify notify() and notifyAll() for the common cases where the
//    object is locked by the calling thread but the waitlist is empty.
//    Avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
//
// *  Use jccb and jmpb instead of jcc and jmp to improve code density.
//    But beware of excessive branch density on AMD Opterons.
//
// *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
//    or failure of the fast path. If the fast path fails then we pass
//    control to the slow path, typically in C. In fast_lock and
//    fast_unlock we often branch to DONE_LABEL, just to find that C2
//    will emit a conditional branch immediately after the node.
//    So we have branches to branches and lots of ICC.ZF games.
//    Instead, it might be better to have C2 pass a "FailureLabel"
//    into fast_lock and fast_unlock. In the case of success, control
//    will drop through the node. ICC.ZF is undefined at exit.
//    In the case of failure, the node will branch directly to the
//    FailureLabel


// obj: object to lock
// box: on-stack box address (displaced header location) - KILLED
// rax,: tmp -- KILLED
// scr: tmp -- KILLED
void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
                                  Register scrReg, Register cx1Reg, Register cx2Reg, Register thread,
                                  Metadata* method_data) {
  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
  // Ensure the register assignments are disjoint
  assert(tmpReg == rax, "");
  assert(cx1Reg == noreg, "");
  assert(cx2Reg == noreg, "");
  assert_different_registers(objReg, boxReg, tmpReg, scrReg);

  // Possible cases that we'll encounter in fast_lock
  // ------------------------------------------------
  // * Inflated
  //    -- unlocked
  //    -- Locked
  //       = by self
  //       = by other
  // * neutral
  // * stack-locked
  //    -- by self
  //       = sp-proximity test hits
  //       = sp-proximity test generates false-negative
  //    -- by other
  //

  Label IsInflated, DONE_LABEL, COUNT;

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(tmpReg, objReg, scrReg);
    movl(tmpReg, Address(tmpReg, Klass::access_flags_offset()));
    testl(tmpReg, JVM_ACC_IS_VALUE_BASED_CLASS);
    jcc(Assembler::notZero, DONE_LABEL);
  }

  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));  // [FETCH]
  testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
  jcc(Assembler::notZero, IsInflated);

  if (LockingMode == LM_MONITOR) {
    // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
    testptr(objReg, objReg);
  } else {
    assert(LockingMode == LM_LEGACY, "must be");
    // Attempt stack-locking ...
    orptr (tmpReg, markWord::unlocked_value);
    movptr(Address(boxReg, 0), tmpReg);  // Anticipate successful CAS
    lock();
    cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));  // Updates tmpReg
    jcc(Assembler::equal, COUNT);        // Success

    // Recursive locking.
    // The object is stack-locked: markword contains stack pointer to BasicLock.
    // Locked by current thread if difference with current SP is less than one page.
    subptr(tmpReg, rsp);
    // Next instruction sets ZFlag == 1 (Success) if difference is less than one page.
    andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - (int)os::vm_page_size())) );
    movptr(Address(boxReg, 0), tmpReg);
  }
  // After recursive stack locking attempt case
  jmp(DONE_LABEL);

  bind(IsInflated);
  // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value

#ifndef _LP64
  // The object is inflated.

  // boxReg refers to the on-stack BasicLock in the current frame.
  // We'd like to write:
  //   set box->_displaced_header = markWord::unused_mark(). Any non-0 value suffices.
  // This is convenient but results in a ST-before-CAS penalty. The following CAS suffers
  // additional latency as we have another ST in the store buffer that must drain.

  // avoid ST-before-CAS
  // register juggle because we need tmpReg for cmpxchgptr below
  movptr(scrReg, boxReg);
  movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]

  // Optimistic form: consider XORL tmpReg,tmpReg
  movptr(tmpReg, NULL_WORD);

  // Appears unlocked - try to swing _owner from null to non-null.
  // Ideally, I'd manifest "Self" with get_thread and then attempt
  // to CAS the register containing thread id into m->Owner.
  // But we don't have enough registers, so instead we can either try to CAS
  // rsp or the address of the box (in scr) into &m->owner. If the CAS succeeds
  // we later store thread id into m->Owner. Transiently storing a stack address
  // (rsp or the address of the box) into m->owner is harmless.
  // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
  lock();
  cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  movptr(Address(scrReg, 0), 3);            // box->_displaced_header = 3
  // If we weren't able to swing _owner from null to the BasicLock
  // then take the slow path.
  jccb  (Assembler::notZero, DONE_LABEL);
  // update _owner from BasicLock to thread
  get_thread (scrReg);                      // beware: clobbers ICCs
  movptr(scrReg, Address(scrReg, JavaThread::lock_id_offset()));
  movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
  xorptr(boxReg, boxReg);                   // set icc.ZFlag = 1 to indicate success
  jmp(DONE_LABEL);

  // If the CAS fails we can either retry or pass control to the slow path.
  // We use the latter tactic.
  // Pass the CAS result in the icc.ZFlag into DONE_LABEL
  // If the CAS was successful ...
  //   Self has acquired the lock
  //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
  // Intentional fall-through into DONE_LABEL ...
#else // _LP64
  // Unconditionally set box->_displaced_header = markWord::unused_mark().
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));

  // It's inflated and we use scrReg for ObjectMonitor* in this section.
  movq(scrReg, tmpReg);
  xorq(tmpReg, tmpReg);
  movptr(boxReg, Address(r15_thread, JavaThread::lock_id_offset()));
  lock();
  cmpxchgptr(boxReg, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));

  // Propagate ICC.ZF from CAS above into DONE_LABEL.
  jccb(Assembler::equal, DONE_LABEL);       // CAS above succeeded; propagate ZF = 1 (success)

  cmpptr(boxReg, rax);                      // Check if we are already the owner (recursive lock)
  jccb(Assembler::notEqual, DONE_LABEL);    // If not recursive, ZF = 0 at this point (fail)
  incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  xorq(rax, rax);                           // Set ZF = 1 (success) for recursive lock, denoting locking success
  jmp(DONE_LABEL);
#endif // _LP64

  bind(COUNT);
  // Count monitors in fast path
  increment(Address(thread, JavaThread::held_monitor_count_offset()));
  xorl(tmpReg, tmpReg); // Set ZF == 1

  bind(DONE_LABEL);

  // At DONE_LABEL the icc ZFlag is set as follows ...
  // fast_unlock uses the same protocol.
  // ZFlag == 1 -> Success
  // ZFlag == 0 -> Failure - force control through the slow path
}

// obj: object to unlock
// box: box address (displaced header location), killed. Must be EAX.
// tmp: killed, cannot be obj nor box.
//
// Some commentary on balanced locking:
//
// fast_lock and fast_unlock are emitted only for provably balanced lock sites.
// Methods that don't have provably balanced locking are forced to run in the
// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
// The interpreter provides two properties:
// I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired in the current activation (frame). Recall that the
//      interpreter maintains an on-stack list of locks currently held by
//      a frame.
// I2:  If a method attempts to unlock an object that is not held by the
//      frame the interpreter throws IMSX.
//
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
// B() doesn't have provably balanced locking so it runs in the interpreter.
// Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
// is still locked by A().
//
// The only other source of unbalanced locking would be JNI. The "Java Native Interface:
// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
// should not be unlocked by "normal" java-level locking and vice-versa. The specification
// doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably given that the spec legislates the JNI case as undefined our implementation
// could reasonably *avoid* checking owner in fast_unlock().
// In the interest of performance we elide m->Owner==Self check in unlock.
// A perfectly viable alternative is to elide the owner check except when
// Xcheck:jni is enabled.

void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, Register scrReg) {
  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
  assert(boxReg == rax, "");
  assert_different_registers(objReg, boxReg, tmpReg);

  Label DONE_LABEL, Stacked, COUNT;

  if (LockingMode == LM_LEGACY) {
    cmpptr(Address(boxReg, 0), NULL_WORD);                            // Examine the displaced header
    jcc   (Assembler::zero, DONE_LABEL);                              // 0 indicates recursive stack-lock
  }
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // Examine the object's markword
  if (LockingMode != LM_MONITOR) {
    testptr(tmpReg, markWord::monitor_value);                         // Inflated?
    jcc(Assembler::zero, Stacked);
  }

  // It's inflated.
  // If the owner is ANONYMOUS, we need to fix it - in an outline stub.
  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t) ObjectMonitor::ANONYMOUS_OWNER);
#ifdef _LP64
  if (!Compile::current()->output()->in_scratch_emit_size()) {
    C2HandleAnonOMOwnerStub* stub = new (Compile::current()->comp_arena()) C2HandleAnonOMOwnerStub(tmpReg, boxReg);
    Compile::current()->output()->add_stub(stub);
    jcc(Assembler::equal, stub->entry());
    bind(stub->continuation());
  } else
#endif
  {
    // We can't easily implement this optimization on 32 bit because we don't have a thread register.
    // Call the slow-path instead.
    jcc(Assembler::notEqual, DONE_LABEL);
  }

  // Despite our balanced locking property we still check that m->_owner == Self
  // as java routines or native JNI code called by this thread might
  // have released the lock.
  // Refer to the comments in synchronizer.cpp for how we might encode extra
  // state in _succ so we can avoid fetching EntryList|cxq.
  //
  // If there's no contention try a 1-0 exit. That is, exit without
  // a costly MEMBAR or CAS. See synchronizer.cpp for details on how
  // we detect and recover from the race that the 1-0 exit admits.
  //
  // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
  // before it STs null into _owner, releasing the lock. Updates
  // to data protected by the critical section must be visible before
  // we drop the lock (and thus before any other thread could acquire
  // the lock and observe the fields protected by the lock).
  // IA32's memory-model is SPO, so STs are ordered with respect to
  // each other and there's no need for an explicit barrier (fence).
  // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
#ifndef _LP64
  // Note that we could employ various encoding schemes to reduce
  // the number of loads below (currently 4) to just 2 or 3.
  // Refer to the comments in synchronizer.cpp.
  // In practice the chain of fetches doesn't seem to impact performance, however.
  xorptr(boxReg, boxReg);
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jccb  (Assembler::notZero, DONE_LABEL);
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  jccb  (Assembler::notZero, DONE_LABEL);
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
  jmpb  (DONE_LABEL);
#else // _LP64
  // It's inflated
  Label CheckSucc, LNotRecursive, LSuccess, LGoSlowPath;

  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
  jccb(Assembler::equal, LNotRecursive);

  // Recursive inflated unlock
  decq(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  xorl(tmpReg, tmpReg); // Set ZF == 1
  jmp(DONE_LABEL);

  bind(LNotRecursive);

  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  jccb  (Assembler::notZero, CheckSucc);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
  jmpb  (DONE_LABEL);

  // Try to avoid passing control into the slow_path ...
  bind  (CheckSucc);

  // The following optional optimization can be elided if necessary
  // Effectively: if (succ == null) goto slow path
  // The code reduces the window for a race, however,
  // and thus benefits performance.
  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
  jccb  (Assembler::zero, LGoSlowPath);

  xorptr(boxReg, boxReg);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);

  // Memory barrier/fence
  // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
  // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
  // This is faster on Nehalem and AMD Shanghai/Barcelona.
  // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
  // We might also restructure (ST Owner=0;barrier;LD _Succ) to
  // (mov box,0; xchgq box, &m->Owner; LD _succ) .
  lock(); addl(Address(rsp, 0), 0);

  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
  jccb  (Assembler::notZero, LSuccess);

  // Rare inopportune interleaving - race.
  // The successor vanished in the small window above.
  // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
  // We need to ensure progress and succession.
  // Try to reacquire the lock.
  // If that fails then the new owner is responsible for succession and this
  // thread needs to take no further action and can exit via the fast path (success).
  // If the re-acquire succeeds then pass control into the slow path.
  // As implemented, this latter mode is horrible because we generated more
  // coherence traffic on the lock *and* artificially extended the critical section
  // length by virtue of passing control into the slow path.

  // box is really RAX -- the following CMPXCHG depends on that binding
  // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
  movptr(scrReg, Address(r15_thread, JavaThread::lock_id_offset()));
  lock();
  cmpxchgptr(scrReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // There's no successor so we tried to regrab the lock.
  // If that didn't work, then another thread grabbed the
  // lock so we're done (and exit was a success).
  jccb  (Assembler::notEqual, LSuccess);
  // Intentional fall-through into slow path

  bind  (LGoSlowPath);
  orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
  jmpb  (DONE_LABEL);

  bind  (LSuccess);
  testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
  jmpb  (DONE_LABEL);

#endif
  if (LockingMode == LM_LEGACY) {
    bind  (Stacked);
    movptr(tmpReg, Address (boxReg, 0));      // re-fetch
    lock();
    cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
    jccb(Assembler::notZero, DONE_LABEL);
    // Count monitors in fast path
#ifndef _LP64
    get_thread(tmpReg);
    decrementl(Address(tmpReg, JavaThread::held_monitor_count_offset()));
#else // _LP64
    decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
#endif
    xorl(tmpReg, tmpReg); // Set ZF == 1
  }

  // ZFlag == 1 -> Success
  // ZFlag == 0 -> Failure - force control through the slow path
  bind(DONE_LABEL);
}

void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register rax_reg,
                                              Register t, Register thread) {
  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert(rax_reg == rax, "Used for CAS");
  assert_different_registers(obj, box, rax_reg, t, thread);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully.
  Label locked;
  // Finish fast lock unsuccessfully. MUST jump with ZF == 0
  Label slow_path;

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(rax_reg, obj, t);
    movl(rax_reg, Address(rax_reg, Klass::access_flags_offset()));
    testl(rax_reg, JVM_ACC_IS_VALUE_BASED_CLASS);
    jcc(Assembler::notZero, slow_path);
  }

  const Register mark = t;

  { // Lightweight Lock

    Label push;

    const Register top = box;

    // Load the mark.
    movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Prefetch top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    // Check for monitor (0b10).
    testptr(mark, markWord::monitor_value);
    jcc(Assembler::notZero, inflated);

    // Check if lock-stack is full.
    cmpl(top, LockStack::end_offset() - 1);
    jcc(Assembler::greater, slow_path);

    // Check if recursive.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    jccb(Assembler::equal, push);

    // Try to lock.
    // Transition lock bits 0b01 => 0b00
    movptr(rax_reg, mark);
    orptr(rax_reg, markWord::unlocked_value);
    andptr(mark, ~(int32_t)markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    jcc(Assembler::notEqual, slow_path);

    bind(push);
    // After successful lock, push object on lock-stack.
    movptr(Address(thread, top), obj);
    addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
    xorl(rax_reg, rax_reg);
    jmpb(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    const Register tagged_monitor = mark;

    // CAS owner (null => current thread).
    xorptr(rax_reg, rax_reg);
    movptr(box, Address(thread, JavaThread::lock_id_offset()));
    lock(); cmpxchgptr(box, Address(tagged_monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
    jccb(Assembler::equal, locked);

    // Check if recursive.
    cmpptr(box, rax_reg);
    jccb(Assembler::notEqual, slow_path);

    // Recursive.
    increment(Address(tagged_monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
    xorl(rax_reg, rax_reg);
  }

  bind(locked);
#ifdef ASSERT
  // Check that locked label is reached with ZF set.
  Label zf_correct;
  Label zf_bad_zero;
  jcc(Assembler::zero, zf_correct);
  jmp(zf_bad_zero);
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with ZF not set.
  jcc(Assembler::notZero, zf_correct);
  stop("Fast Lock ZF != 0");
  bind(zf_bad_zero);
  stop("Fast Lock ZF != 1");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}

void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register reg_rax, Register t1, Register t2, Register thread) {
  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert(reg_rax == rax, "Used for CAS");
  assert_different_registers(obj, reg_rax, t1, t2);

  // Handle inflated monitor.
  Label inflated, inflated_check_lock_stack;
  // Finish fast unlock successfully. MUST jump with ZF == 1
  Label unlocked;

  const Register mark = t1;
  const Register top = reg_rax;

  Label dummy;
  C2FastUnlockLightweightStub* stub = nullptr;

  if (!Compile::current()->output()->in_scratch_emit_size()) {
    stub = new (Compile::current()->comp_arena()) C2FastUnlockLightweightStub(obj, mark, reg_rax, t2, thread);
    Compile::current()->output()->add_stub(stub);
  }

  Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();
  Label& check_successor = stub == nullptr ? dummy : stub->check_successor();

  { // Lightweight Unlock

    // Load top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    // Prefetch mark.
    movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Check if obj is top of lock-stack.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    // Top of lock stack was not obj. Must be monitor.
    jcc(Assembler::notEqual, inflated_check_lock_stack);

    // Pop lock-stack.
    DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
    subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);

    // Check if recursive.
    cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
    jcc(Assembler::equal, unlocked);

    // We elide the monitor check, let the CAS fail instead.

    // Try to unlock.
    // Transition lock bits 0b00 => 0b01
    movptr(reg_rax, mark);
    andptr(reg_rax, ~(int32_t)markWord::lock_mask);
    orptr(mark, markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    jcc(Assembler::notEqual, push_and_slow_path);
    jmp(unlocked);
  }


  { // Handle inflated monitor.
    bind(inflated_check_lock_stack);
#ifdef ASSERT
    Label check_done;
    subl(top, oopSize);
    cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
    jcc(Assembler::below, check_done);
    cmpptr(obj, Address(thread, top));
    jccb(Assembler::notEqual, inflated_check_lock_stack);
    stop("Fast Unlock lock on stack");
    bind(check_done);
    testptr(mark, markWord::monitor_value);
    jccb(Assembler::notZero, inflated);
    stop("Fast Unlock not monitor");
#endif

    bind(inflated);

    // mark contains the tagged ObjectMonitor*.
    const Register monitor = mark;

#ifndef _LP64
    // Check if recursive.
    xorptr(reg_rax, reg_rax);
    orptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
    jcc(Assembler::notZero, check_successor);

    // Check if the entry lists are empty.
    movptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
    orptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
    jcc(Assembler::notZero, check_successor);

    // Release lock.
    movptr(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
#else // _LP64
    Label recursive;

    // Check if recursive.
    cmpptr(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
    jccb(Assembler::notEqual, recursive);

    // Check if the entry lists are empty.
    movptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
    orptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
    jcc(Assembler::notZero, check_successor);

    // Release lock.
    movptr(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
    jmpb(unlocked);

    // Recursive unlock.
    bind(recursive);
    decrement(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
    xorl(t1, t1);
#endif
  }

  bind(unlocked);
  if (stub != nullptr) {
    bind(stub->unlocked_continuation());
  }

#ifdef ASSERT
  // Check that unlocked label is reached with ZF set.
  Label zf_correct;
  jcc(Assembler::zero, zf_correct);
  stop("Fast Unlock ZF != 1");
#endif

  if (stub != nullptr) {
    bind(stub->slow_path_continuation());
  }
#ifdef ASSERT
  // Check that stub->continuation() label is reached with ZF not set.
  jccb(Assembler::notZero, zf_correct);
  stop("Fast Unlock ZF != 0");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}

//-------------------------------------------------------------------------------------------
// Generic instructions support for use in .ad files C2 code generation

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVD) {
    andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
  }
}

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
  if (opcode == Op_AbsVD) {
    vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
    vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVF) {
    andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
  if (opcode == Op_AbsVF) {
    vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
    vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
  }
}

void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
  assert(tmp == xnoreg || elem_bt == T_LONG, "unused");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      pminsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pminsw(dst, src);
    } else if (elem_bt == T_INT) {
      pminsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, dst);
      pcmpgtq(xmm0, src);
      blendvpd(dst, src);  // xmm0 as mask
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      pmaxsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pmaxsw(dst, src);
    } else if (elem_bt == T_INT) {
      pmaxsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, src);
      pcmpgtq(xmm0, dst);
      blendvpd(dst, src);  // xmm0 as mask
    }
  }
}

void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
                                 XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                 int vlen_enc) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      vpminsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpminsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpminsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpminsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src1, src2, dst, vlen_enc);
      }
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      vpmaxsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpmaxsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpmaxsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpmaxsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src2, src1, dst, vlen_enc);
      }
    }
  }
}

// Float/Double min max

void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
                                   XMMRegister dst, XMMRegister a, XMMRegister b,
                                   XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                   int vlen_enc) {
  assert(UseAVX > 0, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(a, tmp, atmp, btmp);
  assert_different_registers(b, tmp, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);

  /* Note on 'non-obvious' assembly sequence:
   *
   * While there are vminps/vmaxps instructions, there are two important differences between hardware
   * and Java on how they handle floats:
   *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
   *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
   *
   * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
   *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
   *     (only useful when signs differ, noop otherwise)
   *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])

   * Following pseudo code describes the algorithm for max[FD] (Min algorithm is on similar lines):
   *   btmp = (b < +0.0) ? a : b
   *   atmp = (b < +0.0) ? b : a
   *   Tmp  = Max_Float(atmp , btmp)
   *   Res  = (atmp == NaN) ?
   *          atmp : Tmp
   */

  void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
  void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
  void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
  XMMRegister mask;

  if (!is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vminps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (!is_double_word && !is_min) {
    mask = b;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vmaxps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vminpd;
    vcmp = &MacroAssembler::vcmppd;
  } else {
    assert(is_double_word && !is_min, "sanity");
    mask = b;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vmaxpd;
    vcmp = &MacroAssembler::vcmppd;
  }

  // Make sure EnableX86ECoreOpts isn't disabled on register overlaps
  XMMRegister maxmin, scratch;
  if (dst == btmp) {
    maxmin = btmp;
    scratch = tmp;
  } else {
    maxmin = tmp;
    scratch = btmp;
  }

  bool precompute_mask = EnableX86ECoreOpts && UseAVX>1;
  if (precompute_mask && !is_double_word) {
    vpsrad(tmp, mask, 32, vlen_enc);
    mask = tmp;
  } else if (precompute_mask && is_double_word) {
    vpxor(tmp, tmp, tmp, vlen_enc);
    vpcmpgtq(tmp, tmp, mask, vlen_enc);
    mask = tmp;
  }

  (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
  (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
  (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
  (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
  (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
}

void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
                                    XMMRegister dst, XMMRegister a, XMMRegister b,
                                    KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
                                    int vlen_enc) {
  assert(UseAVX > 2, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(dst, a, atmp, btmp);
  assert_different_registers(dst, b, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);
  bool merge = true;

  if (!is_double_word && is_min) {
    evpmovd2m(ktmp, a, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vminps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (!is_double_word && !is_min) {
    evpmovd2m(ktmp, b, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (is_double_word && is_min) {
    evpmovq2m(ktmp, a, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vminpd(dst,
           atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    evpmovq2m(ktmp, b, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  }
}

// Float/Double signum
void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
  assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");

  Label DONE_LABEL;

  if (opcode == Op_SignumF) {
    assert(UseSSE > 0, "required");
    ucomiss(dst, zero);
    jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
    jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
    movflt(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
  } else if (opcode == Op_SignumD) {
    assert(UseSSE > 1, "required");
    ucomisd(dst, zero);
    jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
    jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
    movdbl(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
  }

  bind(DONE_LABEL);
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
  if (sign) {
    pmovsxbw(dst, src);
  } else {
    pmovzxbw(dst, src);
  }
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbw(dst, src, vector_len);
  } else {
    vpmovzxbw(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbd(dst, src, vector_len);
  } else {
    vpmovzxbd(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxwd(dst, src, vector_len);
  } else {
    vpmovzxwd(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
                                     int shift, int vector_len) {
  if (opcode == Op_RotateLeftV) {
    if (etype == T_INT) {
      evprold(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprolq(dst, src, shift, vector_len);
    }
  } else {
    assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
    if (etype == T_INT) {
      evprord(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprorq(dst, src, shift, vector_len);
    }
  }
}

void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
                                     XMMRegister shift, int vector_len) {
  if (opcode == Op_RotateLeftV) {
    if (etype == T_INT) {
      evprolvd(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprolvq(dst, src, shift, vector_len);
    }
  } else {
    assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
    if (etype == T_INT) {
      evprorvd(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprorvq(dst, src, shift, vector_len);
    }
  }
}

void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
  if (opcode == Op_RShiftVI) {
    psrad(dst, shift);
  } else if (opcode == Op_LShiftVI) {
    pslld(dst, shift);
  } else {
    assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
    psrld(dst, shift);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVI:  psrad(dst, shift); break;
    case Op_LShiftVI:  pslld(dst, shift); break;
    case Op_URShiftVI: psrld(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
  if (opcode == Op_RShiftVI) {
    vpsrad(dst, nds, shift, vector_len);
  } else if (opcode == Op_LShiftVI) {
    vpslld(dst, nds, shift, vector_len);
  } else {
    assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
    vpsrld(dst, nds, shift, vector_len);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
    case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
    case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  psraw(dst, shift); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  psllw(dst, shift); break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: psrlw(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
    case Op_LShiftVL:  psllq(dst, shift); break;
    case Op_URShiftVL: psrlq(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
  if (opcode == Op_RShiftVL) {
    psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
  } else if (opcode == Op_LShiftVL) {
    psllq(dst,
          shift);
  } else {
    assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
    psrlq(dst, shift);
  }
}

void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVL:  evpsraq(dst, src, shift, vlen_enc); break;
    case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
    case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
  if (opcode == Op_RShiftVL) {
    evpsraq(dst, nds, shift, vector_len);
  } else if (opcode == Op_LShiftVL) {
    vpsllq(dst, nds, shift, vector_len);
  } else {
    assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
    vpsrlq(dst, nds, shift, vector_len);
  }
}

void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  // fall-through
    case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  // fall-through
    case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;

    case Op_URShiftVB: // fall-through
    case Op_URShiftVS: // fall-through
    case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;

    case Op_URShiftVB: // fall-through
    case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
  assert(UseAVX >= 2, "required");
  switch (opcode) {
    case Op_RShiftVL: {
      if (UseAVX > 2) {
        assert(tmp == xnoreg, "not used");
        if (!VM_Version::supports_avx512vl()) {
          vlen_enc = Assembler::AVX_512bit;
        }
        evpsravq(dst, src, shift, vlen_enc);
      } else {
        vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
        vpsrlvq(dst, src, shift, vlen_enc);
        vpsrlvq(tmp, tmp, shift, vlen_enc);
        vpxor(dst, dst, tmp, vlen_enc);
        vpsubq(dst, dst, tmp, vlen_enc);
      }
      break;
    }
    case Op_LShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsllvq(dst, src, shift, vlen_enc);
      break;
    }
    case Op_URShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsrlvq(dst, src, shift, vlen_enc);
      break;
    }
    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

// Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst
void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
  assert(opcode == Op_LShiftVB ||
         opcode == Op_RShiftVB ||
         opcode ==
         Op_URShiftVB, "%s", NodeClassNames[opcode]);
  bool sign = (opcode != Op_URShiftVB);
  assert(vector_len == 0, "required");
  vextendbd(sign, dst, src, 1);
  vpmovzxbd(vtmp, shift, 1);
  varshiftd(opcode, dst, dst, vtmp, 1);
  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
  vextracti128_high(vtmp, dst);
  vpackusdw(dst, dst, vtmp, 0);
}

// Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst
void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
  assert(opcode == Op_LShiftVB ||
         opcode == Op_RShiftVB ||
         opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
  bool sign = (opcode != Op_URShiftVB);
  int ext_vector_len = vector_len + 1;
  vextendbw(sign, dst, src, ext_vector_len);
  vpmovzxbw(vtmp, shift, ext_vector_len);
  varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
  if (vector_len == 0) {
    vextracti128_high(vtmp, dst);
    vpackuswb(dst, dst, vtmp, vector_len);
  } else {
    vextracti64x4_high(vtmp, dst);
    vpackuswb(dst, dst, vtmp, vector_len);
    vpermq(dst, dst, 0xD8, vector_len);
  }
}

void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
  switch(typ) {
    case T_BYTE:
      pinsrb(dst, val, idx);
      break;
    case T_SHORT:
      pinsrw(dst, val, idx);
      break;
    case T_INT:
      pinsrd(dst, val, idx);
      break;
    case T_LONG:
      pinsrq(dst, val, idx);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}

void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
  switch(typ) {
    case T_BYTE:
      vpinsrb(dst, src, val, idx);
      break;
    case T_SHORT:
      vpinsrw(dst, src, val, idx);
      break;
    case T_INT:
      vpinsrd(dst, src, val, idx);
      break;
    case T_LONG:
      vpinsrq(dst, src, val, idx);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}

#ifdef _LP64
void C2_MacroAssembler::vgather8b_masked_offset(BasicType elem_bt,
                                                XMMRegister dst, Register base,
                                                Register idx_base,
                                                Register offset, Register mask,
                                                Register mask_idx, Register rtmp,
                                                int vlen_enc) {
  vpxor(dst, dst, dst, vlen_enc);
  if (elem_bt == T_SHORT) {
    for (int i = 0; i < 4; i++) {
      // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0
      Label skip_load;
      btq(mask, mask_idx);
      jccb(Assembler::carryClear, skip_load);
      movl(rtmp, Address(idx_base, i * 4));
      if (offset != noreg) {
        addl(rtmp, offset);
      }
      pinsrw(dst, Address(base, rtmp, Address::times_2), i);
      bind(skip_load);
      incq(mask_idx);
    }
  } else {
    assert(elem_bt == T_BYTE, "");
    for (int i = 0; i < 8; i++) {
      // dst[i] = mask[i] ?
src[offset + idx_base[i]] : 0 1474 Label skip_load; 1475 btq(mask, mask_idx); 1476 jccb(Assembler::carryClear, skip_load); 1477 movl(rtmp, Address(idx_base, i * 4)); 1478 if (offset != noreg) { 1479 addl(rtmp, offset); 1480 } 1481 pinsrb(dst, Address(base, rtmp), i); 1482 bind(skip_load); 1483 incq(mask_idx); 1484 } 1485 } 1486 } 1487 #endif // _LP64 1488 1489 void C2_MacroAssembler::vgather8b_offset(BasicType elem_bt, XMMRegister dst, 1490 Register base, Register idx_base, 1491 Register offset, Register rtmp, 1492 int vlen_enc) { 1493 vpxor(dst, dst, dst, vlen_enc); 1494 if (elem_bt == T_SHORT) { 1495 for (int i = 0; i < 4; i++) { 1496 // dst[i] = src[offset + idx_base[i]] 1497 movl(rtmp, Address(idx_base, i * 4)); 1498 if (offset != noreg) { 1499 addl(rtmp, offset); 1500 } 1501 pinsrw(dst, Address(base, rtmp, Address::times_2), i); 1502 } 1503 } else { 1504 assert(elem_bt == T_BYTE, ""); 1505 for (int i = 0; i < 8; i++) { 1506 // dst[i] = src[offset + idx_base[i]] 1507 movl(rtmp, Address(idx_base, i * 4)); 1508 if (offset != noreg) { 1509 addl(rtmp, offset); 1510 } 1511 pinsrb(dst, Address(base, rtmp), i); 1512 } 1513 } 1514 } 1515 1516 /* 1517 * Gather using hybrid algorithm, first partially unroll scalar loop 1518 * to accumulate values from gather indices into a quad-word(64bit) slice. 1519 * A slice may hold 8 bytes or 4 short values. This is followed by a vector 1520 * permutation to place the slice into appropriate vector lane 1521 * locations in destination vector. Following pseudo code describes the 1522 * algorithm in detail: 1523 * 1524 * DST_VEC = ZERO_VEC 1525 * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..} 1526 * TWO_VEC = {2, 2, 2, 2, 2, 2, 2, 2, 2..} 1527 * FOREACH_ITER: 1528 * TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES 1529 * TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX 1530 * DST_VEC = DST_VEC OR TEMP_PERM_VEC 1531 * PERM_INDEX = PERM_INDEX - TWO_VEC 1532 * 1533 * With each iteration, doubleword permute indices (0,1) corresponding 1534 * to gathered quadword gets right shifted by two lane positions. 1535 * 1536 */ 1537 void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst, 1538 Register base, Register idx_base, 1539 Register offset, Register mask, 1540 XMMRegister xtmp1, XMMRegister xtmp2, 1541 XMMRegister temp_dst, Register rtmp, 1542 Register mask_idx, Register length, 1543 int vector_len, int vlen_enc) { 1544 Label GATHER8_LOOP; 1545 assert(is_subword_type(elem_ty), ""); 1546 movl(length, vector_len); 1547 vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...} 1548 vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...} 1549 vallones(xtmp2, vlen_enc); 1550 vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc); 1551 vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...} 1552 load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...} 1553 1554 bind(GATHER8_LOOP); 1555 // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES 1556 if (mask == noreg) { 1557 vgather8b_offset(elem_ty, temp_dst, base, idx_base, offset, rtmp, vlen_enc); 1558 } else { 1559 LP64_ONLY(vgather8b_masked_offset(elem_ty, temp_dst, base, idx_base, offset, mask, mask_idx, rtmp, vlen_enc)); 1560 } 1561 // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1) 1562 vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? 
vlen_enc : Assembler::AVX_256bit); 1563 // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2) 1564 vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc); 1565 // DST_VEC = DST_VEC OR TEMP_PERM_VEC 1566 vpor(dst, dst, temp_dst, vlen_enc); 1567 addptr(idx_base, 32 >> (type2aelembytes(elem_ty) - 1)); 1568 subl(length, 8 >> (type2aelembytes(elem_ty) - 1)); 1569 jcc(Assembler::notEqual, GATHER8_LOOP); 1570 } 1571 1572 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) { 1573 switch(typ) { 1574 case T_INT: 1575 vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len); 1576 break; 1577 case T_FLOAT: 1578 vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len); 1579 break; 1580 case T_LONG: 1581 vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len); 1582 break; 1583 case T_DOUBLE: 1584 vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len); 1585 break; 1586 default: 1587 assert(false,"Should not reach here."); 1588 break; 1589 } 1590 } 1591 1592 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) { 1593 switch(typ) { 1594 case T_INT: 1595 evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len); 1596 break; 1597 case T_FLOAT: 1598 evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len); 1599 break; 1600 case T_LONG: 1601 evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len); 1602 break; 1603 case T_DOUBLE: 1604 evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len); 1605 break; 1606 default: 1607 assert(false,"Should not reach here."); 1608 break; 1609 } 1610 } 1611 1612 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) { 1613 switch(typ) { 1614 case T_INT: 1615 evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len); 1616 break; 1617 case T_FLOAT: 1618 evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len); 1619 break; 1620 case T_LONG: 1621 evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len); 1622 break; 1623 case T_DOUBLE: 1624 evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len); 1625 break; 1626 default: 1627 assert(false,"Should not reach here."); 1628 break; 1629 } 1630 } 1631 1632 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) { 1633 if (vlen_in_bytes <= 16) { 1634 pxor (dst, dst); 1635 psubb(dst, src); 1636 switch (elem_bt) { 1637 case T_BYTE: /* nothing to do */ break; 1638 case T_SHORT: pmovsxbw(dst, dst); break; 1639 case T_INT: pmovsxbd(dst, dst); break; 1640 case T_FLOAT: pmovsxbd(dst, dst); break; 1641 case T_LONG: pmovsxbq(dst, dst); break; 1642 case T_DOUBLE: pmovsxbq(dst, dst); break; 1643 1644 default: assert(false, "%s", type2name(elem_bt)); 1645 } 1646 } else { 1647 assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, ""); 1648 int vlen_enc = vector_length_encoding(vlen_in_bytes); 1649 1650 vpxor (dst, dst, dst, vlen_enc); 1651 vpsubb(dst, dst, src, is_legacy ? 
AVX_256bit : vlen_enc); 1652 1653 switch (elem_bt) { 1654 case T_BYTE: /* nothing to do */ break; 1655 case T_SHORT: vpmovsxbw(dst, dst, vlen_enc); break; 1656 case T_INT: vpmovsxbd(dst, dst, vlen_enc); break; 1657 case T_FLOAT: vpmovsxbd(dst, dst, vlen_enc); break; 1658 case T_LONG: vpmovsxbq(dst, dst, vlen_enc); break; 1659 case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break; 1660 1661 default: assert(false, "%s", type2name(elem_bt)); 1662 } 1663 } 1664 } 1665 1666 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) { 1667 if (novlbwdq) { 1668 vpmovsxbd(xtmp, src, vlen_enc); 1669 evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()), 1670 Assembler::eq, true, vlen_enc, noreg); 1671 } else { 1672 vpxor(xtmp, xtmp, xtmp, vlen_enc); 1673 vpsubb(xtmp, xtmp, src, vlen_enc); 1674 evpmovb2m(dst, xtmp, vlen_enc); 1675 } 1676 } 1677 1678 void C2_MacroAssembler::load_vector(XMMRegister dst, Address src, int vlen_in_bytes) { 1679 switch (vlen_in_bytes) { 1680 case 4: movdl(dst, src); break; 1681 case 8: movq(dst, src); break; 1682 case 16: movdqu(dst, src); break; 1683 case 32: vmovdqu(dst, src); break; 1684 case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break; 1685 default: ShouldNotReachHere(); 1686 } 1687 } 1688 1689 void C2_MacroAssembler::load_vector(XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) { 1690 assert(rscratch != noreg || always_reachable(src), "missing"); 1691 1692 if (reachable(src)) { 1693 load_vector(dst, as_Address(src), vlen_in_bytes); 1694 } else { 1695 lea(rscratch, src); 1696 load_vector(dst, Address(rscratch, 0), vlen_in_bytes); 1697 } 1698 } 1699 1700 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) { 1701 int vlen_enc = vector_length_encoding(vlen); 1702 if (VM_Version::supports_avx()) { 1703 if (bt == T_LONG) { 1704 if (VM_Version::supports_avx2()) { 1705 vpbroadcastq(dst, src, vlen_enc); 1706 } else { 1707 vmovddup(dst, src, vlen_enc); 1708 } 1709 } else if (bt == T_DOUBLE) { 1710 if (vlen_enc != Assembler::AVX_128bit) { 1711 vbroadcastsd(dst, src, vlen_enc, noreg); 1712 } else { 1713 vmovddup(dst, src, vlen_enc); 1714 } 1715 } else { 1716 if (VM_Version::supports_avx2() && is_integral_type(bt)) { 1717 vpbroadcastd(dst, src, vlen_enc); 1718 } else { 1719 vbroadcastss(dst, src, vlen_enc); 1720 } 1721 } 1722 } else if (VM_Version::supports_sse3()) { 1723 movddup(dst, src); 1724 } else { 1725 movq(dst, src); 1726 if (vlen == 16) { 1727 punpcklqdq(dst, dst); 1728 } 1729 } 1730 } 1731 1732 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) { 1733 // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64. 1734 int offset = exact_log2(type2aelembytes(bt)) << 6; 1735 if (is_floating_point_type(bt)) { 1736 offset += 128; 1737 } 1738 ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset); 1739 load_vector(dst, addr, vlen_in_bytes); 1740 } 1741 1742 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles. 
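// The integer reduceN* routines below all share one shape: repeatedly fold the
// upper half of the vector onto the lower half (vextracti64x4/vextracti128 for
// whole lanes, pshufd within a lane) with a single reduce_operation_128/256,
// then recurse into the next smaller reduction until the 2-element base case
// folds in the scalar input src1. Illustrative sketch for an 8-lane int
// reduction with a non-additive opcode (AddReductionVI instead uses the
// phaddd/vphaddd horizontal adds visible below):
//
//   acc = op(src2[4..7], src2[0..3])   // vextracti128_high + 128-bit op
//   acc = op(acc[2..3],  acc[0..1])    // pshufd(0xE)       + 128-bit op
//   acc = op(acc[1],     acc[0])       // pshufd(0x1)       + 128-bit op
//   dst = op(acc[0],     src1)         // movdl in/out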
1743 1744 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 1745 int vector_len = Assembler::AVX_128bit; 1746 1747 switch (opcode) { 1748 case Op_AndReductionV: pand(dst, src); break; 1749 case Op_OrReductionV: por (dst, src); break; 1750 case Op_XorReductionV: pxor(dst, src); break; 1751 case Op_MinReductionV: 1752 switch (typ) { 1753 case T_BYTE: pminsb(dst, src); break; 1754 case T_SHORT: pminsw(dst, src); break; 1755 case T_INT: pminsd(dst, src); break; 1756 case T_LONG: assert(UseAVX > 2, "required"); 1757 vpminsq(dst, dst, src, Assembler::AVX_128bit); break; 1758 default: assert(false, "wrong type"); 1759 } 1760 break; 1761 case Op_MaxReductionV: 1762 switch (typ) { 1763 case T_BYTE: pmaxsb(dst, src); break; 1764 case T_SHORT: pmaxsw(dst, src); break; 1765 case T_INT: pmaxsd(dst, src); break; 1766 case T_LONG: assert(UseAVX > 2, "required"); 1767 vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break; 1768 default: assert(false, "wrong type"); 1769 } 1770 break; 1771 case Op_AddReductionVF: addss(dst, src); break; 1772 case Op_AddReductionVD: addsd(dst, src); break; 1773 case Op_AddReductionVI: 1774 switch (typ) { 1775 case T_BYTE: paddb(dst, src); break; 1776 case T_SHORT: paddw(dst, src); break; 1777 case T_INT: paddd(dst, src); break; 1778 default: assert(false, "wrong type"); 1779 } 1780 break; 1781 case Op_AddReductionVL: paddq(dst, src); break; 1782 case Op_MulReductionVF: mulss(dst, src); break; 1783 case Op_MulReductionVD: mulsd(dst, src); break; 1784 case Op_MulReductionVI: 1785 switch (typ) { 1786 case T_SHORT: pmullw(dst, src); break; 1787 case T_INT: pmulld(dst, src); break; 1788 default: assert(false, "wrong type"); 1789 } 1790 break; 1791 case Op_MulReductionVL: assert(UseAVX > 2, "required"); 1792 evpmullq(dst, dst, src, vector_len); break; 1793 default: assert(false, "wrong opcode"); 1794 } 1795 } 1796 1797 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 1798 int vector_len = Assembler::AVX_256bit; 1799 1800 switch (opcode) { 1801 case Op_AndReductionV: vpand(dst, src1, src2, vector_len); break; 1802 case Op_OrReductionV: vpor (dst, src1, src2, vector_len); break; 1803 case Op_XorReductionV: vpxor(dst, src1, src2, vector_len); break; 1804 case Op_MinReductionV: 1805 switch (typ) { 1806 case T_BYTE: vpminsb(dst, src1, src2, vector_len); break; 1807 case T_SHORT: vpminsw(dst, src1, src2, vector_len); break; 1808 case T_INT: vpminsd(dst, src1, src2, vector_len); break; 1809 case T_LONG: assert(UseAVX > 2, "required"); 1810 vpminsq(dst, src1, src2, vector_len); break; 1811 default: assert(false, "wrong type"); 1812 } 1813 break; 1814 case Op_MaxReductionV: 1815 switch (typ) { 1816 case T_BYTE: vpmaxsb(dst, src1, src2, vector_len); break; 1817 case T_SHORT: vpmaxsw(dst, src1, src2, vector_len); break; 1818 case T_INT: vpmaxsd(dst, src1, src2, vector_len); break; 1819 case T_LONG: assert(UseAVX > 2, "required"); 1820 vpmaxsq(dst, src1, src2, vector_len); break; 1821 default: assert(false, "wrong type"); 1822 } 1823 break; 1824 case Op_AddReductionVI: 1825 switch (typ) { 1826 case T_BYTE: vpaddb(dst, src1, src2, vector_len); break; 1827 case T_SHORT: vpaddw(dst, src1, src2, vector_len); break; 1828 case T_INT: vpaddd(dst, src1, src2, vector_len); break; 1829 default: assert(false, "wrong type"); 1830 } 1831 break; 1832 case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break; 1833 case Op_MulReductionVI: 1834 switch (typ) { 1835 
case T_SHORT: vpmullw(dst, src1, src2, vector_len); break; 1836 case T_INT: vpmulld(dst, src1, src2, vector_len); break; 1837 default: assert(false, "wrong type"); 1838 } 1839 break; 1840 case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break; 1841 default: assert(false, "wrong opcode"); 1842 } 1843 } 1844 1845 void C2_MacroAssembler::reduce_fp(int opcode, int vlen, 1846 XMMRegister dst, XMMRegister src, 1847 XMMRegister vtmp1, XMMRegister vtmp2) { 1848 switch (opcode) { 1849 case Op_AddReductionVF: 1850 case Op_MulReductionVF: 1851 reduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 1852 break; 1853 1854 case Op_AddReductionVD: 1855 case Op_MulReductionVD: 1856 reduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 1857 break; 1858 1859 default: assert(false, "wrong opcode"); 1860 } 1861 } 1862 1863 void C2_MacroAssembler::reduceB(int opcode, int vlen, 1864 Register dst, Register src1, XMMRegister src2, 1865 XMMRegister vtmp1, XMMRegister vtmp2) { 1866 switch (vlen) { 1867 case 8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1868 case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1869 case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1870 case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1871 1872 default: assert(false, "wrong vector length"); 1873 } 1874 } 1875 1876 void C2_MacroAssembler::mulreduceB(int opcode, int vlen, 1877 Register dst, Register src1, XMMRegister src2, 1878 XMMRegister vtmp1, XMMRegister vtmp2) { 1879 switch (vlen) { 1880 case 8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1881 case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1882 case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1883 case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1884 1885 default: assert(false, "wrong vector length"); 1886 } 1887 } 1888 1889 void C2_MacroAssembler::reduceS(int opcode, int vlen, 1890 Register dst, Register src1, XMMRegister src2, 1891 XMMRegister vtmp1, XMMRegister vtmp2) { 1892 switch (vlen) { 1893 case 4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1894 case 8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1895 case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1896 case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1897 1898 default: assert(false, "wrong vector length"); 1899 } 1900 } 1901 1902 void C2_MacroAssembler::reduceI(int opcode, int vlen, 1903 Register dst, Register src1, XMMRegister src2, 1904 XMMRegister vtmp1, XMMRegister vtmp2) { 1905 switch (vlen) { 1906 case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1907 case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1908 case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1909 case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1910 1911 default: assert(false, "wrong vector length"); 1912 } 1913 } 1914 1915 #ifdef _LP64 1916 void C2_MacroAssembler::reduceL(int opcode, int vlen, 1917 Register dst, Register src1, XMMRegister src2, 1918 XMMRegister vtmp1, XMMRegister vtmp2) { 1919 switch (vlen) { 1920 case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1921 case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1922 case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1923 1924 default: assert(false, "wrong vector length"); 1925 } 1926 } 1927 #endif // _LP64 1928 1929 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister 
src, XMMRegister vtmp1, XMMRegister vtmp2) { 1930 switch (vlen) { 1931 case 2: 1932 assert(vtmp2 == xnoreg, ""); 1933 reduce2F(opcode, dst, src, vtmp1); 1934 break; 1935 case 4: 1936 assert(vtmp2 == xnoreg, ""); 1937 reduce4F(opcode, dst, src, vtmp1); 1938 break; 1939 case 8: 1940 reduce8F(opcode, dst, src, vtmp1, vtmp2); 1941 break; 1942 case 16: 1943 reduce16F(opcode, dst, src, vtmp1, vtmp2); 1944 break; 1945 default: assert(false, "wrong vector length"); 1946 } 1947 } 1948 1949 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 1950 switch (vlen) { 1951 case 2: 1952 assert(vtmp2 == xnoreg, ""); 1953 reduce2D(opcode, dst, src, vtmp1); 1954 break; 1955 case 4: 1956 reduce4D(opcode, dst, src, vtmp1, vtmp2); 1957 break; 1958 case 8: 1959 reduce8D(opcode, dst, src, vtmp1, vtmp2); 1960 break; 1961 default: assert(false, "wrong vector length"); 1962 } 1963 } 1964 1965 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1966 if (opcode == Op_AddReductionVI) { 1967 if (vtmp1 != src2) { 1968 movdqu(vtmp1, src2); 1969 } 1970 phaddd(vtmp1, vtmp1); 1971 } else { 1972 pshufd(vtmp1, src2, 0x1); 1973 reduce_operation_128(T_INT, opcode, vtmp1, src2); 1974 } 1975 movdl(vtmp2, src1); 1976 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 1977 movdl(dst, vtmp1); 1978 } 1979 1980 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1981 if (opcode == Op_AddReductionVI) { 1982 if (vtmp1 != src2) { 1983 movdqu(vtmp1, src2); 1984 } 1985 phaddd(vtmp1, src2); 1986 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1987 } else { 1988 pshufd(vtmp2, src2, 0xE); 1989 reduce_operation_128(T_INT, opcode, vtmp2, src2); 1990 reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1991 } 1992 } 1993 1994 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1995 if (opcode == Op_AddReductionVI) { 1996 vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit); 1997 vextracti128_high(vtmp2, vtmp1); 1998 vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit); 1999 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2000 } else { 2001 vextracti128_high(vtmp1, src2); 2002 reduce_operation_128(T_INT, opcode, vtmp1, src2); 2003 reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2004 } 2005 } 2006 2007 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2008 vextracti64x4_high(vtmp2, src2); 2009 reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2); 2010 reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2011 } 2012 2013 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2014 pshufd(vtmp2, src2, 0x1); 2015 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2016 movdqu(vtmp1, vtmp2); 2017 psrldq(vtmp1, 2); 2018 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2019 movdqu(vtmp2, vtmp1); 2020 psrldq(vtmp2, 1); 2021 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2022 movdl(vtmp2, src1); 2023 pmovsxbd(vtmp1, vtmp1); 2024 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2025 pextrb(dst, vtmp1, 0x0); 2026 movsbl(dst, dst); 2027 } 2028 2029 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2030 
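  // Fold the upper 8 bytes of src2 onto the lower 8 (pshufd 0xE moves dwords
  // 2-3 into the low half), apply the byte-wise op, then finish with the
  // 8-byte reduction below.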
pshufd(vtmp1, src2, 0xE); 2031 reduce_operation_128(T_BYTE, opcode, vtmp1, src2); 2032 reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2033 } 2034 2035 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2036 vextracti128_high(vtmp2, src2); 2037 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2038 reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2039 } 2040 2041 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2042 vextracti64x4_high(vtmp1, src2); 2043 reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2); 2044 reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2045 } 2046 2047 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2048 pmovsxbw(vtmp2, src2); 2049 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2050 } 2051 2052 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2053 if (UseAVX > 1) { 2054 int vector_len = Assembler::AVX_256bit; 2055 vpmovsxbw(vtmp1, src2, vector_len); 2056 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2057 } else { 2058 pmovsxbw(vtmp2, src2); 2059 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2060 pshufd(vtmp2, src2, 0x1); 2061 pmovsxbw(vtmp2, src2); 2062 reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2063 } 2064 } 2065 2066 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2067 if (UseAVX > 2 && VM_Version::supports_avx512bw()) { 2068 int vector_len = Assembler::AVX_512bit; 2069 vpmovsxbw(vtmp1, src2, vector_len); 2070 reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2071 } else { 2072 assert(UseAVX >= 2,"Should not reach here."); 2073 mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); 2074 vextracti128_high(vtmp2, src2); 2075 mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2076 } 2077 } 2078 2079 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2080 mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); 2081 vextracti64x4_high(vtmp2, src2); 2082 mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2083 } 2084 2085 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2086 if (opcode == Op_AddReductionVI) { 2087 if (vtmp1 != src2) { 2088 movdqu(vtmp1, src2); 2089 } 2090 phaddw(vtmp1, vtmp1); 2091 phaddw(vtmp1, vtmp1); 2092 } else { 2093 pshufd(vtmp2, src2, 0x1); 2094 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2095 movdqu(vtmp1, vtmp2); 2096 psrldq(vtmp1, 2); 2097 reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2); 2098 } 2099 movdl(vtmp2, src1); 2100 pmovsxwd(vtmp1, vtmp1); 2101 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2102 pextrw(dst, vtmp1, 0x0); 2103 movswl(dst, dst); 2104 } 2105 2106 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2107 if (opcode == Op_AddReductionVI) { 2108 if (vtmp1 != src2) { 2109 movdqu(vtmp1, src2); 2110 } 2111 phaddw(vtmp1, src2); 2112 } else { 2113 pshufd(vtmp1, src2, 0xE); 2114 reduce_operation_128(T_SHORT, opcode, vtmp1, src2); 2115 } 2116 reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2117 } 2118 2119 void 
C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2120 if (opcode == Op_AddReductionVI) { 2121 int vector_len = Assembler::AVX_256bit; 2122 vphaddw(vtmp2, src2, src2, vector_len); 2123 vpermq(vtmp2, vtmp2, 0xD8, vector_len); 2124 } else { 2125 vextracti128_high(vtmp2, src2); 2126 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2127 } 2128 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2129 } 2130 2131 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2132 int vector_len = Assembler::AVX_256bit; 2133 vextracti64x4_high(vtmp1, src2); 2134 reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2); 2135 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2136 } 2137 2138 #ifdef _LP64 2139 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2140 pshufd(vtmp2, src2, 0xE); 2141 reduce_operation_128(T_LONG, opcode, vtmp2, src2); 2142 movdq(vtmp1, src1); 2143 reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2); 2144 movdq(dst, vtmp1); 2145 } 2146 2147 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2148 vextracti128_high(vtmp1, src2); 2149 reduce_operation_128(T_LONG, opcode, vtmp1, src2); 2150 reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2151 } 2152 2153 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2154 vextracti64x4_high(vtmp2, src2); 2155 reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2); 2156 reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2157 } 2158 2159 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) { 2160 mov64(temp, -1L); 2161 bzhiq(temp, temp, len); 2162 kmovql(dst, temp); 2163 } 2164 #endif // _LP64 2165 2166 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2167 reduce_operation_128(T_FLOAT, opcode, dst, src); 2168 pshufd(vtmp, src, 0x1); 2169 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2170 } 2171 2172 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2173 reduce2F(opcode, dst, src, vtmp); 2174 pshufd(vtmp, src, 0x2); 2175 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2176 pshufd(vtmp, src, 0x3); 2177 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2178 } 2179 2180 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2181 reduce4F(opcode, dst, src, vtmp2); 2182 vextractf128_high(vtmp2, src); 2183 reduce4F(opcode, dst, vtmp2, vtmp1); 2184 } 2185 2186 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2187 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2188 vextracti64x4_high(vtmp1, src); 2189 reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2); 2190 } 2191 2192 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2193 reduce_operation_128(T_DOUBLE, opcode, dst, src); 2194 pshufd(vtmp, src, 0xE); 2195 reduce_operation_128(T_DOUBLE, opcode, dst, vtmp); 2196 } 2197 2198 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2199 reduce2D(opcode, dst, src, vtmp2); 2200 vextractf128_high(vtmp2, src); 2201 
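  // vtmp2 now holds the upper 128-bit lane (elements 2-3) of src; fold it into
  // the running result with a second 2-element reduction.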
reduce2D(opcode, dst, vtmp2, vtmp1); 2202 } 2203 2204 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2205 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2206 vextracti64x4_high(vtmp1, src); 2207 reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2); 2208 } 2209 2210 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) { 2211 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2212 } 2213 2214 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) { 2215 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2216 } 2217 2218 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask, 2219 int vec_enc) { 2220 switch(elem_bt) { 2221 case T_INT: 2222 case T_FLOAT: 2223 vmaskmovps(dst, src, mask, vec_enc); 2224 break; 2225 case T_LONG: 2226 case T_DOUBLE: 2227 vmaskmovpd(dst, src, mask, vec_enc); 2228 break; 2229 default: 2230 fatal("Unsupported type %s", type2name(elem_bt)); 2231 break; 2232 } 2233 } 2234 2235 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask, 2236 int vec_enc) { 2237 switch(elem_bt) { 2238 case T_INT: 2239 case T_FLOAT: 2240 vmaskmovps(dst, src, mask, vec_enc); 2241 break; 2242 case T_LONG: 2243 case T_DOUBLE: 2244 vmaskmovpd(dst, src, mask, vec_enc); 2245 break; 2246 default: 2247 fatal("Unsupported type %s", type2name(elem_bt)); 2248 break; 2249 } 2250 } 2251 2252 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid, 2253 XMMRegister dst, XMMRegister src, 2254 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2255 XMMRegister xmm_0, XMMRegister xmm_1) { 2256 const int permconst[] = {1, 14}; 2257 XMMRegister wsrc = src; 2258 XMMRegister wdst = xmm_0; 2259 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1; 2260 2261 int vlen_enc = Assembler::AVX_128bit; 2262 if (vlen == 16) { 2263 vlen_enc = Assembler::AVX_256bit; 2264 } 2265 2266 for (int i = log2(vlen) - 1; i >=0; i--) { 2267 if (i == 0 && !is_dst_valid) { 2268 wdst = dst; 2269 } 2270 if (i == 3) { 2271 vextracti64x4_high(wtmp, wsrc); 2272 } else if (i == 2) { 2273 vextracti128_high(wtmp, wsrc); 2274 } else { // i = [0,1] 2275 vpermilps(wtmp, wsrc, permconst[i], vlen_enc); 2276 } 2277 vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2278 wsrc = wdst; 2279 vlen_enc = Assembler::AVX_128bit; 2280 } 2281 if (is_dst_valid) { 2282 vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2283 } 2284 } 2285 2286 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src, 2287 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2288 XMMRegister xmm_0, XMMRegister xmm_1) { 2289 XMMRegister wsrc = src; 2290 XMMRegister wdst = xmm_0; 2291 XMMRegister wtmp = (xmm_1 == xnoreg) ? 
xmm_0: xmm_1; 2292 int vlen_enc = Assembler::AVX_128bit; 2293 if (vlen == 8) { 2294 vlen_enc = Assembler::AVX_256bit; 2295 } 2296 for (int i = log2(vlen) - 1; i >=0; i--) { 2297 if (i == 0 && !is_dst_valid) { 2298 wdst = dst; 2299 } 2300 if (i == 1) { 2301 vextracti128_high(wtmp, wsrc); 2302 } else if (i == 2) { 2303 vextracti64x4_high(wtmp, wsrc); 2304 } else { 2305 assert(i == 0, "%d", i); 2306 vpermilpd(wtmp, wsrc, 1, vlen_enc); 2307 } 2308 vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2309 wsrc = wdst; 2310 vlen_enc = Assembler::AVX_128bit; 2311 } 2312 if (is_dst_valid) { 2313 vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2314 } 2315 } 2316 2317 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) { 2318 switch (bt) { 2319 case T_BYTE: pextrb(dst, src, idx); break; 2320 case T_SHORT: pextrw(dst, src, idx); break; 2321 case T_INT: pextrd(dst, src, idx); break; 2322 case T_LONG: pextrq(dst, src, idx); break; 2323 2324 default: 2325 assert(false,"Should not reach here."); 2326 break; 2327 } 2328 } 2329 2330 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) { 2331 int esize = type2aelembytes(typ); 2332 int elem_per_lane = 16/esize; 2333 int lane = elemindex / elem_per_lane; 2334 int eindex = elemindex % elem_per_lane; 2335 2336 if (lane >= 2) { 2337 assert(UseAVX > 2, "required"); 2338 vextractf32x4(dst, src, lane & 3); 2339 return dst; 2340 } else if (lane > 0) { 2341 assert(UseAVX > 0, "required"); 2342 vextractf128(dst, src, lane); 2343 return dst; 2344 } else { 2345 return src; 2346 } 2347 } 2348 2349 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) { 2350 if (typ == T_BYTE) { 2351 movsbl(dst, dst); 2352 } else if (typ == T_SHORT) { 2353 movswl(dst, dst); 2354 } 2355 } 2356 2357 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) { 2358 int esize = type2aelembytes(typ); 2359 int elem_per_lane = 16/esize; 2360 int eindex = elemindex % elem_per_lane; 2361 assert(is_integral_type(typ),"required"); 2362 2363 if (eindex == 0) { 2364 if (typ == T_LONG) { 2365 movq(dst, src); 2366 } else { 2367 movdl(dst, src); 2368 movsxl(typ, dst); 2369 } 2370 } else { 2371 extract(typ, dst, src, eindex); 2372 movsxl(typ, dst); 2373 } 2374 } 2375 2376 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) { 2377 int esize = type2aelembytes(typ); 2378 int elem_per_lane = 16/esize; 2379 int eindex = elemindex % elem_per_lane; 2380 assert((typ == T_FLOAT || typ == T_DOUBLE),"required"); 2381 2382 if (eindex == 0) { 2383 movq(dst, src); 2384 } else { 2385 if (typ == T_FLOAT) { 2386 if (UseAVX == 0) { 2387 movdqu(dst, src); 2388 shufps(dst, dst, eindex); 2389 } else { 2390 vshufps(dst, src, src, eindex, Assembler::AVX_128bit); 2391 } 2392 } else { 2393 if (UseAVX == 0) { 2394 movdqu(dst, src); 2395 psrldq(dst, eindex*esize); 2396 } else { 2397 vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit); 2398 } 2399 movq(dst, dst); 2400 } 2401 } 2402 // Zero upper bits 2403 if (typ == T_FLOAT) { 2404 if (UseAVX == 0) { 2405 assert(vtmp != xnoreg, "required."); 2406 movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg); 2407 pand(dst, vtmp); 2408 } else { 2409 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg); 2410 } 2411 } 2412 } 2413 2414 void 
C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) { 2415 switch(typ) { 2416 case T_BYTE: 2417 case T_BOOLEAN: 2418 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2419 break; 2420 case T_SHORT: 2421 case T_CHAR: 2422 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2423 break; 2424 case T_INT: 2425 case T_FLOAT: 2426 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2427 break; 2428 case T_LONG: 2429 case T_DOUBLE: 2430 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2431 break; 2432 default: 2433 assert(false,"Should not reach here."); 2434 break; 2435 } 2436 } 2437 2438 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) { 2439 assert(rscratch != noreg || always_reachable(src2), "missing"); 2440 2441 switch(typ) { 2442 case T_BOOLEAN: 2443 case T_BYTE: 2444 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2445 break; 2446 case T_CHAR: 2447 case T_SHORT: 2448 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2449 break; 2450 case T_INT: 2451 case T_FLOAT: 2452 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2453 break; 2454 case T_LONG: 2455 case T_DOUBLE: 2456 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2457 break; 2458 default: 2459 assert(false,"Should not reach here."); 2460 break; 2461 } 2462 } 2463 2464 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) { 2465 switch(typ) { 2466 case T_BYTE: 2467 evpblendmb(dst, kmask, src1, src2, merge, vector_len); 2468 break; 2469 case T_SHORT: 2470 evpblendmw(dst, kmask, src1, src2, merge, vector_len); 2471 break; 2472 case T_INT: 2473 case T_FLOAT: 2474 evpblendmd(dst, kmask, src1, src2, merge, vector_len); 2475 break; 2476 case T_LONG: 2477 case T_DOUBLE: 2478 evpblendmq(dst, kmask, src1, src2, merge, vector_len); 2479 break; 2480 default: 2481 assert(false,"Should not reach here."); 2482 break; 2483 } 2484 } 2485 2486 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) { 2487 assert(vlen_in_bytes <= 32, ""); 2488 int esize = type2aelembytes(bt); 2489 if (vlen_in_bytes == 32) { 2490 assert(vtmp == xnoreg, "required."); 2491 if (esize >= 4) { 2492 vtestps(src1, src2, AVX_256bit); 2493 } else { 2494 vptest(src1, src2, AVX_256bit); 2495 } 2496 return; 2497 } 2498 if (vlen_in_bytes < 16) { 2499 // Duplicate the lower part to fill the whole register, 2500 // Don't need to do so for src2 2501 assert(vtmp != xnoreg, "required"); 2502 int shuffle_imm = (vlen_in_bytes == 4) ? 
0x00 : 0x04; 2503 pshufd(vtmp, src1, shuffle_imm); 2504 } else { 2505 assert(vtmp == xnoreg, "required"); 2506 vtmp = src1; 2507 } 2508 if (esize >= 4 && VM_Version::supports_avx()) { 2509 vtestps(vtmp, src2, AVX_128bit); 2510 } else { 2511 ptest(vtmp, src2); 2512 } 2513 } 2514 2515 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { 2516 assert(UseAVX >= 2, "required"); 2517 #ifdef ASSERT 2518 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2519 bool is_bw_supported = VM_Version::supports_avx512bw(); 2520 if (is_bw && !is_bw_supported) { 2521 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2522 assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16), 2523 "XMM register should be 0-15"); 2524 } 2525 #endif // ASSERT 2526 switch (elem_bt) { 2527 case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return; 2528 case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return; 2529 case T_INT: vpaddd(dst, src1, src2, vlen_enc); return; 2530 case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return; 2531 case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return; 2532 case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return; 2533 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2534 } 2535 } 2536 2537 #ifdef _LP64 2538 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) { 2539 assert(UseAVX >= 2, "required"); 2540 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2541 bool is_vl = vlen_enc != Assembler::AVX_512bit; 2542 if ((UseAVX > 2) && 2543 (!is_bw || VM_Version::supports_avx512bw()) && 2544 (!is_vl || VM_Version::supports_avx512vl())) { 2545 switch (elem_bt) { 2546 case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return; 2547 case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return; 2548 case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return; 2549 case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return; 2550 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2551 } 2552 } else { 2553 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2554 assert((dst->encoding() < 16),"XMM register should be 0-15"); 2555 switch (elem_bt) { 2556 case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return; 2557 case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return; 2558 case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return; 2559 case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return; 2560 case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return; 2561 case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return; 2562 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2563 } 2564 } 2565 } 2566 #endif 2567 2568 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) { 2569 switch (to_elem_bt) { 2570 case T_SHORT: 2571 vpmovsxbw(dst, src, vlen_enc); 2572 break; 2573 case T_INT: 2574 vpmovsxbd(dst, src, vlen_enc); 2575 break; 2576 case T_FLOAT: 2577 vpmovsxbd(dst, src, vlen_enc); 2578 vcvtdq2ps(dst, dst, vlen_enc); 2579 break; 2580 case T_LONG: 2581 vpmovsxbq(dst, src, vlen_enc); 2582 break; 2583 case T_DOUBLE: { 2584 int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? 
Assembler::AVX_256bit : Assembler::AVX_128bit; 2585 vpmovsxbd(dst, src, mid_vlen_enc); 2586 vcvtdq2pd(dst, dst, vlen_enc); 2587 break; 2588 } 2589 default: 2590 fatal("Unsupported type %s", type2name(to_elem_bt)); 2591 break; 2592 } 2593 } 2594 2595 //------------------------------------------------------------------------------------------- 2596 2597 // IndexOf for constant substrings with size >= 8 chars 2598 // which don't need to be loaded through stack. 2599 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2, 2600 Register cnt1, Register cnt2, 2601 int int_cnt2, Register result, 2602 XMMRegister vec, Register tmp, 2603 int ae) { 2604 ShortBranchVerifier sbv(this); 2605 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2606 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2607 2608 // This method uses the pcmpestri instruction with bound registers 2609 // inputs: 2610 // xmm - substring 2611 // rax - substring length (elements count) 2612 // mem - scanned string 2613 // rdx - string length (elements count) 2614 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2615 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2616 // outputs: 2617 // rcx - matched index in string 2618 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2619 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2620 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2621 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2622 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1; 2623 2624 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, 2625 RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR, 2626 MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE; 2627 2628 // Note, inline_string_indexOf() generates checks: 2629 // if (substr.count > string.count) return -1; 2630 // if (substr.count == 0) return 0; 2631 assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars"); 2632 2633 // Load substring. 2634 if (ae == StrIntrinsicNode::UL) { 2635 pmovzxbw(vec, Address(str2, 0)); 2636 } else { 2637 movdqu(vec, Address(str2, 0)); 2638 } 2639 movl(cnt2, int_cnt2); 2640 movptr(result, str1); // string addr 2641 2642 if (int_cnt2 > stride) { 2643 jmpb(SCAN_TO_SUBSTR); 2644 2645 // Reload substr for rescan, this code 2646 // is executed only for large substrings (> 8 chars) 2647 bind(RELOAD_SUBSTR); 2648 if (ae == StrIntrinsicNode::UL) { 2649 pmovzxbw(vec, Address(str2, 0)); 2650 } else { 2651 movdqu(vec, Address(str2, 0)); 2652 } 2653 negptr(cnt2); // Jumped here with negative cnt2, convert to positive 2654 2655 bind(RELOAD_STR); 2656 // We came here after the beginning of the substring was 2657 // matched but the rest of it was not so we need to search 2658 // again. Start from the next element after the previous match. 2659 2660 // cnt2 is number of substring reminding elements and 2661 // cnt1 is number of string reminding elements when cmp failed. 
    // Restored cnt1 = cnt1 - cnt2 + int_cnt2
    subl(cnt1, cnt2);
    addl(cnt1, int_cnt2);
    movl(cnt2, int_cnt2); // Now restore cnt2

    decrementl(cnt1);     // Shift to next element
    cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND); // Left less than substring

    addptr(result, (1<<scale1));

  } // (int_cnt2 > 8)

  // Scan string for start of substr in 16-byte vectors
  bind(SCAN_TO_SUBSTR);
  pcmpestri(vec, Address(result, 0), mode);
  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
  subl(cnt1, stride);
  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
  cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
  addptr(result, 16);
  jmpb(SCAN_TO_SUBSTR);

  // Found a potential substr
  bind(FOUND_CANDIDATE);
  // Matched whole vector if first element matched (tmp(rcx) == 0).
  if (int_cnt2 == stride) {
    jccb(Assembler::overflow, RET_FOUND);    // OF == 1
  } else { // int_cnt2 > 8
    jccb(Assembler::overflow, FOUND_SUBSTR);
  }
  // After pcmpestri tmp(rcx) contains matched element index
  // Compute start addr of substr
  lea(result, Address(result, tmp, scale1));

  // Make sure string is still long enough
  subl(cnt1, tmp);
  cmpl(cnt1, cnt2);
  if (int_cnt2 == stride) {
    jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
  } else { // int_cnt2 > 8
    jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
  }
  // Left less than substring.

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmp(EXIT);

  if (int_cnt2 > stride) {
    // This code is optimized for the case when the whole substring
    // is matched if its head is matched.
    bind(MATCH_SUBSTR_HEAD);
    pcmpestri(vec, Address(result, 0), mode);
    // Reload only the string if it does not match
    jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0

    Label CONT_SCAN_SUBSTR;
    // Compare the rest of substring (> 8 chars).
    bind(FOUND_SUBSTR);
    // First 8 chars are already matched.
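    // cnt2 is turned into a negative element count (cnt2 = stride - int_cnt2);
    // the SCAN_SUBSTR loop below advances it by 'stride' per iteration and keeps
    // looping while it is still negative, so the indexed loads walk the substring
    // tail without ever reading past its end.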
2724 negptr(cnt2); 2725 addptr(cnt2, stride); 2726 2727 bind(SCAN_SUBSTR); 2728 subl(cnt1, stride); 2729 cmpl(cnt2, -stride); // Do not read beyond substring 2730 jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR); 2731 // Back-up strings to avoid reading beyond substring: 2732 // cnt1 = cnt1 - cnt2 + 8 2733 addl(cnt1, cnt2); // cnt2 is negative 2734 addl(cnt1, stride); 2735 movl(cnt2, stride); negptr(cnt2); 2736 bind(CONT_SCAN_SUBSTR); 2737 if (int_cnt2 < (int)G) { 2738 int tail_off1 = int_cnt2<<scale1; 2739 int tail_off2 = int_cnt2<<scale2; 2740 if (ae == StrIntrinsicNode::UL) { 2741 pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2)); 2742 } else { 2743 movdqu(vec, Address(str2, cnt2, scale2, tail_off2)); 2744 } 2745 pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode); 2746 } else { 2747 // calculate index in register to avoid integer overflow (int_cnt2*2) 2748 movl(tmp, int_cnt2); 2749 addptr(tmp, cnt2); 2750 if (ae == StrIntrinsicNode::UL) { 2751 pmovzxbw(vec, Address(str2, tmp, scale2, 0)); 2752 } else { 2753 movdqu(vec, Address(str2, tmp, scale2, 0)); 2754 } 2755 pcmpestri(vec, Address(result, tmp, scale1, 0), mode); 2756 } 2757 // Need to reload strings pointers if not matched whole vector 2758 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 2759 addptr(cnt2, stride); 2760 jcc(Assembler::negative, SCAN_SUBSTR); 2761 // Fall through if found full substring 2762 2763 } // (int_cnt2 > 8) 2764 2765 bind(RET_FOUND); 2766 // Found result if we matched full small substring. 2767 // Compute substr offset 2768 subptr(result, str1); 2769 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 2770 shrl(result, 1); // index 2771 } 2772 bind(EXIT); 2773 2774 } // string_indexofC8 2775 2776 // Small strings are loaded through stack if they cross page boundary. 2777 void C2_MacroAssembler::string_indexof(Register str1, Register str2, 2778 Register cnt1, Register cnt2, 2779 int int_cnt2, Register result, 2780 XMMRegister vec, Register tmp, 2781 int ae) { 2782 ShortBranchVerifier sbv(this); 2783 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2784 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2785 2786 // 2787 // int_cnt2 is length of small (< 8 chars) constant substring 2788 // or (-1) for non constant substring in which case its length 2789 // is in cnt2 register. 2790 // 2791 // Note, inline_string_indexOf() generates checks: 2792 // if (substr.count > string.count) return -1; 2793 // if (substr.count == 0) return 0; 2794 // 2795 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2796 assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0"); 2797 // This method uses the pcmpestri instruction with bound registers 2798 // inputs: 2799 // xmm - substring 2800 // rax - substring length (elements count) 2801 // mem - scanned string 2802 // rdx - string length (elements count) 2803 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2804 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2805 // outputs: 2806 // rcx - matched index in string 2807 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2808 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2809 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2810 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? 
Address::times_1 : scale1; 2811 2812 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR, 2813 RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR, 2814 FOUND_CANDIDATE; 2815 2816 { //======================================================== 2817 // We don't know where these strings are located 2818 // and we can't read beyond them. Load them through stack. 2819 Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR; 2820 2821 movptr(tmp, rsp); // save old SP 2822 2823 if (int_cnt2 > 0) { // small (< 8 chars) constant substring 2824 if (int_cnt2 == (1>>scale2)) { // One byte 2825 assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding"); 2826 load_unsigned_byte(result, Address(str2, 0)); 2827 movdl(vec, result); // move 32 bits 2828 } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes 2829 // Not enough header space in 32-bit VM: 12+3 = 15. 2830 movl(result, Address(str2, -1)); 2831 shrl(result, 8); 2832 movdl(vec, result); // move 32 bits 2833 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char 2834 load_unsigned_short(result, Address(str2, 0)); 2835 movdl(vec, result); // move 32 bits 2836 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars 2837 movdl(vec, Address(str2, 0)); // move 32 bits 2838 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars 2839 movq(vec, Address(str2, 0)); // move 64 bits 2840 } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7}) 2841 // Array header size is 12 bytes in 32-bit VM 2842 // + 6 bytes for 3 chars == 18 bytes, 2843 // enough space to load vec and shift. 2844 assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity"); 2845 if (ae == StrIntrinsicNode::UL) { 2846 int tail_off = int_cnt2-8; 2847 pmovzxbw(vec, Address(str2, tail_off)); 2848 psrldq(vec, -2*tail_off); 2849 } 2850 else { 2851 int tail_off = int_cnt2*(1<<scale2); 2852 movdqu(vec, Address(str2, tail_off-16)); 2853 psrldq(vec, 16-tail_off); 2854 } 2855 } 2856 } else { // not constant substring 2857 cmpl(cnt2, stride); 2858 jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough 2859 2860 // We can read beyond string if srt+16 does not cross page boundary 2861 // since heaps are aligned and mapped by pages. 2862 assert(os::vm_page_size() < (int)G, "default page should be small"); 2863 movl(result, str2); // We need only low 32 bits 2864 andl(result, ((int)os::vm_page_size()-1)); 2865 cmpl(result, ((int)os::vm_page_size()-16)); 2866 jccb(Assembler::belowEqual, CHECK_STR); 2867 2868 // Move small strings to stack to allow load 16 bytes into vec. 2869 subptr(rsp, 16); 2870 int stk_offset = wordSize-(1<<scale2); 2871 push(cnt2); 2872 2873 bind(COPY_SUBSTR); 2874 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) { 2875 load_unsigned_byte(result, Address(str2, cnt2, scale2, -1)); 2876 movb(Address(rsp, cnt2, scale2, stk_offset), result); 2877 } else if (ae == StrIntrinsicNode::UU) { 2878 load_unsigned_short(result, Address(str2, cnt2, scale2, -2)); 2879 movw(Address(rsp, cnt2, scale2, stk_offset), result); 2880 } 2881 decrement(cnt2); 2882 jccb(Assembler::notZero, COPY_SUBSTR); 2883 2884 pop(cnt2); 2885 movptr(str2, rsp); // New substring address 2886 } // non constant 2887 2888 bind(CHECK_STR); 2889 cmpl(cnt1, stride); 2890 jccb(Assembler::aboveEqual, BIG_STRINGS); 2891 2892 // Check cross page boundary. 
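    // A 16-byte load from str1 stays within one page iff
    // (str1 & (page_size - 1)) <= page_size - 16; only the low 32 bits of the
    // address matter for that test.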
2893 movl(result, str1); // We need only low 32 bits 2894 andl(result, ((int)os::vm_page_size()-1)); 2895 cmpl(result, ((int)os::vm_page_size()-16)); 2896 jccb(Assembler::belowEqual, BIG_STRINGS); 2897 2898 subptr(rsp, 16); 2899 int stk_offset = -(1<<scale1); 2900 if (int_cnt2 < 0) { // not constant 2901 push(cnt2); 2902 stk_offset += wordSize; 2903 } 2904 movl(cnt2, cnt1); 2905 2906 bind(COPY_STR); 2907 if (ae == StrIntrinsicNode::LL) { 2908 load_unsigned_byte(result, Address(str1, cnt2, scale1, -1)); 2909 movb(Address(rsp, cnt2, scale1, stk_offset), result); 2910 } else { 2911 load_unsigned_short(result, Address(str1, cnt2, scale1, -2)); 2912 movw(Address(rsp, cnt2, scale1, stk_offset), result); 2913 } 2914 decrement(cnt2); 2915 jccb(Assembler::notZero, COPY_STR); 2916 2917 if (int_cnt2 < 0) { // not constant 2918 pop(cnt2); 2919 } 2920 movptr(str1, rsp); // New string address 2921 2922 bind(BIG_STRINGS); 2923 // Load substring. 2924 if (int_cnt2 < 0) { // -1 2925 if (ae == StrIntrinsicNode::UL) { 2926 pmovzxbw(vec, Address(str2, 0)); 2927 } else { 2928 movdqu(vec, Address(str2, 0)); 2929 } 2930 push(cnt2); // substr count 2931 push(str2); // substr addr 2932 push(str1); // string addr 2933 } else { 2934 // Small (< 8 chars) constant substrings are loaded already. 2935 movl(cnt2, int_cnt2); 2936 } 2937 push(tmp); // original SP 2938 2939 } // Finished loading 2940 2941 //======================================================== 2942 // Start search 2943 // 2944 2945 movptr(result, str1); // string addr 2946 2947 if (int_cnt2 < 0) { // Only for non constant substring 2948 jmpb(SCAN_TO_SUBSTR); 2949 2950 // SP saved at sp+0 2951 // String saved at sp+1*wordSize 2952 // Substr saved at sp+2*wordSize 2953 // Substr count saved at sp+3*wordSize 2954 2955 // Reload substr for rescan, this code 2956 // is executed only for large substrings (> 8 chars) 2957 bind(RELOAD_SUBSTR); 2958 movptr(str2, Address(rsp, 2*wordSize)); 2959 movl(cnt2, Address(rsp, 3*wordSize)); 2960 if (ae == StrIntrinsicNode::UL) { 2961 pmovzxbw(vec, Address(str2, 0)); 2962 } else { 2963 movdqu(vec, Address(str2, 0)); 2964 } 2965 // We came here after the beginning of the substring was 2966 // matched but the rest of it was not so we need to search 2967 // again. Start from the next element after the previous match. 2968 subptr(str1, result); // Restore counter 2969 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 2970 shrl(str1, 1); 2971 } 2972 addl(cnt1, str1); 2973 decrementl(cnt1); // Shift to next element 2974 cmpl(cnt1, cnt2); 2975 jcc(Assembler::negative, RET_NOT_FOUND); // Left less then substring 2976 2977 addptr(result, (1<<scale1)); 2978 } // non constant 2979 2980 // Scan string for start of substr in 16-byte vectors 2981 bind(SCAN_TO_SUBSTR); 2982 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2983 pcmpestri(vec, Address(result, 0), mode); 2984 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 2985 subl(cnt1, stride); 2986 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 2987 cmpl(cnt1, cnt2); 2988 jccb(Assembler::negative, RET_NOT_FOUND); // Left less then substring 2989 addptr(result, 16); 2990 2991 bind(ADJUST_STR); 2992 cmpl(cnt1, stride); // Do not read beyond string 2993 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 2994 // Back-up string to avoid reading beyond string. 
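  // Rewind 'result' so that the final 16-byte chunk ends exactly at the last
  // string element (result += cnt1*scale1 - 16) and rescan it as a full stride;
  // this keeps the pcmpestri load inside the string.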
  lea(result, Address(result, cnt1, scale1, -16));
  movl(cnt1, stride);
  jmpb(SCAN_TO_SUBSTR);

  // Found a potential substr
  bind(FOUND_CANDIDATE);
  // After pcmpestri tmp(rcx) contains matched element index

  // Make sure string is still long enough
  subl(cnt1, tmp);
  cmpl(cnt1, cnt2);
  jccb(Assembler::greaterEqual, FOUND_SUBSTR);
  // Left less than substring.

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmp(CLEANUP);

  bind(FOUND_SUBSTR);
  // Compute start addr of substr
  lea(result, Address(result, tmp, scale1));
  if (int_cnt2 > 0) { // Constant substring
    // Repeat search for small substring (< 8 chars)
    // from new point without reloading substring.
    // Have to check that we don't read beyond string.
    cmpl(tmp, stride-int_cnt2);
    jccb(Assembler::greater, ADJUST_STR);
    // Fall through if matched whole substring.
  } else { // non constant
    assert(int_cnt2 == -1, "should be != 0");

    addl(tmp, cnt2);
    // Found result if we matched whole substring.
    cmpl(tmp, stride);
    jcc(Assembler::lessEqual, RET_FOUND);

    // Repeat search for small substring (<= 8 chars)
    // from new point 'str1' without reloading substring.
    cmpl(cnt2, stride);
    // Have to check that we don't read beyond string.
    jccb(Assembler::lessEqual, ADJUST_STR);

    Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
    // Compare the rest of substring (> 8 chars).
    movptr(str1, result);

    cmpl(tmp, cnt2);
    // First 8 chars are already matched.
    jccb(Assembler::equal, CHECK_NEXT);

    bind(SCAN_SUBSTR);
    pcmpestri(vec, Address(str1, 0), mode);
    // Need to reload string pointers if we did not match the whole vector
    jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0

    bind(CHECK_NEXT);
    subl(cnt2, stride);
    jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
    addptr(str1, 16);
    if (ae == StrIntrinsicNode::UL) {
      addptr(str2, 8);
    } else {
      addptr(str2, 16);
    }
    subl(cnt1, stride);
    cmpl(cnt2, stride); // Do not read beyond substring
    jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
    // Back-up strings to avoid reading beyond substring.
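    // Fewer than 'stride' substring elements remain: move str2 (and str1 by the
    // matching amount) back so that one final full-width load ends exactly at the
    // substring's last element, rebias cnt1 by (stride - cnt2), and reset cnt2 to
    // a full stride for the closing compare.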
3063 3064 if (ae == StrIntrinsicNode::UL) { 3065 lea(str2, Address(str2, cnt2, scale2, -8)); 3066 lea(str1, Address(str1, cnt2, scale1, -16)); 3067 } else { 3068 lea(str2, Address(str2, cnt2, scale2, -16)); 3069 lea(str1, Address(str1, cnt2, scale1, -16)); 3070 } 3071 subl(cnt1, cnt2); 3072 movl(cnt2, stride); 3073 addl(cnt1, stride); 3074 bind(CONT_SCAN_SUBSTR); 3075 if (ae == StrIntrinsicNode::UL) { 3076 pmovzxbw(vec, Address(str2, 0)); 3077 } else { 3078 movdqu(vec, Address(str2, 0)); 3079 } 3080 jmp(SCAN_SUBSTR); 3081 3082 bind(RET_FOUND_LONG); 3083 movptr(str1, Address(rsp, wordSize)); 3084 } // non constant 3085 3086 bind(RET_FOUND); 3087 // Compute substr offset 3088 subptr(result, str1); 3089 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3090 shrl(result, 1); // index 3091 } 3092 bind(CLEANUP); 3093 pop(rsp); // restore SP 3094 3095 } // string_indexof 3096 3097 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3098 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3099 ShortBranchVerifier sbv(this); 3100 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3101 3102 int stride = 8; 3103 3104 Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP, 3105 SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP, 3106 RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT, 3107 FOUND_SEQ_CHAR, DONE_LABEL; 3108 3109 movptr(result, str1); 3110 if (UseAVX >= 2) { 3111 cmpl(cnt1, stride); 3112 jcc(Assembler::less, SCAN_TO_CHAR); 3113 cmpl(cnt1, 2*stride); 3114 jcc(Assembler::less, SCAN_TO_8_CHAR_INIT); 3115 movdl(vec1, ch); 3116 vpbroadcastw(vec1, vec1, Assembler::AVX_256bit); 3117 vpxor(vec2, vec2); 3118 movl(tmp, cnt1); 3119 andl(tmp, 0xFFFFFFF0); //vector count (in chars) 3120 andl(cnt1,0x0000000F); //tail count (in chars) 3121 3122 bind(SCAN_TO_16_CHAR_LOOP); 3123 vmovdqu(vec3, Address(result, 0)); 3124 vpcmpeqw(vec3, vec3, vec1, 1); 3125 vptest(vec2, vec3); 3126 jcc(Assembler::carryClear, FOUND_CHAR); 3127 addptr(result, 32); 3128 subl(tmp, 2*stride); 3129 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); 3130 jmp(SCAN_TO_8_CHAR); 3131 bind(SCAN_TO_8_CHAR_INIT); 3132 movdl(vec1, ch); 3133 pshuflw(vec1, vec1, 0x00); 3134 pshufd(vec1, vec1, 0); 3135 pxor(vec2, vec2); 3136 } 3137 bind(SCAN_TO_8_CHAR); 3138 cmpl(cnt1, stride); 3139 jcc(Assembler::less, SCAN_TO_CHAR); 3140 if (UseAVX < 2) { 3141 movdl(vec1, ch); 3142 pshuflw(vec1, vec1, 0x00); 3143 pshufd(vec1, vec1, 0); 3144 pxor(vec2, vec2); 3145 } 3146 movl(tmp, cnt1); 3147 andl(tmp, 0xFFFFFFF8); //vector count (in chars) 3148 andl(cnt1,0x00000007); //tail count (in chars) 3149 3150 bind(SCAN_TO_8_CHAR_LOOP); 3151 movdqu(vec3, Address(result, 0)); 3152 pcmpeqw(vec3, vec1); 3153 ptest(vec2, vec3); 3154 jcc(Assembler::carryClear, FOUND_CHAR); 3155 addptr(result, 16); 3156 subl(tmp, stride); 3157 jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP); 3158 bind(SCAN_TO_CHAR); 3159 testl(cnt1, cnt1); 3160 jcc(Assembler::zero, RET_NOT_FOUND); 3161 bind(SCAN_TO_CHAR_LOOP); 3162 load_unsigned_short(tmp, Address(result, 0)); 3163 cmpl(ch, tmp); 3164 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3165 addptr(result, 2); 3166 subl(cnt1, 1); 3167 jccb(Assembler::zero, RET_NOT_FOUND); 3168 jmp(SCAN_TO_CHAR_LOOP); 3169 3170 bind(RET_NOT_FOUND); 3171 movl(result, -1); 3172 jmpb(DONE_LABEL); 3173 3174 bind(FOUND_CHAR); 3175 if (UseAVX >= 2) { 3176 vpmovmskb(tmp, vec3); 3177 } else { 3178 pmovmskb(tmp, vec3); 3179 } 3180 bsfl(ch, tmp); 3181 addptr(result, ch); 3182 3183 bind(FOUND_SEQ_CHAR); 3184 
subptr(result, str1); 3185 shrl(result, 1); 3186 3187 bind(DONE_LABEL); 3188 } // string_indexof_char 3189 3190 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3191 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3192 ShortBranchVerifier sbv(this); 3193 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3194 3195 int stride = 16; 3196 3197 Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP, 3198 SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP, 3199 RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT, 3200 FOUND_SEQ_CHAR, DONE_LABEL; 3201 3202 movptr(result, str1); 3203 if (UseAVX >= 2) { 3204 cmpl(cnt1, stride); 3205 jcc(Assembler::less, SCAN_TO_CHAR_INIT); 3206 cmpl(cnt1, stride*2); 3207 jcc(Assembler::less, SCAN_TO_16_CHAR_INIT); 3208 movdl(vec1, ch); 3209 vpbroadcastb(vec1, vec1, Assembler::AVX_256bit); 3210 vpxor(vec2, vec2); 3211 movl(tmp, cnt1); 3212 andl(tmp, 0xFFFFFFE0); //vector count (in chars) 3213 andl(cnt1,0x0000001F); //tail count (in chars) 3214 3215 bind(SCAN_TO_32_CHAR_LOOP); 3216 vmovdqu(vec3, Address(result, 0)); 3217 vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit); 3218 vptest(vec2, vec3); 3219 jcc(Assembler::carryClear, FOUND_CHAR); 3220 addptr(result, 32); 3221 subl(tmp, stride*2); 3222 jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP); 3223 jmp(SCAN_TO_16_CHAR); 3224 3225 bind(SCAN_TO_16_CHAR_INIT); 3226 movdl(vec1, ch); 3227 pxor(vec2, vec2); 3228 pshufb(vec1, vec2); 3229 } 3230 3231 bind(SCAN_TO_16_CHAR); 3232 cmpl(cnt1, stride); 3233 jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left 3234 if (UseAVX < 2) { 3235 movdl(vec1, ch); 3236 pxor(vec2, vec2); 3237 pshufb(vec1, vec2); 3238 } 3239 movl(tmp, cnt1); 3240 andl(tmp, 0xFFFFFFF0); //vector count (in bytes) 3241 andl(cnt1,0x0000000F); //tail count (in bytes) 3242 3243 bind(SCAN_TO_16_CHAR_LOOP); 3244 movdqu(vec3, Address(result, 0)); 3245 pcmpeqb(vec3, vec1); 3246 ptest(vec2, vec3); 3247 jcc(Assembler::carryClear, FOUND_CHAR); 3248 addptr(result, 16); 3249 subl(tmp, stride); 3250 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items... 
3251 3252 bind(SCAN_TO_CHAR_INIT); 3253 testl(cnt1, cnt1); 3254 jcc(Assembler::zero, RET_NOT_FOUND); 3255 bind(SCAN_TO_CHAR_LOOP); 3256 load_unsigned_byte(tmp, Address(result, 0)); 3257 cmpl(ch, tmp); 3258 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3259 addptr(result, 1); 3260 subl(cnt1, 1); 3261 jccb(Assembler::zero, RET_NOT_FOUND); 3262 jmp(SCAN_TO_CHAR_LOOP); 3263 3264 bind(RET_NOT_FOUND); 3265 movl(result, -1); 3266 jmpb(DONE_LABEL); 3267 3268 bind(FOUND_CHAR); 3269 if (UseAVX >= 2) { 3270 vpmovmskb(tmp, vec3); 3271 } else { 3272 pmovmskb(tmp, vec3); 3273 } 3274 bsfl(ch, tmp); 3275 addptr(result, ch); 3276 3277 bind(FOUND_SEQ_CHAR); 3278 subptr(result, str1); 3279 3280 bind(DONE_LABEL); 3281 } // stringL_indexof_char 3282 3283 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) { 3284 switch (eltype) { 3285 case T_BOOLEAN: return sizeof(jboolean); 3286 case T_BYTE: return sizeof(jbyte); 3287 case T_SHORT: return sizeof(jshort); 3288 case T_CHAR: return sizeof(jchar); 3289 case T_INT: return sizeof(jint); 3290 default: 3291 ShouldNotReachHere(); 3292 return -1; 3293 } 3294 } 3295 3296 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) { 3297 switch (eltype) { 3298 // T_BOOLEAN used as surrogate for unsigned byte 3299 case T_BOOLEAN: movzbl(dst, src); break; 3300 case T_BYTE: movsbl(dst, src); break; 3301 case T_SHORT: movswl(dst, src); break; 3302 case T_CHAR: movzwl(dst, src); break; 3303 case T_INT: movl(dst, src); break; 3304 default: 3305 ShouldNotReachHere(); 3306 } 3307 } 3308 3309 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) { 3310 load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8); 3311 } 3312 3313 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) { 3314 load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8); 3315 } 3316 3317 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) { 3318 const int vlen = Assembler::AVX_256bit; 3319 switch (eltype) { 3320 case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3321 case T_BYTE: vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3322 case T_SHORT: vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3323 case T_CHAR: vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3324 case T_INT: 3325 // do nothing 3326 break; 3327 default: 3328 ShouldNotReachHere(); 3329 } 3330 } 3331 3332 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result, 3333 Register index, Register tmp2, Register tmp3, XMMRegister vnext, 3334 XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3, 3335 XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3, 3336 XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3, 3337 BasicType eltype) { 3338 ShortBranchVerifier sbv(this); 3339 assert(UseAVX >= 2, "AVX2 intrinsics are required"); 3340 assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3); 3341 assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3); 3342 3343 Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN, 3344 SHORT_UNROLLED_LOOP_EXIT, 3345 UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME, 3346 UNROLLED_VECTOR_LOOP_BEGIN, 3347 END; 3348 switch (eltype) { 3349 case T_BOOLEAN: 
BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break; 3350 case T_CHAR: BLOCK_COMMENT("arrays_hashcode(char) {"); break; 3351 case T_BYTE: BLOCK_COMMENT("arrays_hashcode(byte) {"); break; 3352 case T_SHORT: BLOCK_COMMENT("arrays_hashcode(short) {"); break; 3353 case T_INT: BLOCK_COMMENT("arrays_hashcode(int) {"); break; 3354 default: BLOCK_COMMENT("arrays_hashcode {"); break; 3355 } 3356 3357 // For "renaming" for readibility of the code 3358 const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 }, 3359 vresult[] = { vresult0, vresult1, vresult2, vresult3 }, 3360 vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 }; 3361 3362 const int elsize = arrays_hashcode_elsize(eltype); 3363 3364 /* 3365 if (cnt1 >= 2) { 3366 if (cnt1 >= 32) { 3367 UNROLLED VECTOR LOOP 3368 } 3369 UNROLLED SCALAR LOOP 3370 } 3371 SINGLE SCALAR 3372 */ 3373 3374 cmpl(cnt1, 32); 3375 jcc(Assembler::less, SHORT_UNROLLED_BEGIN); 3376 3377 // cnt1 >= 32 && generate_vectorized_loop 3378 xorl(index, index); 3379 3380 // vresult = IntVector.zero(I256); 3381 for (int idx = 0; idx < 4; idx++) { 3382 vpxor(vresult[idx], vresult[idx]); 3383 } 3384 // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]); 3385 Register bound = tmp2; 3386 Register next = tmp3; 3387 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint)))); 3388 movl(next, Address(tmp2, 0)); 3389 movdl(vnext, next); 3390 vpbroadcastd(vnext, vnext, Assembler::AVX_256bit); 3391 3392 // index = 0; 3393 // bound = cnt1 & ~(32 - 1); 3394 movl(bound, cnt1); 3395 andl(bound, ~(32 - 1)); 3396 // for (; index < bound; index += 32) { 3397 bind(UNROLLED_VECTOR_LOOP_BEGIN); 3398 // result *= next; 3399 imull(result, next); 3400 // loop fission to upfront the cost of fetching from memory, OOO execution 3401 // can then hopefully do a better job of prefetching 3402 for (int idx = 0; idx < 4; idx++) { 3403 arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype); 3404 } 3405 // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7]; 3406 for (int idx = 0; idx < 4; idx++) { 3407 vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit); 3408 arrays_hashcode_elvcast(vtmp[idx], eltype); 3409 vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit); 3410 } 3411 // index += 32; 3412 addl(index, 32); 3413 // index < bound; 3414 cmpl(index, bound); 3415 jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN); 3416 // } 3417 3418 lea(ary1, Address(ary1, bound, Address::times(elsize))); 3419 subl(cnt1, bound); 3420 // release bound 3421 3422 // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1); 3423 for (int idx = 0; idx < 4; idx++) { 3424 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint)))); 3425 arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT); 3426 vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit); 3427 } 3428 // result += vresult.reduceLanes(ADD); 3429 for (int idx = 0; idx < 4; idx++) { 3430 reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]); 3431 } 3432 3433 // } else if (cnt1 < 32) { 3434 3435 bind(SHORT_UNROLLED_BEGIN); 3436 // int i = 1; 3437 movl(index, 1); 3438 cmpl(index, cnt1); 3439 jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT); 3440 3441 // for (; i < cnt1 ; i += 2) { 3442 bind(SHORT_UNROLLED_LOOP_BEGIN); 3443 movl(tmp3, 961); 3444 imull(result, tmp3); 3445 
arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3446 movl(tmp3, tmp2); 3447 shll(tmp3, 5); 3448 subl(tmp3, tmp2); 3449 addl(result, tmp3); 3450 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype); 3451 addl(result, tmp3); 3452 addl(index, 2); 3453 cmpl(index, cnt1); 3454 jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN); 3455 3456 // } 3457 // if (i >= cnt1) { 3458 bind(SHORT_UNROLLED_LOOP_EXIT); 3459 jccb(Assembler::greater, END); 3460 movl(tmp2, result); 3461 shll(result, 5); 3462 subl(result, tmp2); 3463 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3464 addl(result, tmp3); 3465 // } 3466 bind(END); 3467 3468 BLOCK_COMMENT("} // arrays_hashcode"); 3469 3470 } // arrays_hashcode 3471 3472 // helper function for string_compare 3473 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2, 3474 Address::ScaleFactor scale, Address::ScaleFactor scale1, 3475 Address::ScaleFactor scale2, Register index, int ae) { 3476 if (ae == StrIntrinsicNode::LL) { 3477 load_unsigned_byte(elem1, Address(str1, index, scale, 0)); 3478 load_unsigned_byte(elem2, Address(str2, index, scale, 0)); 3479 } else if (ae == StrIntrinsicNode::UU) { 3480 load_unsigned_short(elem1, Address(str1, index, scale, 0)); 3481 load_unsigned_short(elem2, Address(str2, index, scale, 0)); 3482 } else { 3483 load_unsigned_byte(elem1, Address(str1, index, scale1, 0)); 3484 load_unsigned_short(elem2, Address(str2, index, scale2, 0)); 3485 } 3486 } 3487 3488 // Compare strings, used for char[] and byte[]. 3489 void C2_MacroAssembler::string_compare(Register str1, Register str2, 3490 Register cnt1, Register cnt2, Register result, 3491 XMMRegister vec1, int ae, KRegister mask) { 3492 ShortBranchVerifier sbv(this); 3493 Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL; 3494 Label COMPARE_WIDE_VECTORS_LOOP_FAILED; // used only _LP64 && AVX3 3495 int stride, stride2, adr_stride, adr_stride1, adr_stride2; 3496 int stride2x2 = 0x40; 3497 Address::ScaleFactor scale = Address::no_scale; 3498 Address::ScaleFactor scale1 = Address::no_scale; 3499 Address::ScaleFactor scale2 = Address::no_scale; 3500 3501 if (ae != StrIntrinsicNode::LL) { 3502 stride2x2 = 0x20; 3503 } 3504 3505 if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) { 3506 shrl(cnt2, 1); 3507 } 3508 // Compute the minimum of the string lengths and the 3509 // difference of the string lengths (stack). 3510 // Do the conditional move stuff 3511 movl(result, cnt1); 3512 subl(cnt1, cnt2); 3513 push(cnt1); 3514 cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2) 3515 3516 // Is the minimum length zero? 
3517 testl(cnt2, cnt2); 3518 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3519 if (ae == StrIntrinsicNode::LL) { 3520 // Load first bytes 3521 load_unsigned_byte(result, Address(str1, 0)); // result = str1[0] 3522 load_unsigned_byte(cnt1, Address(str2, 0)); // cnt1 = str2[0] 3523 } else if (ae == StrIntrinsicNode::UU) { 3524 // Load first characters 3525 load_unsigned_short(result, Address(str1, 0)); 3526 load_unsigned_short(cnt1, Address(str2, 0)); 3527 } else { 3528 load_unsigned_byte(result, Address(str1, 0)); 3529 load_unsigned_short(cnt1, Address(str2, 0)); 3530 } 3531 subl(result, cnt1); 3532 jcc(Assembler::notZero, POP_LABEL); 3533 3534 if (ae == StrIntrinsicNode::UU) { 3535 // Divide length by 2 to get number of chars 3536 shrl(cnt2, 1); 3537 } 3538 cmpl(cnt2, 1); 3539 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3540 3541 // Check if the strings start at the same location and setup scale and stride 3542 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3543 cmpptr(str1, str2); 3544 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3545 if (ae == StrIntrinsicNode::LL) { 3546 scale = Address::times_1; 3547 stride = 16; 3548 } else { 3549 scale = Address::times_2; 3550 stride = 8; 3551 } 3552 } else { 3553 scale1 = Address::times_1; 3554 scale2 = Address::times_2; 3555 // scale not used 3556 stride = 8; 3557 } 3558 3559 if (UseAVX >= 2 && UseSSE42Intrinsics) { 3560 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR; 3561 Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR; 3562 Label COMPARE_WIDE_VECTORS_LOOP_AVX2; 3563 Label COMPARE_TAIL_LONG; 3564 Label COMPARE_WIDE_VECTORS_LOOP_AVX3; // used only _LP64 && AVX3 3565 3566 int pcmpmask = 0x19; 3567 if (ae == StrIntrinsicNode::LL) { 3568 pcmpmask &= ~0x01; 3569 } 3570 3571 // Setup to compare 16-chars (32-bytes) vectors, 3572 // start from first character again because it has aligned address. 3573 if (ae == StrIntrinsicNode::LL) { 3574 stride2 = 32; 3575 } else { 3576 stride2 = 16; 3577 } 3578 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3579 adr_stride = stride << scale; 3580 } else { 3581 adr_stride1 = 8; //stride << scale1; 3582 adr_stride2 = 16; //stride << scale2; 3583 } 3584 3585 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3586 // rax and rdx are used by pcmpestri as elements counters 3587 movl(result, cnt2); 3588 andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count 3589 jcc(Assembler::zero, COMPARE_TAIL_LONG); 3590 3591 // fast path : compare first 2 8-char vectors. 
3592 bind(COMPARE_16_CHARS); 3593 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3594 movdqu(vec1, Address(str1, 0)); 3595 } else { 3596 pmovzxbw(vec1, Address(str1, 0)); 3597 } 3598 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3599 jccb(Assembler::below, COMPARE_INDEX_CHAR); 3600 3601 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3602 movdqu(vec1, Address(str1, adr_stride)); 3603 pcmpestri(vec1, Address(str2, adr_stride), pcmpmask); 3604 } else { 3605 pmovzxbw(vec1, Address(str1, adr_stride1)); 3606 pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask); 3607 } 3608 jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS); 3609 addl(cnt1, stride); 3610 3611 // Compare the characters at index in cnt1 3612 bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character 3613 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3614 subl(result, cnt2); 3615 jmp(POP_LABEL); 3616 3617 // Setup the registers to start vector comparison loop 3618 bind(COMPARE_WIDE_VECTORS); 3619 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3620 lea(str1, Address(str1, result, scale)); 3621 lea(str2, Address(str2, result, scale)); 3622 } else { 3623 lea(str1, Address(str1, result, scale1)); 3624 lea(str2, Address(str2, result, scale2)); 3625 } 3626 subl(result, stride2); 3627 subl(cnt2, stride2); 3628 jcc(Assembler::zero, COMPARE_WIDE_TAIL); 3629 negptr(result); 3630 3631 // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest) 3632 bind(COMPARE_WIDE_VECTORS_LOOP); 3633 3634 #ifdef _LP64 3635 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 3636 cmpl(cnt2, stride2x2); 3637 jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2); 3638 testl(cnt2, stride2x2-1); // cnt2 holds the vector count 3639 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // means we cannot subtract by 0x40 3640 3641 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 3642 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3643 evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit); 3644 evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3645 } else { 3646 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit); 3647 evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3648 } 3649 kortestql(mask, mask); 3650 jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare 3651 addptr(result, stride2x2); // update since we already compared at this addr 3652 subl(cnt2, stride2x2); // and sub the size too 3653 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3); 3654 3655 vpxor(vec1, vec1); 3656 jmpb(COMPARE_WIDE_TAIL); 3657 }//if (VM_Version::supports_avx512vlbw()) 3658 #endif // _LP64 3659 3660 3661 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3662 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3663 vmovdqu(vec1, Address(str1, result, scale)); 3664 vpxor(vec1, Address(str2, result, scale)); 3665 } else { 3666 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit); 3667 vpxor(vec1, Address(str2, result, scale2)); 3668 } 3669 vptest(vec1, vec1); 3670 jcc(Assembler::notZero, VECTOR_NOT_EQUAL); 3671 addptr(result, stride2); 3672 subl(cnt2, stride2); 3673 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP); 3674 // clean upper bits of YMM registers 
3675 vpxor(vec1, vec1);
3676
3677 // compare wide vectors tail
3678 bind(COMPARE_WIDE_TAIL);
3679 testptr(result, result);
3680 jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3681
3682 movl(result, stride2);
3683 movl(cnt2, result);
3684 negptr(result);
3685 jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3686
3687 // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
3688 bind(VECTOR_NOT_EQUAL);
3689 // clean upper bits of YMM registers
3690 vpxor(vec1, vec1);
3691 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3692 lea(str1, Address(str1, result, scale));
3693 lea(str2, Address(str2, result, scale));
3694 } else {
3695 lea(str1, Address(str1, result, scale1));
3696 lea(str2, Address(str2, result, scale2));
3697 }
3698 jmp(COMPARE_16_CHARS);
3699
3700 // Compare tail chars, length between 1 and 15 chars
3701 bind(COMPARE_TAIL_LONG);
3702 movl(cnt2, result);
3703 cmpl(cnt2, stride);
3704 jcc(Assembler::less, COMPARE_SMALL_STR);
3705
3706 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3707 movdqu(vec1, Address(str1, 0));
3708 } else {
3709 pmovzxbw(vec1, Address(str1, 0));
3710 }
3711 pcmpestri(vec1, Address(str2, 0), pcmpmask);
3712 jcc(Assembler::below, COMPARE_INDEX_CHAR);
3713 subptr(cnt2, stride);
3714 jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3715 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3716 lea(str1, Address(str1, result, scale));
3717 lea(str2, Address(str2, result, scale));
3718 } else {
3719 lea(str1, Address(str1, result, scale1));
3720 lea(str2, Address(str2, result, scale2));
3721 }
3722 negptr(cnt2);
3723 jmpb(WHILE_HEAD_LABEL);
3724
3725 bind(COMPARE_SMALL_STR);
3726 } else if (UseSSE42Intrinsics) {
3727 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3728 int pcmpmask = 0x19;
3729 // Setup to compare 8-char (16-byte) vectors,
3730 // start from first character again because it has aligned address.
3731 movl(result, cnt2); 3732 andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count 3733 if (ae == StrIntrinsicNode::LL) { 3734 pcmpmask &= ~0x01; 3735 } 3736 jcc(Assembler::zero, COMPARE_TAIL); 3737 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3738 lea(str1, Address(str1, result, scale)); 3739 lea(str2, Address(str2, result, scale)); 3740 } else { 3741 lea(str1, Address(str1, result, scale1)); 3742 lea(str2, Address(str2, result, scale2)); 3743 } 3744 negptr(result); 3745 3746 // pcmpestri 3747 // inputs: 3748 // vec1- substring 3749 // rax - negative string length (elements count) 3750 // mem - scanned string 3751 // rdx - string length (elements count) 3752 // pcmpmask - cmp mode: 11000 (string compare with negated result) 3753 // + 00 (unsigned bytes) or + 01 (unsigned shorts) 3754 // outputs: 3755 // rcx - first mismatched element index 3756 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3757 3758 bind(COMPARE_WIDE_VECTORS); 3759 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3760 movdqu(vec1, Address(str1, result, scale)); 3761 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 3762 } else { 3763 pmovzxbw(vec1, Address(str1, result, scale1)); 3764 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 3765 } 3766 // After pcmpestri cnt1(rcx) contains mismatched element index 3767 3768 jccb(Assembler::below, VECTOR_NOT_EQUAL); // CF==1 3769 addptr(result, stride); 3770 subptr(cnt2, stride); 3771 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 3772 3773 // compare wide vectors tail 3774 testptr(result, result); 3775 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3776 3777 movl(cnt2, stride); 3778 movl(result, stride); 3779 negptr(result); 3780 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3781 movdqu(vec1, Address(str1, result, scale)); 3782 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 3783 } else { 3784 pmovzxbw(vec1, Address(str1, result, scale1)); 3785 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 3786 } 3787 jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL); 3788 3789 // Mismatched characters in the vectors 3790 bind(VECTOR_NOT_EQUAL); 3791 addptr(cnt1, result); 3792 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3793 subl(result, cnt2); 3794 jmpb(POP_LABEL); 3795 3796 bind(COMPARE_TAIL); // limit is zero 3797 movl(cnt2, result); 3798 // Fallthru to tail compare 3799 } 3800 // Shift str2 and str1 to the end of the arrays, negate min 3801 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3802 lea(str1, Address(str1, cnt2, scale)); 3803 lea(str2, Address(str2, cnt2, scale)); 3804 } else { 3805 lea(str1, Address(str1, cnt2, scale1)); 3806 lea(str2, Address(str2, cnt2, scale2)); 3807 } 3808 decrementl(cnt2); // first character was compared already 3809 negptr(cnt2); 3810 3811 // Compare the rest of the elements 3812 bind(WHILE_HEAD_LABEL); 3813 load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae); 3814 subl(result, cnt1); 3815 jccb(Assembler::notZero, POP_LABEL); 3816 increment(cnt2); 3817 jccb(Assembler::notZero, WHILE_HEAD_LABEL); 3818 3819 // Strings are equal up to min length. Return the length difference. 
3820 bind(LENGTH_DIFF_LABEL); 3821 pop(result); 3822 if (ae == StrIntrinsicNode::UU) { 3823 // Divide diff by 2 to get number of chars 3824 sarl(result, 1); 3825 } 3826 jmpb(DONE_LABEL); 3827 3828 #ifdef _LP64 3829 if (VM_Version::supports_avx512vlbw()) { 3830 3831 bind(COMPARE_WIDE_VECTORS_LOOP_FAILED); 3832 3833 kmovql(cnt1, mask); 3834 notq(cnt1); 3835 bsfq(cnt2, cnt1); 3836 if (ae != StrIntrinsicNode::LL) { 3837 // Divide diff by 2 to get number of chars 3838 sarl(cnt2, 1); 3839 } 3840 addq(result, cnt2); 3841 if (ae == StrIntrinsicNode::LL) { 3842 load_unsigned_byte(cnt1, Address(str2, result)); 3843 load_unsigned_byte(result, Address(str1, result)); 3844 } else if (ae == StrIntrinsicNode::UU) { 3845 load_unsigned_short(cnt1, Address(str2, result, scale)); 3846 load_unsigned_short(result, Address(str1, result, scale)); 3847 } else { 3848 load_unsigned_short(cnt1, Address(str2, result, scale2)); 3849 load_unsigned_byte(result, Address(str1, result, scale1)); 3850 } 3851 subl(result, cnt1); 3852 jmpb(POP_LABEL); 3853 }//if (VM_Version::supports_avx512vlbw()) 3854 #endif // _LP64 3855 3856 // Discard the stored length difference 3857 bind(POP_LABEL); 3858 pop(cnt1); 3859 3860 // That's it 3861 bind(DONE_LABEL); 3862 if(ae == StrIntrinsicNode::UL) { 3863 negl(result); 3864 } 3865 3866 } 3867 3868 // Search for Non-ASCII character (Negative byte value) in a byte array, 3869 // return the index of the first such character, otherwise the length 3870 // of the array segment searched. 3871 // ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java 3872 // @IntrinsicCandidate 3873 // public static int countPositives(byte[] ba, int off, int len) { 3874 // for (int i = off; i < off + len; i++) { 3875 // if (ba[i] < 0) { 3876 // return i - off; 3877 // } 3878 // } 3879 // return len; 3880 // } 3881 void C2_MacroAssembler::count_positives(Register ary1, Register len, 3882 Register result, Register tmp1, 3883 XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) { 3884 // rsi: byte array 3885 // rcx: len 3886 // rax: result 3887 ShortBranchVerifier sbv(this); 3888 assert_different_registers(ary1, len, result, tmp1); 3889 assert_different_registers(vec1, vec2); 3890 Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE; 3891 3892 movl(result, len); // copy 3893 // len == 0 3894 testl(len, len); 3895 jcc(Assembler::zero, DONE); 3896 3897 if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512 3898 VM_Version::supports_avx512vlbw() && 3899 VM_Version::supports_bmi2()) { 3900 3901 Label test_64_loop, test_tail, BREAK_LOOP; 3902 movl(tmp1, len); 3903 vpxor(vec2, vec2, vec2, Assembler::AVX_512bit); 3904 3905 andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F 3906 andl(len, 0xffffffc0); // vector count (in chars) 3907 jccb(Assembler::zero, test_tail); 3908 3909 lea(ary1, Address(ary1, len, Address::times_1)); 3910 negptr(len); 3911 3912 bind(test_64_loop); 3913 // Check whether our 64 elements of size byte contain negatives 3914 evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit); 3915 kortestql(mask1, mask1); 3916 jcc(Assembler::notZero, BREAK_LOOP); 3917 3918 addptr(len, 64); 3919 jccb(Assembler::notZero, test_64_loop); 3920 3921 bind(test_tail); 3922 // bail out when there is nothing to be done 3923 testl(tmp1, -1); 3924 jcc(Assembler::zero, DONE); 3925 3926 3927 // check the tail for absense of negatives 3928 // ~(~0 << len) applied up to two times (for 32-bit scenario) 3929 #ifdef _LP64 3930 { 3931 
Register tmp3_aliased = len;
3932 mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
3933 shlxq(tmp3_aliased, tmp3_aliased, tmp1);
3934 notq(tmp3_aliased);
3935 kmovql(mask2, tmp3_aliased);
3936 }
3937 #else
3938 Label k_init;
3939 jmp(k_init);
3940
3941 // We cannot read 64 bits from a general-purpose register here, so we move the
3942 // data required to compose 64 1's into the instruction stream.
3943 // We emit a 64-byte-wide series of elements from 0..63 which is later used as
3944 // a compare target against the tail count contained in the tmp1 register.
3945 // The result is a k register with tmp1 consecutive 1's set, counting from the
3946 // least significant bit.
3947 address tmp = pc();
3948 emit_int64(0x0706050403020100);
3949 emit_int64(0x0F0E0D0C0B0A0908);
3950 emit_int64(0x1716151413121110);
3951 emit_int64(0x1F1E1D1C1B1A1918);
3952 emit_int64(0x2726252423222120);
3953 emit_int64(0x2F2E2D2C2B2A2928);
3954 emit_int64(0x3736353433323130);
3955 emit_int64(0x3F3E3D3C3B3A3938);
3956
3957 bind(k_init);
3958 lea(len, InternalAddress(tmp));
3959 // create mask to test for negative byte inside a vector
3960 evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
3961 evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit);
3962
3963 #endif
3964 evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
3965 ktestq(mask1, mask2);
3966 jcc(Assembler::zero, DONE);
3967
3968 // do a full check for negative bytes in the tail
3969 movl(len, tmp1); // tmp1 holds the low 6 bits of the original len;
3970 // ary1 already pointing to the right place
3971 jmpb(TAIL_START);
3972
3973 bind(BREAK_LOOP);
3974 // At least one byte in the last 64-byte block was negative.
3975 // Set up to look at the last 64 bytes as if they were a tail
3976 lea(ary1, Address(ary1, len, Address::times_1));
3977 addptr(result, len);
3978 // Ignore the very last byte: if all others are positive,
3979 // it must be negative, so we can skip right to the 2+1 byte
3980 // end comparison at this point
3981 orl(result, 63);
3982 movl(len, 63);
3983 // Fallthru to tail compare
3984 } else {
3985
3986 if (UseAVX >= 2 && UseSSE >= 2) {
3987 // With AVX2, use 32-byte vector compare
3988 Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
3989
3990 // Compare 32-byte vectors
3991 testl(len, 0xffffffe0); // vector count (in bytes)
3992 jccb(Assembler::zero, TAIL_START);
3993
3994 andl(len, 0xffffffe0);
3995 lea(ary1, Address(ary1, len, Address::times_1));
3996 negptr(len);
3997
3998 movl(tmp1, 0x80808080); // create mask to test for negative (non-ASCII) bytes in the vector
3999 movdl(vec2, tmp1);
4000 vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
4001
4002 bind(COMPARE_WIDE_VECTORS);
4003 vmovdqu(vec1, Address(ary1, len, Address::times_1));
4004 vptest(vec1, vec2);
4005 jccb(Assembler::notZero, BREAK_LOOP);
4006 addptr(len, 32);
4007 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4008
4009 testl(result, 0x0000001f); // any bytes remaining?
4010 jcc(Assembler::zero, DONE);
4011
4012 // Quick test using the already prepared vector mask
4013 movl(len, result);
4014 andl(len, 0x0000001f);
4015 vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
4016 vptest(vec1, vec2);
4017 jcc(Assembler::zero, DONE);
4018 // There are negative bytes; jump to the tail to determine exactly where
4019 jmpb(TAIL_START);
4020
4021 bind(BREAK_LOOP);
4022 // At least one byte in the last 32-byte vector is negative.
4023 // Set up to look at the last 32 bytes as if they were a tail 4024 lea(ary1, Address(ary1, len, Address::times_1)); 4025 addptr(result, len); 4026 // Ignore the very last byte: if all others are positive, 4027 // it must be negative, so we can skip right to the 2+1 byte 4028 // end comparison at this point 4029 orl(result, 31); 4030 movl(len, 31); 4031 // Fallthru to tail compare 4032 } else if (UseSSE42Intrinsics) { 4033 // With SSE4.2, use double quad vector compare 4034 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 4035 4036 // Compare 16-byte vectors 4037 testl(len, 0xfffffff0); // vector count (in bytes) 4038 jcc(Assembler::zero, TAIL_START); 4039 4040 andl(len, 0xfffffff0); 4041 lea(ary1, Address(ary1, len, Address::times_1)); 4042 negptr(len); 4043 4044 movl(tmp1, 0x80808080); 4045 movdl(vec2, tmp1); 4046 pshufd(vec2, vec2, 0); 4047 4048 bind(COMPARE_WIDE_VECTORS); 4049 movdqu(vec1, Address(ary1, len, Address::times_1)); 4050 ptest(vec1, vec2); 4051 jccb(Assembler::notZero, BREAK_LOOP); 4052 addptr(len, 16); 4053 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4054 4055 testl(result, 0x0000000f); // len is zero, any bytes remaining? 4056 jcc(Assembler::zero, DONE); 4057 4058 // Quick test using the already prepared vector mask 4059 movl(len, result); 4060 andl(len, 0x0000000f); // tail count (in bytes) 4061 movdqu(vec1, Address(ary1, len, Address::times_1, -16)); 4062 ptest(vec1, vec2); 4063 jcc(Assembler::zero, DONE); 4064 jmpb(TAIL_START); 4065 4066 bind(BREAK_LOOP); 4067 // At least one byte in the last 16-byte vector is negative. 4068 // Set up and look at the last 16 bytes as if they were a tail 4069 lea(ary1, Address(ary1, len, Address::times_1)); 4070 addptr(result, len); 4071 // Ignore the very last byte: if all others are positive, 4072 // it must be negative, so we can skip right to the 2+1 byte 4073 // end comparison at this point 4074 orl(result, 15); 4075 movl(len, 15); 4076 // Fallthru to tail compare 4077 } 4078 } 4079 4080 bind(TAIL_START); 4081 // Compare 4-byte vectors 4082 andl(len, 0xfffffffc); // vector count (in bytes) 4083 jccb(Assembler::zero, COMPARE_CHAR); 4084 4085 lea(ary1, Address(ary1, len, Address::times_1)); 4086 negptr(len); 4087 4088 bind(COMPARE_VECTORS); 4089 movl(tmp1, Address(ary1, len, Address::times_1)); 4090 andl(tmp1, 0x80808080); 4091 jccb(Assembler::notZero, TAIL_ADJUST); 4092 addptr(len, 4); 4093 jccb(Assembler::notZero, COMPARE_VECTORS); 4094 4095 // Compare trailing char (final 2-3 bytes), if any 4096 bind(COMPARE_CHAR); 4097 4098 testl(result, 0x2); // tail char 4099 jccb(Assembler::zero, COMPARE_BYTE); 4100 load_unsigned_short(tmp1, Address(ary1, 0)); 4101 andl(tmp1, 0x00008080); 4102 jccb(Assembler::notZero, CHAR_ADJUST); 4103 lea(ary1, Address(ary1, 2)); 4104 4105 bind(COMPARE_BYTE); 4106 testl(result, 0x1); // tail byte 4107 jccb(Assembler::zero, DONE); 4108 load_unsigned_byte(tmp1, Address(ary1, 0)); 4109 testl(tmp1, 0x00000080); 4110 jccb(Assembler::zero, DONE); 4111 subptr(result, 1); 4112 jmpb(DONE); 4113 4114 bind(TAIL_ADJUST); 4115 // there are negative bits in the last 4 byte block. 4116 // Adjust result and check the next three bytes 4117 addptr(result, len); 4118 orl(result, 3); 4119 lea(ary1, Address(ary1, len, Address::times_1)); 4120 jmpb(COMPARE_CHAR); 4121 4122 bind(CHAR_ADJUST); 4123 // We are looking at a char + optional byte tail, and found that one 4124 // of the bytes in the char is negative. Adjust the result, check the 4125 // first byte and readjust if needed. 
4126 andl(result, 0xfffffffc); 4127 testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first 4128 jccb(Assembler::notZero, DONE); 4129 addptr(result, 1); 4130 4131 // That's it 4132 bind(DONE); 4133 if (UseAVX >= 2 && UseSSE >= 2) { 4134 // clean upper bits of YMM registers 4135 vpxor(vec1, vec1); 4136 vpxor(vec2, vec2); 4137 } 4138 } 4139 4140 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings. 4141 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2, 4142 Register limit, Register result, Register chr, 4143 XMMRegister vec1, XMMRegister vec2, bool is_char, 4144 KRegister mask, bool expand_ary2) { 4145 // for expand_ary2, limit is the (smaller) size of the second array. 4146 ShortBranchVerifier sbv(this); 4147 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE; 4148 4149 assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)), 4150 "Expansion only implemented for AVX2"); 4151 4152 int length_offset = arrayOopDesc::length_offset_in_bytes(); 4153 int base_offset = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE); 4154 4155 Address::ScaleFactor scaleFactor = expand_ary2 ? Address::times_2 : Address::times_1; 4156 int scaleIncr = expand_ary2 ? 8 : 16; 4157 4158 if (is_array_equ) { 4159 // Check the input args 4160 cmpoop(ary1, ary2); 4161 jcc(Assembler::equal, TRUE_LABEL); 4162 4163 // Need additional checks for arrays_equals. 4164 testptr(ary1, ary1); 4165 jcc(Assembler::zero, FALSE_LABEL); 4166 testptr(ary2, ary2); 4167 jcc(Assembler::zero, FALSE_LABEL); 4168 4169 // Check the lengths 4170 movl(limit, Address(ary1, length_offset)); 4171 cmpl(limit, Address(ary2, length_offset)); 4172 jcc(Assembler::notEqual, FALSE_LABEL); 4173 } 4174 4175 // count == 0 4176 testl(limit, limit); 4177 jcc(Assembler::zero, TRUE_LABEL); 4178 4179 if (is_array_equ) { 4180 // Load array address 4181 lea(ary1, Address(ary1, base_offset)); 4182 lea(ary2, Address(ary2, base_offset)); 4183 } 4184 4185 if (is_array_equ && is_char) { 4186 // arrays_equals when used for char[]. 
4187 shll(limit, 1); // byte count != 0 4188 } 4189 movl(result, limit); // copy 4190 4191 if (UseAVX >= 2) { 4192 // With AVX2, use 32-byte vector compare 4193 Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16; 4194 4195 // Compare 32-byte vectors 4196 if (expand_ary2) { 4197 andl(result, 0x0000000f); // tail count (in bytes) 4198 andl(limit, 0xfffffff0); // vector count (in bytes) 4199 jcc(Assembler::zero, COMPARE_TAIL); 4200 } else { 4201 andl(result, 0x0000001f); // tail count (in bytes) 4202 andl(limit, 0xffffffe0); // vector count (in bytes) 4203 jcc(Assembler::zero, COMPARE_TAIL_16); 4204 } 4205 4206 lea(ary1, Address(ary1, limit, scaleFactor)); 4207 lea(ary2, Address(ary2, limit, Address::times_1)); 4208 negptr(limit); 4209 4210 #ifdef _LP64 4211 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 4212 Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3; 4213 4214 cmpl(limit, -64); 4215 jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2); 4216 4217 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 4218 4219 evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit); 4220 evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit); 4221 kortestql(mask, mask); 4222 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4223 addptr(limit, 64); // update since we already compared at this addr 4224 cmpl(limit, -64); 4225 jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3); 4226 4227 // At this point we may still need to compare -limit+result bytes. 4228 // We could execute the next two instruction and just continue via non-wide path: 4229 // cmpl(limit, 0); 4230 // jcc(Assembler::equal, COMPARE_TAIL); // true 4231 // But since we stopped at the points ary{1,2}+limit which are 4232 // not farther than 64 bytes from the ends of arrays ary{1,2}+result 4233 // (|limit| <= 32 and result < 32), 4234 // we may just compare the last 64 bytes. 
4235 // 4236 addptr(result, -64); // it is safe, bc we just came from this area 4237 evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit); 4238 evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit); 4239 kortestql(mask, mask); 4240 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4241 4242 jmp(TRUE_LABEL); 4243 4244 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 4245 4246 }//if (VM_Version::supports_avx512vlbw()) 4247 #endif //_LP64 4248 bind(COMPARE_WIDE_VECTORS); 4249 vmovdqu(vec1, Address(ary1, limit, scaleFactor)); 4250 if (expand_ary2) { 4251 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit); 4252 } else { 4253 vmovdqu(vec2, Address(ary2, limit, Address::times_1)); 4254 } 4255 vpxor(vec1, vec2); 4256 4257 vptest(vec1, vec1); 4258 jcc(Assembler::notZero, FALSE_LABEL); 4259 addptr(limit, scaleIncr * 2); 4260 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4261 4262 testl(result, result); 4263 jcc(Assembler::zero, TRUE_LABEL); 4264 4265 vmovdqu(vec1, Address(ary1, result, scaleFactor, -32)); 4266 if (expand_ary2) { 4267 vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit); 4268 } else { 4269 vmovdqu(vec2, Address(ary2, result, Address::times_1, -32)); 4270 } 4271 vpxor(vec1, vec2); 4272 4273 vptest(vec1, vec1); 4274 jcc(Assembler::notZero, FALSE_LABEL); 4275 jmp(TRUE_LABEL); 4276 4277 bind(COMPARE_TAIL_16); // limit is zero 4278 movl(limit, result); 4279 4280 // Compare 16-byte chunks 4281 andl(result, 0x0000000f); // tail count (in bytes) 4282 andl(limit, 0xfffffff0); // vector count (in bytes) 4283 jcc(Assembler::zero, COMPARE_TAIL); 4284 4285 lea(ary1, Address(ary1, limit, scaleFactor)); 4286 lea(ary2, Address(ary2, limit, Address::times_1)); 4287 negptr(limit); 4288 4289 bind(COMPARE_WIDE_VECTORS_16); 4290 movdqu(vec1, Address(ary1, limit, scaleFactor)); 4291 if (expand_ary2) { 4292 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit); 4293 } else { 4294 movdqu(vec2, Address(ary2, limit, Address::times_1)); 4295 } 4296 pxor(vec1, vec2); 4297 4298 ptest(vec1, vec1); 4299 jcc(Assembler::notZero, FALSE_LABEL); 4300 addptr(limit, scaleIncr); 4301 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16); 4302 4303 bind(COMPARE_TAIL); // limit is zero 4304 movl(limit, result); 4305 // Fallthru to tail compare 4306 } else if (UseSSE42Intrinsics) { 4307 // With SSE4.2, use double quad vector compare 4308 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 4309 4310 // Compare 16-byte vectors 4311 andl(result, 0x0000000f); // tail count (in bytes) 4312 andl(limit, 0xfffffff0); // vector count (in bytes) 4313 jcc(Assembler::zero, COMPARE_TAIL); 4314 4315 lea(ary1, Address(ary1, limit, Address::times_1)); 4316 lea(ary2, Address(ary2, limit, Address::times_1)); 4317 negptr(limit); 4318 4319 bind(COMPARE_WIDE_VECTORS); 4320 movdqu(vec1, Address(ary1, limit, Address::times_1)); 4321 movdqu(vec2, Address(ary2, limit, Address::times_1)); 4322 pxor(vec1, vec2); 4323 4324 ptest(vec1, vec1); 4325 jcc(Assembler::notZero, FALSE_LABEL); 4326 addptr(limit, 16); 4327 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4328 4329 testl(result, result); 4330 jcc(Assembler::zero, TRUE_LABEL); 4331 4332 movdqu(vec1, Address(ary1, result, Address::times_1, -16)); 4333 movdqu(vec2, Address(ary2, result, Address::times_1, -16)); 4334 pxor(vec1, vec2); 4335 4336 ptest(vec1, vec1); 4337 jccb(Assembler::notZero, FALSE_LABEL); 4338 jmpb(TRUE_LABEL); 4339 4340 bind(COMPARE_TAIL); // limit is zero 4341 
movl(limit, result); 4342 // Fallthru to tail compare 4343 } 4344 4345 // Compare 4-byte vectors 4346 if (expand_ary2) { 4347 testl(result, result); 4348 jccb(Assembler::zero, TRUE_LABEL); 4349 } else { 4350 andl(limit, 0xfffffffc); // vector count (in bytes) 4351 jccb(Assembler::zero, COMPARE_CHAR); 4352 } 4353 4354 lea(ary1, Address(ary1, limit, scaleFactor)); 4355 lea(ary2, Address(ary2, limit, Address::times_1)); 4356 negptr(limit); 4357 4358 bind(COMPARE_VECTORS); 4359 if (expand_ary2) { 4360 // There are no "vector" operations for bytes to shorts 4361 movzbl(chr, Address(ary2, limit, Address::times_1)); 4362 cmpw(Address(ary1, limit, Address::times_2), chr); 4363 jccb(Assembler::notEqual, FALSE_LABEL); 4364 addptr(limit, 1); 4365 jcc(Assembler::notZero, COMPARE_VECTORS); 4366 jmp(TRUE_LABEL); 4367 } else { 4368 movl(chr, Address(ary1, limit, Address::times_1)); 4369 cmpl(chr, Address(ary2, limit, Address::times_1)); 4370 jccb(Assembler::notEqual, FALSE_LABEL); 4371 addptr(limit, 4); 4372 jcc(Assembler::notZero, COMPARE_VECTORS); 4373 } 4374 4375 // Compare trailing char (final 2 bytes), if any 4376 bind(COMPARE_CHAR); 4377 testl(result, 0x2); // tail char 4378 jccb(Assembler::zero, COMPARE_BYTE); 4379 load_unsigned_short(chr, Address(ary1, 0)); 4380 load_unsigned_short(limit, Address(ary2, 0)); 4381 cmpl(chr, limit); 4382 jccb(Assembler::notEqual, FALSE_LABEL); 4383 4384 if (is_array_equ && is_char) { 4385 bind(COMPARE_BYTE); 4386 } else { 4387 lea(ary1, Address(ary1, 2)); 4388 lea(ary2, Address(ary2, 2)); 4389 4390 bind(COMPARE_BYTE); 4391 testl(result, 0x1); // tail byte 4392 jccb(Assembler::zero, TRUE_LABEL); 4393 load_unsigned_byte(chr, Address(ary1, 0)); 4394 load_unsigned_byte(limit, Address(ary2, 0)); 4395 cmpl(chr, limit); 4396 jccb(Assembler::notEqual, FALSE_LABEL); 4397 } 4398 bind(TRUE_LABEL); 4399 movl(result, 1); // return true 4400 jmpb(DONE); 4401 4402 bind(FALSE_LABEL); 4403 xorl(result, result); // return false 4404 4405 // That's it 4406 bind(DONE); 4407 if (UseAVX >= 2) { 4408 // clean upper bits of YMM registers 4409 vpxor(vec1, vec1); 4410 vpxor(vec2, vec2); 4411 } 4412 } 4413 4414 #ifdef _LP64 4415 4416 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) { 4417 #define __ masm. 
4418 Register dst = stub.data<0>(); 4419 XMMRegister src = stub.data<1>(); 4420 address target = stub.data<2>(); 4421 __ bind(stub.entry()); 4422 __ subptr(rsp, 8); 4423 __ movdbl(Address(rsp), src); 4424 __ call(RuntimeAddress(target)); 4425 __ pop(dst); 4426 __ jmp(stub.continuation()); 4427 #undef __ 4428 } 4429 4430 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) { 4431 assert(dst_bt == T_INT || dst_bt == T_LONG, ""); 4432 assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, ""); 4433 4434 address slowpath_target; 4435 if (dst_bt == T_INT) { 4436 if (src_bt == T_FLOAT) { 4437 cvttss2sil(dst, src); 4438 cmpl(dst, 0x80000000); 4439 slowpath_target = StubRoutines::x86::f2i_fixup(); 4440 } else { 4441 cvttsd2sil(dst, src); 4442 cmpl(dst, 0x80000000); 4443 slowpath_target = StubRoutines::x86::d2i_fixup(); 4444 } 4445 } else { 4446 if (src_bt == T_FLOAT) { 4447 cvttss2siq(dst, src); 4448 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4449 slowpath_target = StubRoutines::x86::f2l_fixup(); 4450 } else { 4451 cvttsd2siq(dst, src); 4452 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4453 slowpath_target = StubRoutines::x86::d2l_fixup(); 4454 } 4455 } 4456 4457 auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, 23, convertF2I_slowpath); 4458 jcc(Assembler::equal, stub->entry()); 4459 bind(stub->continuation()); 4460 } 4461 4462 #endif // _LP64 4463 4464 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4465 XMMRegister src1, int imm8, bool merge, int vlen_enc) { 4466 switch(ideal_opc) { 4467 case Op_LShiftVS: 4468 Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break; 4469 case Op_LShiftVI: 4470 Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break; 4471 case Op_LShiftVL: 4472 Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break; 4473 case Op_RShiftVS: 4474 Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break; 4475 case Op_RShiftVI: 4476 Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break; 4477 case Op_RShiftVL: 4478 Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break; 4479 case Op_URShiftVS: 4480 Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break; 4481 case Op_URShiftVI: 4482 Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break; 4483 case Op_URShiftVL: 4484 Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break; 4485 case Op_RotateRightV: 4486 evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4487 case Op_RotateLeftV: 4488 evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4489 default: 4490 fatal("Unsupported masked operation"); break; 4491 } 4492 } 4493 4494 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4495 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc, 4496 bool is_varshift) { 4497 switch (ideal_opc) { 4498 case Op_AddVB: 4499 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4500 case Op_AddVS: 4501 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4502 case Op_AddVI: 4503 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4504 case Op_AddVL: 4505 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4506 case Op_AddVF: 4507 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4508 case Op_AddVD: 4509 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4510 case Op_SubVB: 4511 
evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4512 case Op_SubVS: 4513 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4514 case Op_SubVI: 4515 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4516 case Op_SubVL: 4517 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4518 case Op_SubVF: 4519 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4520 case Op_SubVD: 4521 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4522 case Op_MulVS: 4523 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4524 case Op_MulVI: 4525 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4526 case Op_MulVL: 4527 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4528 case Op_MulVF: 4529 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4530 case Op_MulVD: 4531 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4532 case Op_DivVF: 4533 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4534 case Op_DivVD: 4535 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4536 case Op_SqrtVF: 4537 evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break; 4538 case Op_SqrtVD: 4539 evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break; 4540 case Op_AbsVB: 4541 evpabsb(dst, mask, src2, merge, vlen_enc); break; 4542 case Op_AbsVS: 4543 evpabsw(dst, mask, src2, merge, vlen_enc); break; 4544 case Op_AbsVI: 4545 evpabsd(dst, mask, src2, merge, vlen_enc); break; 4546 case Op_AbsVL: 4547 evpabsq(dst, mask, src2, merge, vlen_enc); break; 4548 case Op_FmaVF: 4549 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4550 case Op_FmaVD: 4551 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4552 case Op_VectorRearrange: 4553 evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break; 4554 case Op_LShiftVS: 4555 evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4556 case Op_LShiftVI: 4557 evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4558 case Op_LShiftVL: 4559 evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4560 case Op_RShiftVS: 4561 evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4562 case Op_RShiftVI: 4563 evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4564 case Op_RShiftVL: 4565 evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4566 case Op_URShiftVS: 4567 evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4568 case Op_URShiftVI: 4569 evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4570 case Op_URShiftVL: 4571 evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4572 case Op_RotateLeftV: 4573 evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4574 case Op_RotateRightV: 4575 evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4576 case Op_MaxV: 4577 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4578 case Op_MinV: 4579 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4580 case Op_XorV: 4581 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4582 case Op_OrV: 4583 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4584 case Op_AndV: 4585 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4586 default: 4587 fatal("Unsupported masked operation"); break; 4588 } 4589 } 4590 4591 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4592 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4593 switch (ideal_opc) { 4594 case Op_AddVB: 
4595 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4596 case Op_AddVS: 4597 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4598 case Op_AddVI: 4599 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4600 case Op_AddVL: 4601 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4602 case Op_AddVF: 4603 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4604 case Op_AddVD: 4605 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4606 case Op_SubVB: 4607 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4608 case Op_SubVS: 4609 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4610 case Op_SubVI: 4611 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4612 case Op_SubVL: 4613 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4614 case Op_SubVF: 4615 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4616 case Op_SubVD: 4617 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4618 case Op_MulVS: 4619 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4620 case Op_MulVI: 4621 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4622 case Op_MulVL: 4623 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4624 case Op_MulVF: 4625 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4626 case Op_MulVD: 4627 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4628 case Op_DivVF: 4629 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4630 case Op_DivVD: 4631 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4632 case Op_FmaVF: 4633 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4634 case Op_FmaVD: 4635 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4636 case Op_MaxV: 4637 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4638 case Op_MinV: 4639 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4640 case Op_XorV: 4641 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4642 case Op_OrV: 4643 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4644 case Op_AndV: 4645 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4646 default: 4647 fatal("Unsupported masked operation"); break; 4648 } 4649 } 4650 4651 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst, 4652 KRegister src1, KRegister src2) { 4653 BasicType etype = T_ILLEGAL; 4654 switch(mask_len) { 4655 case 2: 4656 case 4: 4657 case 8: etype = T_BYTE; break; 4658 case 16: etype = T_SHORT; break; 4659 case 32: etype = T_INT; break; 4660 case 64: etype = T_LONG; break; 4661 default: fatal("Unsupported type"); break; 4662 } 4663 assert(etype != T_ILLEGAL, ""); 4664 switch(ideal_opc) { 4665 case Op_AndVMask: 4666 kand(etype, dst, src1, src2); break; 4667 case Op_OrVMask: 4668 kor(etype, dst, src1, src2); break; 4669 case Op_XorVMask: 4670 kxor(etype, dst, src1, src2); break; 4671 default: 4672 fatal("Unsupported masked operation"); break; 4673 } 4674 } 4675 4676 /* 4677 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 4678 * If src is NaN, the result is 0. 4679 * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE, 4680 * the result is equal to the value of Integer.MIN_VALUE. 4681 * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE, 4682 * the result is equal to the value of Integer.MAX_VALUE. 
4683 */ 4684 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4685 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 4686 Register rscratch, AddressLiteral float_sign_flip, 4687 int vec_enc) { 4688 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4689 Label done; 4690 vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch); 4691 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 4692 vptest(xtmp2, xtmp2, vec_enc); 4693 jccb(Assembler::equal, done); 4694 4695 vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc); 4696 vpxor(xtmp1, xtmp1, xtmp4, vec_enc); 4697 4698 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 4699 vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc); 4700 vblendvps(dst, dst, xtmp4, xtmp3, vec_enc); 4701 4702 // Recompute the mask for remaining special value. 4703 vpxor(xtmp2, xtmp2, xtmp3, vec_enc); 4704 // Extract SRC values corresponding to TRUE mask lanes. 4705 vpand(xtmp4, xtmp2, src, vec_enc); 4706 // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special 4707 // values are set. 4708 vpxor(xtmp3, xtmp2, xtmp4, vec_enc); 4709 4710 vblendvps(dst, dst, xtmp1, xtmp3, vec_enc); 4711 bind(done); 4712 } 4713 4714 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4715 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4716 Register rscratch, AddressLiteral float_sign_flip, 4717 int vec_enc) { 4718 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4719 Label done; 4720 evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 4721 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 4722 kortestwl(ktmp1, ktmp1); 4723 jccb(Assembler::equal, done); 4724 4725 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4726 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4727 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 4728 4729 kxorwl(ktmp1, ktmp1, ktmp2); 4730 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4731 vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4732 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 4733 bind(done); 4734 } 4735 4736 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4737 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4738 Register rscratch, AddressLiteral double_sign_flip, 4739 int vec_enc) { 4740 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 4741 4742 Label done; 4743 evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 4744 Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc); 4745 kortestwl(ktmp1, ktmp1); 4746 jccb(Assembler::equal, done); 4747 4748 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4749 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4750 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 4751 4752 kxorwl(ktmp1, ktmp1, ktmp2); 4753 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4754 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4755 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); 4756 bind(done); 4757 } 4758 4759 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4760 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4761 Register rscratch, AddressLiteral float_sign_flip, 4762 int vec_enc) { 4763 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4764 Label done; 4765 evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, 
rscratch);
4766 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4767 kortestwl(ktmp1, ktmp1);
4768 jccb(Assembler::equal, done);
4769
4770 vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4771 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4772 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4773
4774 kxorwl(ktmp1, ktmp1, ktmp2);
4775 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4776 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4777 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4778 bind(done);
4779 }
4780
4781 /*
4782 * Following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
4783 * If src is NaN, the result is 0.
4784 * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
4785 * the result is equal to the value of Long.MIN_VALUE.
4786 * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
4787 * the result is equal to the value of Long.MAX_VALUE.
4788 */
4789 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4790 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4791 Register rscratch, AddressLiteral double_sign_flip,
4792 int vec_enc) {
4793 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4794
4795 Label done;
4796 evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4797 evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
4798 kortestwl(ktmp1, ktmp1);
4799 jccb(Assembler::equal, done);
4800
4801 vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4802 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4803 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
4804
4805 kxorwl(ktmp1, ktmp1, ktmp2);
4806 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4807 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4808 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
4809 bind(done);
4810 }
4811
4812 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
4813 XMMRegister xtmp, int index, int vec_enc) {
4814 assert(vec_enc < Assembler::AVX_512bit, "");
4815 if (vec_enc == Assembler::AVX_256bit) {
4816 vextractf128_high(xtmp, src);
4817 vshufps(dst, src, xtmp, index, vec_enc);
4818 } else {
4819 vshufps(dst, src, zero, index, vec_enc);
4820 }
4821 }
4822
4823 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
4824 XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
4825 AddressLiteral float_sign_flip, int src_vec_enc) {
4826 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4827
4828 Label done;
4829 // Compare the destination lanes with float_sign_flip
4830 // value to get mask for all special values.
4831 movdqu(xtmp1, float_sign_flip, rscratch);
4832 vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
4833 ptest(xtmp2, xtmp2);
4834 jccb(Assembler::equal, done);
4835
4836 // Flip float_sign_flip to get max integer value.
4837 vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
4838 pxor(xtmp1, xtmp4);
4839
4840 // Set destination lanes corresponding to unordered source lanes as zero.
4841 vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
4842 vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);
4843
4844 // Shuffle mask vector and pack lower double words from each quadword lane.
4845 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
4846 vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);
4847
4848 // Recompute the mask for the remaining special values.
4849 pxor(xtmp2, xtmp3);
4850 // Extract mask corresponding to non-negative source lanes.
4851 vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);
4852
4853 // Shuffle mask vector and pack lower double words from each quadword lane.
4854 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
4855 pand(xtmp3, xtmp2);
4856
4857 // Replace destination lanes holding the special value (0x80000000) with max int
4858 // if the corresponding source lane holds a +ve value.
4859 vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
4860 bind(done);
4861 }
4862
4863
4864 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
4865 XMMRegister xtmp, Register rscratch, int vec_enc) {
4866 switch(to_elem_bt) {
4867 case T_SHORT:
4868 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
4869 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
4870 vpackusdw(dst, dst, zero, vec_enc);
4871 if (vec_enc == Assembler::AVX_256bit) {
4872 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
4873 }
4874 break;
4875 case T_BYTE:
4876 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
4877 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
4878 vpackusdw(dst, dst, zero, vec_enc);
4879 if (vec_enc == Assembler::AVX_256bit) {
4880 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
4881 }
4882 vpackuswb(dst, dst, zero, vec_enc);
4883 break;
4884 default: assert(false, "%s", type2name(to_elem_bt));
4885 }
4886 }
4887
4888 /*
4889 * Algorithm for vector D2L and F2I conversions:-
4890 * a) Perform vector D2L/F2I cast.
4891 * b) Choose the fast path if none of the result vector lanes contains the 0x80000000 value.
4892 * That value signifies that the source could be one of the special floating point
4893 * values (NaN, -Inf, Inf, Max, -Min).
4894 * c) Set the destination to zero if the source is a NaN value.
4895 * d) Replace 0x80000000 with MaxInt if the source lane contains a +ve value.
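 *    For example, under Java semantics (int)Float.NaN == 0, (int)-1.0e30f == Integer.MIN_VALUE and
 *    (int)1.0e30f == Integer.MAX_VALUE; steps c) and d) restore exactly these results for lanes where
 *    the raw conversion produced 0x80000000.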
4896 */ 4897 4898 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4899 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 4900 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 4901 int to_elem_sz = type2aelembytes(to_elem_bt); 4902 assert(to_elem_sz <= 4, ""); 4903 vcvttps2dq(dst, src, vec_enc); 4904 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc); 4905 if (to_elem_sz < 4) { 4906 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 4907 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc); 4908 } 4909 } 4910 4911 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4912 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip, 4913 Register rscratch, int vec_enc) { 4914 int to_elem_sz = type2aelembytes(to_elem_bt); 4915 assert(to_elem_sz <= 4, ""); 4916 vcvttps2dq(dst, src, vec_enc); 4917 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc); 4918 switch(to_elem_bt) { 4919 case T_INT: 4920 break; 4921 case T_SHORT: 4922 evpmovdw(dst, dst, vec_enc); 4923 break; 4924 case T_BYTE: 4925 evpmovdb(dst, dst, vec_enc); 4926 break; 4927 default: assert(false, "%s", type2name(to_elem_bt)); 4928 } 4929 } 4930 4931 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 4932 KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip, 4933 Register rscratch, int vec_enc) { 4934 evcvttps2qq(dst, src, vec_enc); 4935 vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc); 4936 } 4937 4938 // Handling for downcasting from double to integer or sub-word types on AVX2. 4939 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4940 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, 4941 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 4942 int to_elem_sz = type2aelembytes(to_elem_bt); 4943 assert(to_elem_sz < 8, ""); 4944 vcvttpd2dq(dst, src, vec_enc); 4945 vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch, 4946 float_sign_flip, vec_enc); 4947 if (to_elem_sz < 4) { 4948 // xtmp4 holds all zero lanes. 
4949 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit); 4950 } 4951 } 4952 4953 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, 4954 XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, 4955 KRegister ktmp2, AddressLiteral sign_flip, 4956 Register rscratch, int vec_enc) { 4957 if (VM_Version::supports_avx512dq()) { 4958 evcvttpd2qq(dst, src, vec_enc); 4959 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 4960 switch(to_elem_bt) { 4961 case T_LONG: 4962 break; 4963 case T_INT: 4964 evpmovsqd(dst, dst, vec_enc); 4965 break; 4966 case T_SHORT: 4967 evpmovsqd(dst, dst, vec_enc); 4968 evpmovdw(dst, dst, vec_enc); 4969 break; 4970 case T_BYTE: 4971 evpmovsqd(dst, dst, vec_enc); 4972 evpmovdb(dst, dst, vec_enc); 4973 break; 4974 default: assert(false, "%s", type2name(to_elem_bt)); 4975 } 4976 } else { 4977 assert(type2aelembytes(to_elem_bt) <= 4, ""); 4978 vcvttpd2dq(dst, src, vec_enc); 4979 vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 4980 switch(to_elem_bt) { 4981 case T_INT: 4982 break; 4983 case T_SHORT: 4984 evpmovdw(dst, dst, vec_enc); 4985 break; 4986 case T_BYTE: 4987 evpmovdb(dst, dst, vec_enc); 4988 break; 4989 default: assert(false, "%s", type2name(to_elem_bt)); 4990 } 4991 } 4992 } 4993 4994 #ifdef _LP64 4995 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src, 4996 AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 4997 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 4998 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 4999 // and re-instantiate original MXCSR.RC mode after that. 5000 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5001 5002 mov64(tmp, julong_cast(0.5L)); 5003 evpbroadcastq(xtmp1, tmp, vec_enc); 5004 vaddpd(xtmp1, src , xtmp1, vec_enc); 5005 evcvtpd2qq(dst, xtmp1, vec_enc); 5006 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 5007 double_sign_flip, vec_enc);; 5008 5009 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5010 } 5011 5012 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src, 5013 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5014 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 5015 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5016 // and re-instantiate original MXCSR.RC mode after that. 
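  // For example, src == 2.5f yields floor(3.0f) == 3 and src == -2.5f yields floor(-2.0f) == -2,
  // matching the round-half-up tie behaviour of Math.round.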
5017 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5018 5019 movl(tmp, jint_cast(0.5)); 5020 movq(xtmp1, tmp); 5021 vbroadcastss(xtmp1, xtmp1, vec_enc); 5022 vaddps(xtmp1, src , xtmp1, vec_enc); 5023 vcvtps2dq(dst, xtmp1, vec_enc); 5024 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 5025 float_sign_flip, vec_enc); 5026 5027 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5028 } 5029 5030 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src, 5031 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5032 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) { 5033 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5034 // and re-instantiate original MXCSR.RC mode after that. 5035 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5036 5037 movl(tmp, jint_cast(0.5)); 5038 movq(xtmp1, tmp); 5039 vbroadcastss(xtmp1, xtmp1, vec_enc); 5040 vaddps(xtmp1, src , xtmp1, vec_enc); 5041 vcvtps2dq(dst, xtmp1, vec_enc); 5042 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc); 5043 5044 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5045 } 5046 #endif // _LP64 5047 5048 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 5049 BasicType from_elem_bt, BasicType to_elem_bt) { 5050 switch (from_elem_bt) { 5051 case T_BYTE: 5052 switch (to_elem_bt) { 5053 case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break; 5054 case T_INT: vpmovzxbd(dst, src, vlen_enc); break; 5055 case T_LONG: vpmovzxbq(dst, src, vlen_enc); break; 5056 default: ShouldNotReachHere(); 5057 } 5058 break; 5059 case T_SHORT: 5060 switch (to_elem_bt) { 5061 case T_INT: vpmovzxwd(dst, src, vlen_enc); break; 5062 case T_LONG: vpmovzxwq(dst, src, vlen_enc); break; 5063 default: ShouldNotReachHere(); 5064 } 5065 break; 5066 case T_INT: 5067 assert(to_elem_bt == T_LONG, ""); 5068 vpmovzxdq(dst, src, vlen_enc); 5069 break; 5070 default: 5071 ShouldNotReachHere(); 5072 } 5073 } 5074 5075 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 5076 BasicType from_elem_bt, BasicType to_elem_bt) { 5077 switch (from_elem_bt) { 5078 case T_BYTE: 5079 switch (to_elem_bt) { 5080 case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break; 5081 case T_INT: vpmovsxbd(dst, src, vlen_enc); break; 5082 case T_LONG: vpmovsxbq(dst, src, vlen_enc); break; 5083 default: ShouldNotReachHere(); 5084 } 5085 break; 5086 case T_SHORT: 5087 switch (to_elem_bt) { 5088 case T_INT: vpmovsxwd(dst, src, vlen_enc); break; 5089 case T_LONG: vpmovsxwq(dst, src, vlen_enc); break; 5090 default: ShouldNotReachHere(); 5091 } 5092 break; 5093 case T_INT: 5094 assert(to_elem_bt == T_LONG, ""); 5095 vpmovsxdq(dst, src, vlen_enc); 5096 break; 5097 default: 5098 ShouldNotReachHere(); 5099 } 5100 } 5101 5102 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src, 5103 BasicType dst_bt, BasicType src_bt, int vlen) { 5104 int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen); 5105 assert(vlen_enc != AVX_512bit, ""); 5106 5107 int dst_bt_size = type2aelembytes(dst_bt); 5108 int src_bt_size = type2aelembytes(src_bt); 5109 if (dst_bt_size > src_bt_size) { 5110 switch (dst_bt_size / src_bt_size) { 5111 case 2: vpmovsxbw(dst, src, vlen_enc); break; 5112 case 4: vpmovsxbd(dst, src, 
vlen_enc); break; 5113 case 8: vpmovsxbq(dst, src, vlen_enc); break; 5114 default: ShouldNotReachHere(); 5115 } 5116 } else { 5117 assert(dst_bt_size < src_bt_size, ""); 5118 switch (src_bt_size / dst_bt_size) { 5119 case 2: { 5120 if (vlen_enc == AVX_128bit) { 5121 vpacksswb(dst, src, src, vlen_enc); 5122 } else { 5123 vpacksswb(dst, src, src, vlen_enc); 5124 vpermq(dst, dst, 0x08, vlen_enc); 5125 } 5126 break; 5127 } 5128 case 4: { 5129 if (vlen_enc == AVX_128bit) { 5130 vpackssdw(dst, src, src, vlen_enc); 5131 vpacksswb(dst, dst, dst, vlen_enc); 5132 } else { 5133 vpackssdw(dst, src, src, vlen_enc); 5134 vpermq(dst, dst, 0x08, vlen_enc); 5135 vpacksswb(dst, dst, dst, AVX_128bit); 5136 } 5137 break; 5138 } 5139 case 8: { 5140 if (vlen_enc == AVX_128bit) { 5141 vpshufd(dst, src, 0x08, vlen_enc); 5142 vpackssdw(dst, dst, dst, vlen_enc); 5143 vpacksswb(dst, dst, dst, vlen_enc); 5144 } else { 5145 vpshufd(dst, src, 0x08, vlen_enc); 5146 vpermq(dst, dst, 0x08, vlen_enc); 5147 vpackssdw(dst, dst, dst, AVX_128bit); 5148 vpacksswb(dst, dst, dst, AVX_128bit); 5149 } 5150 break; 5151 } 5152 default: ShouldNotReachHere(); 5153 } 5154 } 5155 } 5156 5157 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3, 5158 bool merge, BasicType bt, int vlen_enc) { 5159 if (bt == T_INT) { 5160 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5161 } else { 5162 assert(bt == T_LONG, ""); 5163 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5164 } 5165 } 5166 5167 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3, 5168 bool merge, BasicType bt, int vlen_enc) { 5169 if (bt == T_INT) { 5170 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5171 } else { 5172 assert(bt == T_LONG, ""); 5173 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5174 } 5175 } 5176 5177 #ifdef _LP64 5178 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1, 5179 Register rtmp2, XMMRegister xtmp, int mask_len, 5180 int vec_enc) { 5181 int index = 0; 5182 int vindex = 0; 5183 mov64(rtmp1, 0x0101010101010101L); 5184 pdepq(rtmp1, src, rtmp1); 5185 if (mask_len > 8) { 5186 movq(rtmp2, src); 5187 vpxor(xtmp, xtmp, xtmp, vec_enc); 5188 movq(xtmp, rtmp1); 5189 } 5190 movq(dst, rtmp1); 5191 5192 mask_len -= 8; 5193 while (mask_len > 0) { 5194 assert ((mask_len & 0x7) == 0, "mask must be multiple of 8"); 5195 index++; 5196 if ((index % 2) == 0) { 5197 pxor(xtmp, xtmp); 5198 } 5199 mov64(rtmp1, 0x0101010101010101L); 5200 shrq(rtmp2, 8); 5201 pdepq(rtmp1, rtmp2, rtmp1); 5202 pinsrq(xtmp, rtmp1, index % 2); 5203 vindex = index / 2; 5204 if (vindex) { 5205 // Write entire 16 byte vector when both 64 bit 5206 // lanes are update to save redundant instructions. 
5207 if (index % 2) { 5208 vinsertf128(dst, dst, xtmp, vindex); 5209 } 5210 } else { 5211 vmovdqu(dst, xtmp); 5212 } 5213 mask_len -= 8; 5214 } 5215 } 5216 5217 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) { 5218 switch(opc) { 5219 case Op_VectorMaskTrueCount: 5220 popcntq(dst, tmp); 5221 break; 5222 case Op_VectorMaskLastTrue: 5223 if (VM_Version::supports_lzcnt()) { 5224 lzcntq(tmp, tmp); 5225 movl(dst, 63); 5226 subl(dst, tmp); 5227 } else { 5228 movl(dst, -1); 5229 bsrq(tmp, tmp); 5230 cmov32(Assembler::notZero, dst, tmp); 5231 } 5232 break; 5233 case Op_VectorMaskFirstTrue: 5234 if (VM_Version::supports_bmi1()) { 5235 if (masklen < 32) { 5236 orl(tmp, 1 << masklen); 5237 tzcntl(dst, tmp); 5238 } else if (masklen == 32) { 5239 tzcntl(dst, tmp); 5240 } else { 5241 assert(masklen == 64, ""); 5242 tzcntq(dst, tmp); 5243 } 5244 } else { 5245 if (masklen < 32) { 5246 orl(tmp, 1 << masklen); 5247 bsfl(dst, tmp); 5248 } else { 5249 assert(masklen == 32 || masklen == 64, ""); 5250 movl(dst, masklen); 5251 if (masklen == 32) { 5252 bsfl(tmp, tmp); 5253 } else { 5254 bsfq(tmp, tmp); 5255 } 5256 cmov32(Assembler::notZero, dst, tmp); 5257 } 5258 } 5259 break; 5260 case Op_VectorMaskToLong: 5261 assert(dst == tmp, "Dst and tmp should be the same for toLong operations"); 5262 break; 5263 default: assert(false, "Unhandled mask operation"); 5264 } 5265 } 5266 5267 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp, 5268 int masklen, int masksize, int vec_enc) { 5269 assert(VM_Version::supports_popcnt(), ""); 5270 5271 if(VM_Version::supports_avx512bw()) { 5272 kmovql(tmp, mask); 5273 } else { 5274 assert(masklen <= 16, ""); 5275 kmovwl(tmp, mask); 5276 } 5277 5278 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5279 // operations needs to be clipped. 5280 if (masksize < 16 && opc != Op_VectorMaskFirstTrue) { 5281 andq(tmp, (1 << masklen) - 1); 5282 } 5283 5284 vector_mask_operation_helper(opc, dst, tmp, masklen); 5285 } 5286 5287 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp, 5288 Register tmp, int masklen, BasicType bt, int vec_enc) { 5289 assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) || 5290 (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), ""); 5291 assert(VM_Version::supports_popcnt(), ""); 5292 5293 bool need_clip = false; 5294 switch(bt) { 5295 case T_BOOLEAN: 5296 // While masks of other types contain 0, -1; boolean masks contain lane values of 0, 1 5297 vpxor(xtmp, xtmp, xtmp, vec_enc); 5298 vpsubb(xtmp, xtmp, mask, vec_enc); 5299 vpmovmskb(tmp, xtmp, vec_enc); 5300 need_clip = masklen < 16; 5301 break; 5302 case T_BYTE: 5303 vpmovmskb(tmp, mask, vec_enc); 5304 need_clip = masklen < 16; 5305 break; 5306 case T_SHORT: 5307 vpacksswb(xtmp, mask, mask, vec_enc); 5308 if (masklen >= 16) { 5309 vpermpd(xtmp, xtmp, 8, vec_enc); 5310 } 5311 vpmovmskb(tmp, xtmp, Assembler::AVX_128bit); 5312 need_clip = masklen < 16; 5313 break; 5314 case T_INT: 5315 case T_FLOAT: 5316 vmovmskps(tmp, mask, vec_enc); 5317 need_clip = masklen < 4; 5318 break; 5319 case T_LONG: 5320 case T_DOUBLE: 5321 vmovmskpd(tmp, mask, vec_enc); 5322 need_clip = masklen < 2; 5323 break; 5324 default: assert(false, "Unhandled type, %s", type2name(bt)); 5325 } 5326 5327 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5328 // operations needs to be clipped. 
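  // For example, a 2-lane int mask (masklen == 2) is extracted with vmovmskps, which still produces
  // four bits, so the extracted value is masked with (1 << 2) - 1 == 0x3 before the helper runs.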
5329 if (need_clip && opc != Op_VectorMaskFirstTrue) { 5330 // need_clip implies masklen < 32 5331 andq(tmp, (1 << masklen) - 1); 5332 } 5333 5334 vector_mask_operation_helper(opc, dst, tmp, masklen); 5335 } 5336 5337 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1, 5338 Register rtmp2, int mask_len) { 5339 kmov(rtmp1, src); 5340 andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len))); 5341 mov64(rtmp2, -1L); 5342 pextq(rtmp2, rtmp2, rtmp1); 5343 kmov(dst, rtmp2); 5344 } 5345 5346 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src, 5347 XMMRegister mask, Register rtmp, Register rscratch, 5348 XMMRegister permv, XMMRegister xtmp, BasicType bt, 5349 int vec_enc) { 5350 assert(type2aelembytes(bt) >= 4, ""); 5351 assert(opcode == Op_CompressV || opcode == Op_ExpandV, ""); 5352 address compress_perm_table = nullptr; 5353 address expand_perm_table = nullptr; 5354 if (type2aelembytes(bt) == 8) { 5355 compress_perm_table = StubRoutines::x86::compress_perm_table64(); 5356 expand_perm_table = StubRoutines::x86::expand_perm_table64(); 5357 vmovmskpd(rtmp, mask, vec_enc); 5358 } else { 5359 compress_perm_table = StubRoutines::x86::compress_perm_table32(); 5360 expand_perm_table = StubRoutines::x86::expand_perm_table32(); 5361 vmovmskps(rtmp, mask, vec_enc); 5362 } 5363 shlq(rtmp, 5); // for 32 byte permute row. 5364 if (opcode == Op_CompressV) { 5365 lea(rscratch, ExternalAddress(compress_perm_table)); 5366 } else { 5367 lea(rscratch, ExternalAddress(expand_perm_table)); 5368 } 5369 addptr(rtmp, rscratch); 5370 vmovdqu(permv, Address(rtmp)); 5371 vpermps(dst, permv, src, Assembler::AVX_256bit); 5372 vpxor(xtmp, xtmp, xtmp, vec_enc); 5373 // Blend the result with zero vector using permute mask, each column entry 5374 // in a permute table row contains either a valid permute index or a -1 (default) 5375 // value, this can potentially be used as a blending mask after 5376 // compressing/expanding the source vector lanes. 
5377 vblendvps(dst, dst, xtmp, permv, vec_enc, false, permv); 5378 } 5379 5380 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask, 5381 bool merge, BasicType bt, int vec_enc) { 5382 if (opcode == Op_CompressV) { 5383 switch(bt) { 5384 case T_BYTE: 5385 evpcompressb(dst, mask, src, merge, vec_enc); 5386 break; 5387 case T_CHAR: 5388 case T_SHORT: 5389 evpcompressw(dst, mask, src, merge, vec_enc); 5390 break; 5391 case T_INT: 5392 evpcompressd(dst, mask, src, merge, vec_enc); 5393 break; 5394 case T_FLOAT: 5395 evcompressps(dst, mask, src, merge, vec_enc); 5396 break; 5397 case T_LONG: 5398 evpcompressq(dst, mask, src, merge, vec_enc); 5399 break; 5400 case T_DOUBLE: 5401 evcompresspd(dst, mask, src, merge, vec_enc); 5402 break; 5403 default: 5404 fatal("Unsupported type %s", type2name(bt)); 5405 break; 5406 } 5407 } else { 5408 assert(opcode == Op_ExpandV, ""); 5409 switch(bt) { 5410 case T_BYTE: 5411 evpexpandb(dst, mask, src, merge, vec_enc); 5412 break; 5413 case T_CHAR: 5414 case T_SHORT: 5415 evpexpandw(dst, mask, src, merge, vec_enc); 5416 break; 5417 case T_INT: 5418 evpexpandd(dst, mask, src, merge, vec_enc); 5419 break; 5420 case T_FLOAT: 5421 evexpandps(dst, mask, src, merge, vec_enc); 5422 break; 5423 case T_LONG: 5424 evpexpandq(dst, mask, src, merge, vec_enc); 5425 break; 5426 case T_DOUBLE: 5427 evexpandpd(dst, mask, src, merge, vec_enc); 5428 break; 5429 default: 5430 fatal("Unsupported type %s", type2name(bt)); 5431 break; 5432 } 5433 } 5434 } 5435 #endif 5436 5437 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5438 KRegister ktmp1, int vec_enc) { 5439 if (opcode == Op_SignumVD) { 5440 vsubpd(dst, zero, one, vec_enc); 5441 // if src < 0 ? -1 : 1 5442 evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5443 evblendmpd(dst, ktmp1, one, dst, true, vec_enc); 5444 // if src == NaN, -0.0 or 0.0 return src. 5445 evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5446 evblendmpd(dst, ktmp1, dst, src, true, vec_enc); 5447 } else { 5448 assert(opcode == Op_SignumVF, ""); 5449 vsubps(dst, zero, one, vec_enc); 5450 // if src < 0 ? -1 : 1 5451 evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5452 evblendmps(dst, ktmp1, one, dst, true, vec_enc); 5453 // if src == NaN, -0.0 or 0.0 return src. 5454 evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5455 evblendmps(dst, ktmp1, dst, src, true, vec_enc); 5456 } 5457 } 5458 5459 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5460 XMMRegister xtmp1, int vec_enc) { 5461 if (opcode == Op_SignumVD) { 5462 vsubpd(dst, zero, one, vec_enc); 5463 // if src < 0 ? -1 : 1 5464 vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1); 5465 // if src == NaN, -0.0 or 0.0 return src. 5466 vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc); 5467 vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1); 5468 } else { 5469 assert(opcode == Op_SignumVF, ""); 5470 vsubps(dst, zero, one, vec_enc); 5471 // if src < 0 ? -1 : 1 5472 vblendvps(dst, one, dst, src, vec_enc, true, xtmp1); 5473 // if src == NaN, -0.0 or 0.0 return src. 
5474 vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5475 vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5476 }
5477 }
5478
5479 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
5480 if (VM_Version::supports_avx512bw()) {
5481 if (mask_len > 32) {
5482 kmovql(dst, src);
5483 } else {
5484 kmovdl(dst, src);
5485 if (mask_len != 32) {
5486 kshiftrdl(dst, dst, 32 - mask_len);
5487 }
5488 }
5489 } else {
5490 assert(mask_len <= 16, "");
5491 kmovwl(dst, src);
5492 if (mask_len != 16) {
5493 kshiftrwl(dst, dst, 16 - mask_len);
5494 }
5495 }
5496 }
5497
5498 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5499 int lane_size = type2aelembytes(bt);
5500 bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
5501 if ((is_LP64 || lane_size < 8) &&
5502 ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5503 (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) {
5504 movptr(rtmp, imm32);
5505 switch(lane_size) {
5506 case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5507 case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5508 case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5509 case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
5510 default : fatal("Unsupported lane size %d", lane_size);
5511 break;
5512 }
5513 } else {
5514 movptr(rtmp, imm32);
5515 LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp));
5516 switch(lane_size) {
5517 case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5518 case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5519 case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5520 case 8 : vpbroadcastq(dst, dst, vec_enc); break;
5521 default : fatal("Unsupported lane size %d", lane_size);
5522 break;
5523 }
5524 }
5525 }
5526
5527 //
5528 // Following is a lookup table based popcount computation algorithm:-
5529 // Index Bit set count
5530 // [ 0000 -> 0,
5531 // 0001 -> 1,
5532 // 0010 -> 1,
5533 // 0011 -> 2,
5534 // 0100 -> 1,
5535 // 0101 -> 2,
5536 // 0110 -> 2,
5537 // 0111 -> 3,
5538 // 1000 -> 1,
5539 // 1001 -> 2,
5540 // 1010 -> 2,
5541 // 1011 -> 3,
5542 // 1100 -> 2,
5543 // 1101 -> 3,
5544 // 1110 -> 3, 1111 -> 4 ]
5545 // a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
5546 // shuffle indices for lookup table access.
5547 // b. Right shift each byte of vector lane by 4 positions.
5548 // c. Count the number of 1s in 4 MSB bits of each byte. These bits are used as
5549 // shuffle indices for lookup table access.
5550 // d. Add the bitset count of upper and lower 4 bits of each byte.
5551 // e. Unpack double words to quad words and compute sum of absolute difference of bitset
5552 // count of all the bytes of a quadword.
5553 // f. Perform step e. for upper 128bit vector lane.
5554 // g. Pack the bitset count of quadwords back to double word.
5555 // h. Unpacking and packing operations are not needed for 64bit vector lane.
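// For example, the byte 0xA7 (binary 1010 0111) is split into the nibbles 0x7 (popcount 3) and
// 0xA (popcount 2) by steps a.-c., and step d. adds them to give a per-byte popcount of 5.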
5556 5557 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5558 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5559 assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), ""); 5560 vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc); 5561 vpsrlw(dst, src, 4, vec_enc); 5562 vpand(dst, dst, xtmp1, vec_enc); 5563 vpand(xtmp1, src, xtmp1, vec_enc); 5564 vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg); 5565 vpshufb(xtmp1, xtmp2, xtmp1, vec_enc); 5566 vpshufb(dst, xtmp2, dst, vec_enc); 5567 vpaddb(dst, dst, xtmp1, vec_enc); 5568 } 5569 5570 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5571 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5572 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5573 // Following code is as per steps e,f,g and h of above algorithm. 5574 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5575 vpunpckhdq(dst, xtmp1, xtmp2, vec_enc); 5576 vpsadbw(dst, dst, xtmp2, vec_enc); 5577 vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc); 5578 vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc); 5579 vpackuswb(dst, xtmp1, dst, vec_enc); 5580 } 5581 5582 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5583 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5584 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5585 // Add the popcount of upper and lower bytes of word. 5586 vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc); 5587 vpsrlw(dst, xtmp1, 8, vec_enc); 5588 vpand(xtmp1, xtmp1, xtmp2, vec_enc); 5589 vpaddw(dst, dst, xtmp1, vec_enc); 5590 } 5591 5592 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5593 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5594 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5595 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5596 vpsadbw(dst, xtmp1, xtmp2, vec_enc); 5597 } 5598 5599 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5600 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5601 switch(bt) { 5602 case T_LONG: 5603 vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5604 break; 5605 case T_INT: 5606 vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5607 break; 5608 case T_CHAR: 5609 case T_SHORT: 5610 vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5611 break; 5612 case T_BYTE: 5613 case T_BOOLEAN: 5614 vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5615 break; 5616 default: 5617 fatal("Unsupported type %s", type2name(bt)); 5618 break; 5619 } 5620 } 5621 5622 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src, 5623 KRegister mask, bool merge, int vec_enc) { 5624 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 5625 switch(bt) { 5626 case T_LONG: 5627 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5628 evpopcntq(dst, mask, src, merge, vec_enc); 5629 break; 5630 case T_INT: 5631 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5632 evpopcntd(dst, mask, src, merge, vec_enc); 5633 break; 5634 case T_CHAR: 5635 case T_SHORT: 5636 assert(VM_Version::supports_avx512_bitalg(), ""); 5637 evpopcntw(dst, mask, src, merge, vec_enc); 5638 break; 5639 case T_BYTE: 5640 case T_BOOLEAN: 5641 assert(VM_Version::supports_avx512_bitalg(), ""); 5642 evpopcntb(dst, mask, 
src, merge, vec_enc); 5643 break; 5644 default: 5645 fatal("Unsupported type %s", type2name(bt)); 5646 break; 5647 } 5648 } 5649 5650 #ifndef _LP64 5651 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) { 5652 assert(VM_Version::supports_avx512bw(), ""); 5653 kmovdl(tmp, src); 5654 kunpckdql(dst, tmp, tmp); 5655 } 5656 #endif 5657 5658 // Bit reversal algorithm first reverses the bits of each byte followed by 5659 // a byte level reversal for multi-byte primitive types (short/int/long). 5660 // Algorithm performs a lookup table access to get reverse bit sequence 5661 // corresponding to a 4 bit value. Thus a reverse bit sequence for a byte 5662 // is obtained by swapping the reverse bit sequences of upper and lower 5663 // nibble of a byte. 5664 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5665 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5666 if (VM_Version::supports_avx512vlbw()) { 5667 5668 // Get the reverse bit sequence of lower nibble of each byte. 5669 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg); 5670 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 5671 evpandq(dst, xtmp2, src, vec_enc); 5672 vpshufb(dst, xtmp1, dst, vec_enc); 5673 vpsllq(dst, dst, 4, vec_enc); 5674 5675 // Get the reverse bit sequence of upper nibble of each byte. 5676 vpandn(xtmp2, xtmp2, src, vec_enc); 5677 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 5678 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 5679 5680 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 5681 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 5682 evporq(xtmp2, dst, xtmp2, vec_enc); 5683 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 5684 5685 } else if(vec_enc == Assembler::AVX_512bit) { 5686 // Shift based bit reversal. 5687 assert(bt == T_LONG || bt == T_INT, ""); 5688 5689 // Swap lower and upper nibble of each byte. 5690 vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc); 5691 5692 // Swap two least and most significant bits of each nibble. 5693 vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc); 5694 5695 // Swap adjacent pair of bits. 5696 evmovdqul(xtmp1, k0, dst, true, vec_enc); 5697 vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc); 5698 5699 evmovdqul(xtmp1, k0, dst, true, vec_enc); 5700 vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc); 5701 } else { 5702 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp); 5703 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 5704 5705 // Get the reverse bit sequence of lower nibble of each byte. 5706 vpand(dst, xtmp2, src, vec_enc); 5707 vpshufb(dst, xtmp1, dst, vec_enc); 5708 vpsllq(dst, dst, 4, vec_enc); 5709 5710 // Get the reverse bit sequence of upper nibble of each byte. 5711 vpandn(xtmp2, xtmp2, src, vec_enc); 5712 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 5713 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 5714 5715 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 5716 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 
5717 vpor(xtmp2, dst, xtmp2, vec_enc); 5718 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 5719 } 5720 } 5721 5722 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc, 5723 XMMRegister xtmp, Register rscratch) { 5724 assert(VM_Version::supports_gfni(), ""); 5725 assert(rscratch != noreg || always_reachable(mask), "missing"); 5726 5727 // Galois field instruction based bit reversal based on following algorithm. 5728 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 5729 vpbroadcastq(xtmp, mask, vec_enc, rscratch); 5730 vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc); 5731 vector_reverse_byte(bt, dst, xtmp, vec_enc); 5732 } 5733 5734 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src, 5735 XMMRegister xtmp1, Register rtmp, int vec_enc) { 5736 vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc); 5737 evpandq(dst, xtmp1, src, vec_enc); 5738 vpsllq(dst, dst, nbits, vec_enc); 5739 vpandn(xtmp1, xtmp1, src, vec_enc); 5740 vpsrlq(xtmp1, xtmp1, nbits, vec_enc); 5741 evporq(dst, dst, xtmp1, vec_enc); 5742 } 5743 5744 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5745 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5746 // Shift based bit reversal. 5747 assert(VM_Version::supports_evex(), ""); 5748 switch(bt) { 5749 case T_LONG: 5750 // Swap upper and lower double word of each quad word. 5751 evprorq(xtmp1, k0, src, 32, true, vec_enc); 5752 evprord(xtmp1, k0, xtmp1, 16, true, vec_enc); 5753 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 5754 break; 5755 case T_INT: 5756 // Swap upper and lower word of each double word. 5757 evprord(xtmp1, k0, src, 16, true, vec_enc); 5758 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 5759 break; 5760 case T_CHAR: 5761 case T_SHORT: 5762 // Swap upper and lower byte of each word. 5763 vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc); 5764 break; 5765 case T_BYTE: 5766 evmovdquq(dst, k0, src, true, vec_enc); 5767 break; 5768 default: 5769 fatal("Unsupported type %s", type2name(bt)); 5770 break; 5771 } 5772 } 5773 5774 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) { 5775 if (bt == T_BYTE) { 5776 if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) { 5777 evmovdquq(dst, k0, src, true, vec_enc); 5778 } else { 5779 vmovdqu(dst, src); 5780 } 5781 return; 5782 } 5783 // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using 5784 // pre-computed shuffle indices. 
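  // For example, the T_INT permutation mask reverses the bytes within every 4-byte element:
  // [b0 b1 b2 b3] -> [b3 b2 b1 b0].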
5785 switch(bt) { 5786 case T_LONG: 5787 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg); 5788 break; 5789 case T_INT: 5790 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg); 5791 break; 5792 case T_CHAR: 5793 case T_SHORT: 5794 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg); 5795 break; 5796 default: 5797 fatal("Unsupported type %s", type2name(bt)); 5798 break; 5799 } 5800 vpshufb(dst, src, dst, vec_enc); 5801 } 5802 5803 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src, 5804 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 5805 KRegister ktmp, Register rtmp, bool merge, int vec_enc) { 5806 assert(is_integral_type(bt), ""); 5807 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 5808 assert(VM_Version::supports_avx512cd(), ""); 5809 switch(bt) { 5810 case T_LONG: 5811 evplzcntq(dst, ktmp, src, merge, vec_enc); 5812 break; 5813 case T_INT: 5814 evplzcntd(dst, ktmp, src, merge, vec_enc); 5815 break; 5816 case T_SHORT: 5817 vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc); 5818 vpunpcklwd(xtmp2, xtmp1, src, vec_enc); 5819 evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc); 5820 vpunpckhwd(dst, xtmp1, src, vec_enc); 5821 evplzcntd(dst, ktmp, dst, merge, vec_enc); 5822 vpackusdw(dst, xtmp2, dst, vec_enc); 5823 break; 5824 case T_BYTE: 5825 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 5826 // accessing the lookup table. 5827 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 5828 // accessing the lookup table. 5829 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 5830 assert(VM_Version::supports_avx512bw(), ""); 5831 evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp); 5832 vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc); 5833 vpand(xtmp2, dst, src, vec_enc); 5834 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 5835 vpsrlw(xtmp3, src, 4, vec_enc); 5836 vpand(xtmp3, dst, xtmp3, vec_enc); 5837 vpshufb(dst, xtmp1, xtmp3, vec_enc); 5838 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 5839 evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc); 5840 evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc); 5841 break; 5842 default: 5843 fatal("Unsupported type %s", type2name(bt)); 5844 break; 5845 } 5846 } 5847 5848 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5849 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 5850 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp); 5851 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 5852 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 5853 // accessing the lookup table. 5854 vpand(dst, xtmp2, src, vec_enc); 5855 vpshufb(dst, xtmp1, dst, vec_enc); 5856 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 5857 // accessing the lookup table. 5858 vpsrlw(xtmp3, src, 4, vec_enc); 5859 vpand(xtmp3, xtmp2, xtmp3, vec_enc); 5860 vpshufb(xtmp2, xtmp1, xtmp3, vec_enc); 5861 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 
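  // For example, the byte 0x05 has an all-zero upper nibble (lzcnt 4) and a lower nibble of 0101
  // (lzcnt 1), so its leading zero count is 4 + 1 == 5.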
5862 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 5863 vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc); 5864 vpaddb(dst, dst, xtmp2, vec_enc); 5865 vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc); 5866 } 5867 5868 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5869 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 5870 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 5871 // Add zero counts of lower byte and upper byte of a word if 5872 // upper byte holds a zero value. 5873 vpsrlw(xtmp3, src, 8, vec_enc); 5874 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 5875 vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc); 5876 vpsllw(xtmp2, dst, 8, vec_enc); 5877 vpaddw(xtmp2, xtmp2, dst, vec_enc); 5878 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 5879 vpsrlw(dst, dst, 8, vec_enc); 5880 } 5881 5882 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5883 XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) { 5884 // Since IEEE 754 floating point format represents mantissa in 1.0 format 5885 // hence biased exponent can be used to compute leading zero count as per 5886 // following formula:- 5887 // LZCNT = 32 - (biased_exp - 127) 5888 // Special handling has been introduced for Zero, Max_Int and -ve source values. 5889 5890 // Broadcast 0xFF 5891 vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc); 5892 vpsrld(xtmp1, xtmp1, 24, vec_enc); 5893 5894 // Extract biased exponent. 5895 vcvtdq2ps(dst, src, vec_enc); 5896 vpsrld(dst, dst, 23, vec_enc); 5897 vpand(dst, dst, xtmp1, vec_enc); 5898 5899 // Broadcast 127. 5900 vpsrld(xtmp1, xtmp1, 1, vec_enc); 5901 // Exponent = biased_exp - 127 5902 vpsubd(dst, dst, xtmp1, vec_enc); 5903 5904 // Exponent = Exponent + 1 5905 vpsrld(xtmp3, xtmp1, 6, vec_enc); 5906 vpaddd(dst, dst, xtmp3, vec_enc); 5907 5908 // Replace -ve exponent with zero, exponent is -ve when src 5909 // lane contains a zero value. 5910 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5911 vblendvps(dst, dst, xtmp2, dst, vec_enc); 5912 5913 // Rematerialize broadcast 32. 5914 vpslld(xtmp1, xtmp3, 5, vec_enc); 5915 // Exponent is 32 if corresponding source lane contains max_int value. 5916 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 5917 // LZCNT = 32 - exponent 5918 vpsubd(dst, xtmp1, dst, vec_enc); 5919 5920 // Replace LZCNT with a value 1 if corresponding source lane 5921 // contains max_int value. 5922 vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc); 5923 5924 // Replace biased_exp with 0 if source lane value is less than zero. 5925 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5926 vblendvps(dst, dst, xtmp2, src, vec_enc); 5927 } 5928 5929 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5930 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 5931 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 5932 // Add zero counts of lower word and upper word of a double word if 5933 // upper word holds a zero value. 5934 vpsrld(xtmp3, src, 16, vec_enc); 5935 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 5936 vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc); 5937 vpslld(xtmp2, dst, 16, vec_enc); 5938 vpaddd(xtmp2, xtmp2, dst, vec_enc); 5939 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 5940 vpsrld(dst, dst, 16, vec_enc); 5941 // Add zero counts of lower doubleword and upper doubleword of a 5942 // quadword if upper doubleword holds a zero value. 
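  // For example, the quadword 0x00000000000000F0 has an all-zero upper doubleword (32 leading
  // zeros) and a lower doubleword with 24 leading zeros, giving a total count of 56.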
5943 vpsrlq(xtmp3, src, 32, vec_enc); 5944 vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc); 5945 vpsllq(xtmp2, dst, 32, vec_enc); 5946 vpaddq(xtmp2, xtmp2, dst, vec_enc); 5947 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 5948 vpsrlq(dst, dst, 32, vec_enc); 5949 } 5950 5951 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, 5952 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 5953 Register rtmp, int vec_enc) { 5954 assert(is_integral_type(bt), "unexpected type"); 5955 assert(vec_enc < Assembler::AVX_512bit, ""); 5956 switch(bt) { 5957 case T_LONG: 5958 vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 5959 break; 5960 case T_INT: 5961 vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc); 5962 break; 5963 case T_SHORT: 5964 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 5965 break; 5966 case T_BYTE: 5967 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 5968 break; 5969 default: 5970 fatal("Unsupported type %s", type2name(bt)); 5971 break; 5972 } 5973 } 5974 5975 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) { 5976 switch(bt) { 5977 case T_BYTE: 5978 vpsubb(dst, src1, src2, vec_enc); 5979 break; 5980 case T_SHORT: 5981 vpsubw(dst, src1, src2, vec_enc); 5982 break; 5983 case T_INT: 5984 vpsubd(dst, src1, src2, vec_enc); 5985 break; 5986 case T_LONG: 5987 vpsubq(dst, src1, src2, vec_enc); 5988 break; 5989 default: 5990 fatal("Unsupported type %s", type2name(bt)); 5991 break; 5992 } 5993 } 5994 5995 // Trailing zero count computation is based on leading zero count operation as per 5996 // following equation. All AVX3 targets support AVX512CD feature which offers 5997 // direct vector instruction to compute leading zero count. 
5998 // CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
5999 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6000 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6001 XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
6002 assert(is_integral_type(bt), "");
6003 // xtmp = -1
6004 vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
6005 // xtmp = xtmp + src
6006 vpadd(bt, xtmp4, xtmp4, src, vec_enc);
6007 // xtmp = xtmp & ~src
6008 vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
6009 vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
6010 vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
6011 vpsub(bt, dst, xtmp4, dst, vec_enc);
6012 }
6013
6014 // Trailing zero count computation for AVX2 targets is based on popcount operation as per the following equation:
6015 // CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
6016 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6017 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6018 assert(is_integral_type(bt), "");
6019 // xtmp = 0
6020 vpxor(xtmp3, xtmp3, xtmp3, vec_enc);
6021 // xtmp = 0 - src
6022 vpsub(bt, xtmp3, xtmp3, src, vec_enc);
6023 // xtmp = xtmp | src
6024 vpor(xtmp3, xtmp3, src, vec_enc);
6025 vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
6026 vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
6027 vpsub(bt, dst, xtmp1, dst, vec_enc);
6028 }
6029
6030 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
6031 Label done;
6032 Label neg_divisor_fastpath;
6033 cmpl(divisor, 0);
6034 jccb(Assembler::less, neg_divisor_fastpath);
6035 xorl(rdx, rdx);
6036 divl(divisor);
6037 jmpb(done);
6038 bind(neg_divisor_fastpath);
6039 // Fastpath for divisor < 0:
6040 // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6041 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
6042 movl(rdx, rax);
6043 subl(rdx, divisor);
6044 if (VM_Version::supports_bmi1()) {
6045 andnl(rax, rdx, rax);
6046 } else {
6047 notl(rdx);
6048 andl(rax, rdx);
6049 }
6050 shrl(rax, 31);
6051 bind(done);
6052 }
6053
6054 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
6055 Label done;
6056 Label neg_divisor_fastpath;
6057 cmpl(divisor, 0);
6058 jccb(Assembler::less, neg_divisor_fastpath);
6059 xorl(rdx, rdx);
6060 divl(divisor);
6061 jmpb(done);
6062 bind(neg_divisor_fastpath);
6063 // Fastpath when divisor < 0:
6064 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6065 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6066 movl(rdx, rax);
6067 subl(rax, divisor);
6068 if (VM_Version::supports_bmi1()) {
6069 andnl(rax, rax, rdx);
6070 } else {
6071 notl(rax);
6072 andl(rax, rdx);
6073 }
6074 sarl(rax, 31);
6075 andl(rax, divisor);
6076 subl(rdx, rax);
6077 bind(done);
6078 }
6079
6080 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
6081 Label done;
6082 Label neg_divisor_fastpath;
6083
6084 cmpl(divisor, 0);
6085 jccb(Assembler::less, neg_divisor_fastpath);
6086 xorl(rdx, rdx);
6087 divl(divisor);
6088 jmpb(done);
6089 bind(neg_divisor_fastpath);
6090 // Fastpath for divisor < 0:
6091 // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6092 //
remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor) 6093 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in 6094 // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned() 6095 movl(rdx, rax); 6096 subl(rax, divisor); 6097 if (VM_Version::supports_bmi1()) { 6098 andnl(rax, rax, rdx); 6099 } else { 6100 notl(rax); 6101 andl(rax, rdx); 6102 } 6103 movl(tmp, rax); 6104 shrl(rax, 31); // quotient 6105 sarl(tmp, 31); 6106 andl(tmp, divisor); 6107 subl(rdx, tmp); // remainder 6108 bind(done); 6109 } 6110 6111 #ifdef _LP64 6112 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1, 6113 XMMRegister xtmp2, Register rtmp) { 6114 if(VM_Version::supports_gfni()) { 6115 // Galois field instruction based bit reversal based on following algorithm. 6116 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 6117 mov64(rtmp, 0x8040201008040201L); 6118 movq(xtmp1, src); 6119 movq(xtmp2, rtmp); 6120 gf2p8affineqb(xtmp1, xtmp2, 0); 6121 movq(dst, xtmp1); 6122 } else { 6123 // Swap even and odd numbered bits. 6124 movl(rtmp, src); 6125 andl(rtmp, 0x55555555); 6126 shll(rtmp, 1); 6127 movl(dst, src); 6128 andl(dst, 0xAAAAAAAA); 6129 shrl(dst, 1); 6130 orl(dst, rtmp); 6131 6132 // Swap LSB and MSB 2 bits of each nibble. 6133 movl(rtmp, dst); 6134 andl(rtmp, 0x33333333); 6135 shll(rtmp, 2); 6136 andl(dst, 0xCCCCCCCC); 6137 shrl(dst, 2); 6138 orl(dst, rtmp); 6139 6140 // Swap LSB and MSB 4 bits of each byte. 6141 movl(rtmp, dst); 6142 andl(rtmp, 0x0F0F0F0F); 6143 shll(rtmp, 4); 6144 andl(dst, 0xF0F0F0F0); 6145 shrl(dst, 4); 6146 orl(dst, rtmp); 6147 } 6148 bswapl(dst); 6149 } 6150 6151 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1, 6152 XMMRegister xtmp2, Register rtmp1, Register rtmp2) { 6153 if(VM_Version::supports_gfni()) { 6154 // Galois field instruction based bit reversal based on following algorithm. 6155 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 6156 mov64(rtmp1, 0x8040201008040201L); 6157 movq(xtmp1, src); 6158 movq(xtmp2, rtmp1); 6159 gf2p8affineqb(xtmp1, xtmp2, 0); 6160 movq(dst, xtmp1); 6161 } else { 6162 // Swap even and odd numbered bits. 6163 movq(rtmp1, src); 6164 mov64(rtmp2, 0x5555555555555555L); 6165 andq(rtmp1, rtmp2); 6166 shlq(rtmp1, 1); 6167 movq(dst, src); 6168 notq(rtmp2); 6169 andq(dst, rtmp2); 6170 shrq(dst, 1); 6171 orq(dst, rtmp1); 6172 6173 // Swap LSB and MSB 2 bits of each nibble. 6174 movq(rtmp1, dst); 6175 mov64(rtmp2, 0x3333333333333333L); 6176 andq(rtmp1, rtmp2); 6177 shlq(rtmp1, 2); 6178 notq(rtmp2); 6179 andq(dst, rtmp2); 6180 shrq(dst, 2); 6181 orq(dst, rtmp1); 6182 6183 // Swap LSB and MSB 4 bits of each byte. 

void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
                                 XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
  if (VM_Version::supports_gfni()) {
    // Bit reversal using the Galois field affine instruction, based on the following algorithm:
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
    mov64(rtmp1, 0x8040201008040201L);
    movq(xtmp1, src);
    movq(xtmp2, rtmp1);
    gf2p8affineqb(xtmp1, xtmp2, 0);
    movq(dst, xtmp1);
  } else {
    // Swap even and odd numbered bits.
    movq(rtmp1, src);
    mov64(rtmp2, 0x5555555555555555L);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 1);
    movq(dst, src);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 1);
    orq(dst, rtmp1);

    // Swap the low and high 2 bits of each nibble.
    movq(rtmp1, dst);
    mov64(rtmp2, 0x3333333333333333L);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 2);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 2);
    orq(dst, rtmp1);

    // Swap the low and high 4 bits (nibbles) of each byte.
    movq(rtmp1, dst);
    mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 4);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 4);
    orq(dst, rtmp1);
  }
  bswapq(dst);
}

void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divq(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
  movq(rdx, rax);
  subq(rdx, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rdx, rax);
  } else {
    notq(rdx);
    andq(rax, rdx);
  }
  shrq(rax, 63);
  bind(done);
}

void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorq(rdx, rdx);
  divq(divisor);
  jmp(done);
  bind(neg_divisor_fastpath);
  // Fastpath when divisor < 0:
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
  movq(rdx, rax);
  subq(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rax, rdx);
  } else {
    notq(rax);
    andq(rax, rdx);
  }
  sarq(rax, 63);
  andq(rax, divisor);
  subq(rdx, rax);
  bind(done);
}

void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorq(rdx, rdx);
  divq(divisor);
  jmp(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
  // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
  movq(rdx, rax);
  subq(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rax, rdx);
  } else {
    notq(rax);
    andq(rax, rdx);
  }
  movq(tmp, rax);
  shrq(rax, 63); // quotient
  sarq(tmp, 63);
  andq(tmp, divisor);
  subq(rdx, tmp); // remainder
  bind(done);
}
#endif
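
// Illustrative scalar sketch (not HotSpot code; names are for illustration
// only) of the divisor-MSB-set fast path used by udivL/umodL/udivmodL above
// (the 32-bit variants use the same trick): when the divisor has its sign bit
// set, the unsigned quotient can only be 0 or 1, so quotient and remainder
// reduce to the masking expressions quoted in the comments above. The emitted
// code forms (q ? divisor : 0) branchlessly via an arithmetic shift and an AND.
//
//   #include <cassert>
//   #include <cstdint>
//
//   void udivmod_neg_divisor(uint64_t dividend, uint64_t divisor,
//                            uint64_t& q, uint64_t& r) {
//     uint64_t t = dividend & ~(dividend - divisor);
//     q = t >> 63;                      // 1 iff dividend >= divisor (unsigned)
//     r = dividend - (q ? divisor : 0);
//   }
//
//   int main() {
//     const uint64_t divisor = 0x8000000000000001ULL; // MSB set, i.e. "negative"
//     for (uint64_t dividend : {0ULL, 1ULL, 0x8000000000000000ULL,
//                               0x8000000000000001ULL, 0xFFFFFFFFFFFFFFFFULL}) {
//       uint64_t q, r;
//       udivmod_neg_divisor(dividend, divisor, q, r);
//       assert(q == dividend / divisor);
//       assert(r == dividend % divisor);
//     }
//   }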

void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
                                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
                                        int vlen_enc) {
  assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are in-lane operations and the in-lane index is taken from
  // the lower 4 bits of each shuffle byte, so all shuffle indices are
  // effectively normalized to the index range 0-15. Indices which differ by a
  // multiple of 16 therefore select the same relative position within a
  // 128-bit lane, e.g. shuffle indices 16, 32 and 48 all select the first
  // element of their respective 128-bit lanes.
  movl(rtmp, 16);
  evpbroadcastb(xtmp1, rtmp, vlen_enc);

  // Compute a mask for the shuffle vector by comparing indices with the expression INDEX < 16,
  // broadcast the first 128-bit lane across the entire vector, shuffle the vector lanes using
  // the original shuffle indices and move the shuffled lanes corresponding to a true
  // mask into the destination vector.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
  evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);

  // Perform the above steps with the lane comparison expression INDEX >= 16 && INDEX < 32
  // and broadcasting the second 128-bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
  vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform the above steps with the lane comparison expression INDEX >= 32 && INDEX < 48
  // and broadcasting the third 128-bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp2, Assembler::nlt, true, vlen_enc);
  vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform the above steps with the lane comparison expression INDEX >= 48 && INDEX < 64
  // and broadcasting the fourth 128-bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
  vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
}

void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
                                                   XMMRegister shuffle, XMMRegister src, int vlen_enc) {
  if (vlen_enc == AVX_128bit) {
    vpermilps(dst, src, shuffle, vlen_enc);
  } else if (bt == T_INT) {
    vpermd(dst, shuffle, src, vlen_enc);
  } else {
    assert(bt == T_FLOAT, "");
    vpermps(dst, shuffle, src, vlen_enc);
  }
}
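
// Illustrative scalar model (not HotSpot code; names are for illustration
// only, and all shuffle indices are assumed to already be in the valid range
// 0..63) of the lane-selection strategy in rearrange_bytes above: each shuffle
// index is split into a source 128-bit lane (selected via the ktmp masks and
// evshufi64x2 broadcasts) and an in-lane position (the low 4 bits consumed by
// vpshufb).
//
//   #include <array>
//   #include <cstdint>
//
//   std::array<uint8_t, 64> rearrange_bytes_model(const std::array<uint8_t, 64>& src,
//                                                 const std::array<uint8_t, 64>& shuffle) {
//     std::array<uint8_t, 64> dst{};
//     for (int i = 0; i < 64; i++) {
//       int idx  = shuffle[i];  // assumed 0..63
//       int lane = idx >> 4;    // which 128-bit lane of src to read from
//       int pos  = idx & 0x0F;  // position within that lane
//       dst[i] = src[lane * 16 + pos];
//     }
//     return dst;
//   }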