/*
 * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "oops/methodData.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/output.hpp"
#include "opto/opcodes.hpp"
#include "opto/subnode.hpp"
#include "runtime/globals.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/checkedCast.hpp"
#include "utilities/globalDefinitions.hpp"
#include "utilities/powerOfTwo.hpp"
#include "utilities/sizes.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

// C2 compiled method's prolog code.
void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {

  // WARNING: Initial instruction MUST be 5 bytes or longer so that
  // NativeJump::patch_verified_entry will be able to patch out the entry
  // code safely. The push to verify stack depth is ok at 5 bytes,
  // the frame allocation can be either 3 or 6 bytes. So if we don't do
  // stack bang then we must use the 6 byte frame allocation even if
  // we have no frame. :-(
  assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");

  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  // Remove word for return addr
  framesize -= wordSize;
  stack_bang_size -= wordSize;

  // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers must bang for them. But be careful, because
  // some VM calls (such as call site linkage) can use several kilobytes of
  // stack. But the stack safety zone should account for that.
  // See bugs 4446381, 4468289, 4497237.
  if (stack_bang_size > 0) {
    generate_stack_overflow_check(stack_bang_size);

    // We always push rbp, so that on return to interpreter rbp, will be
    // restored correctly and we can correct the stack.
    push(rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
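    // (With +PreserveFramePointer, rbp keeps pointing at the caller's frame so that
    //  frame-pointer based stack walkers, e.g. external profilers, can unwind compiled
    //  frames; assumed rationale, see the PreserveFramePointer flag definition.)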
    if (PreserveFramePointer) {
      mov(rbp, rsp);
    }
    // Remove word for ebp
    framesize -= wordSize;

    // Create frame
    if (framesize) {
      subptr(rsp, framesize);
    }
  } else {
    // Create frame (force generation of a 4 byte immediate value)
    subptr_imm32(rsp, framesize);

    // Save RBP register now.
    framesize -= wordSize;
    movptr(Address(rsp, framesize), rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      movptr(rbp, rsp);
      if (framesize > 0) {
        addptr(rbp, framesize);
      }
    }
  }

  if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
    framesize -= wordSize;
    movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
  }

#ifndef _LP64
  // If method sets FPU control word do it now
  if (fp_mode_24b) {
    fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24()));
  }
  if (UseSSE >= 2 && VerifyFPU) {
    verify_FPU(0, "FPU stack must be clean on entry");
  }
#endif

#ifdef ASSERT
  if (VerifyStackAtCalls) {
    Label L;
    push(rax);
    mov(rax, rsp);
    andptr(rax, StackAlignmentInBytes-1);
    cmpptr(rax, StackAlignmentInBytes-wordSize);
    pop(rax);
    jcc(Assembler::equal, L);
    STOP("Stack is not properly aligned!");
    bind(L);
  }
#endif

  if (!is_stub) {
    BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
#ifdef _LP64
    if (BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) {
      // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
      Label dummy_slow_path;
      Label dummy_continuation;
      Label* slow_path = &dummy_slow_path;
      Label* continuation = &dummy_continuation;
      if (!Compile::current()->output()->in_scratch_emit_size()) {
        // Use real labels from actual stub when not emitting code for the purpose of measuring its size
        C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
        Compile::current()->output()->add_stub(stub);
        slow_path = &stub->entry();
        continuation = &stub->continuation();
      }
      bs->nmethod_entry_barrier(this, slow_path, continuation);
    }
#else
    // Don't bother with out-of-line nmethod entry barrier stub for x86_32.
    bs->nmethod_entry_barrier(this, nullptr /* slow_path */, nullptr /* continuation */);
#endif
  }
}

inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
  switch (vlen_in_bytes) {
    case  4: // fall-through
    case  8: // fall-through
    case 16: return Assembler::AVX_128bit;
    case 32: return Assembler::AVX_256bit;
    case 64: return Assembler::AVX_512bit;

    default: {
      ShouldNotReachHere();
      return Assembler::AVX_NoVec;
    }
  }
}

// fast_lock and fast_unlock used by C2

// Because the transitions from emitted code to the runtime
// monitorenter/exit helper stubs are so slow it's critical that
// we inline both the stack-locking fast path and the inflated fast path.
//
// See also: cmpFastLock and cmpFastUnlock.
//
// What follows is a specialized inline transliteration of the code
// in enter() and exit(). If we're concerned about I$ bloat another
// option would be to emit TrySlowEnter and TrySlowExit methods
// at startup-time. These methods would accept arguments as
// (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
// indications in the icc.ZFlag.
// fast_lock and fast_unlock would simply
// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
// In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
//
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods. That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads. We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) explicit barriers or fence operations.
//
// TODO:
//
// * Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
//   This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
//   Given TLAB allocation, Self is usually manifested in a register, so passing it into
//   the lock operators would typically be faster than reifying Self.
//
// * Ideally I'd define the primitives as:
//     fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
//     fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
//   Unfortunately ADLC bugs prevent us from expressing the ideal form.
//   Instead, we're stuck with the rather awkward and brittle register assignments below.
//   Furthermore the register assignments are overconstrained, possibly resulting in
//   sub-optimal code near the synchronization site.
//
// * Eliminate the sp-proximity tests and just use "== Self" tests instead.
//   Alternately, use a better sp-proximity test.
//
// * Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
//   Either one is sufficient to uniquely identify a thread.
//   TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
//
// * Intrinsify notify() and notifyAll() for the common cases where the
//   object is locked by the calling thread but the waitlist is empty.
//   Avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
//
// * Use jccb and jmpb instead of jcc and jmp to improve code density.
//   But beware of excessive branch density on AMD Opterons.
//
// * Both fast_lock and fast_unlock set the ICC.ZF to indicate success
//   or failure of the fast path. If the fast path fails then we pass
//   control to the slow path, typically in C. In fast_lock and
//   fast_unlock we often branch to DONE_LABEL, just to find that C2
//   will emit a conditional branch immediately after the node.
//   So we have branches to branches and lots of ICC.ZF games.
//   Instead, it might be better to have C2 pass a "FailureLabel"
//   into fast_lock and fast_unlock. In the case of success, control
//   will drop through the node. ICC.ZF is undefined at exit.
//   In the case of failure, the node will branch directly to the
//   FailureLabel.


// obj: object to lock
// box: on-stack box address (displaced header location) - KILLED
// rax,: tmp -- KILLED
// scr: tmp -- KILLED
void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
                                  Register scrReg, Register cx1Reg, Register cx2Reg, Register thread,
                                  Metadata* method_data) {
  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
  // Ensure the register assignments are disjoint
  assert(tmpReg == rax, "");
  assert(cx1Reg == noreg, "");
  assert(cx2Reg == noreg, "");
  assert_different_registers(objReg, boxReg, tmpReg, scrReg);

  // Possible cases that we'll encounter in fast_lock
  // ------------------------------------------------
  // * Inflated
  //    -- unlocked
  //    -- Locked
  //       = by self
  //       = by other
  // * neutral
  // * stack-locked
  //    -- by self
  //       = sp-proximity test hits
  //       = sp-proximity test generates false-negative
  //    -- by other
  //

  Label IsInflated, DONE_LABEL, NO_COUNT, COUNT;

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(tmpReg, objReg, scrReg);
    movl(tmpReg, Address(tmpReg, Klass::access_flags_offset()));
    testl(tmpReg, JVM_ACC_IS_VALUE_BASED_CLASS);
    jcc(Assembler::notZero, DONE_LABEL);
  }

  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // [FETCH]
  testptr(tmpReg, markWord::monitor_value);   // inflated vs stack-locked|neutral
  jcc(Assembler::notZero, IsInflated);

  if (LockingMode == LM_MONITOR) {
    // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
    testptr(objReg, objReg);
  } else {
    assert(LockingMode == LM_LEGACY, "must be");
    // Attempt stack-locking ...
    orptr (tmpReg, markWord::unlocked_value);
    movptr(Address(boxReg, 0), tmpReg);            // Anticipate successful CAS
    lock();
    cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // Updates tmpReg
    jcc(Assembler::equal, COUNT);                  // Success

    // Recursive locking.
    // The object is stack-locked: markword contains stack pointer to BasicLock.
    // Locked by current thread if difference with current SP is less than one page.
    subptr(tmpReg, rsp);
    // Next instruction sets ZFlag == 1 (Success) if difference is less than one page.
    andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - (int)os::vm_page_size())) );
    movptr(Address(boxReg, 0), tmpReg);
  }
  jmp(DONE_LABEL);

  bind(IsInflated);
  // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value

#ifndef _LP64
  // The object is inflated.

  // boxReg refers to the on-stack BasicLock in the current frame.
  // We'd like to write:
  //   set box->_displaced_header = markWord::unused_mark(). Any non-0 value suffices.
  // This is convenient but results in a ST-before-CAS penalty. The following CAS suffers
  // additional latency as we have another ST in the store buffer that must drain.

  // avoid ST-before-CAS
  // register juggle because we need tmpReg for cmpxchgptr below
  movptr(scrReg, boxReg);
  movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]

  // Optimistic form: consider XORL tmpReg,tmpReg
  movptr(tmpReg, NULL_WORD);

  // Appears unlocked - try to swing _owner from null to non-null.
  // Ideally, I'd manifest "Self" with get_thread and then attempt
  // to CAS the register containing Self into m->Owner.
  // But we don't have enough registers, so instead we can either try to CAS
  // rsp or the address of the box (in scr) into &m->owner. If the CAS succeeds
  // we later store "Self" into m->Owner. Transiently storing a stack address
  // (rsp or the address of the box) into m->owner is harmless.
  // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
  lock();
  cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  movptr(Address(scrReg, 0), 3);            // box->_displaced_header = 3
  // If we weren't able to swing _owner from null to the BasicLock
  // then take the slow path.
  jccb  (Assembler::notZero, NO_COUNT);
  // update _owner from BasicLock to thread
  get_thread (scrReg);                      // beware: clobbers ICCs
  movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
  xorptr(boxReg, boxReg);                   // set icc.ZFlag = 1 to indicate success

  // If the CAS fails we can either retry or pass control to the slow path.
  // We use the latter tactic.
  // Pass the CAS result in the icc.ZFlag into DONE_LABEL
  // If the CAS was successful ...
  //   Self has acquired the lock
  // Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
  // Intentional fall-through into DONE_LABEL ...
#else // _LP64
  // It's inflated and we use scrReg for ObjectMonitor* in this section.
  movq(scrReg, tmpReg);
  xorq(tmpReg, tmpReg);
  lock();
  cmpxchgptr(thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // Unconditionally set box->_displaced_header = markWord::unused_mark().
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
  // Propagate ICC.ZF from CAS above into DONE_LABEL.
  jccb(Assembler::equal, COUNT);            // CAS above succeeded; propagate ZF = 1 (success)

  cmpptr(thread, rax);                      // Check if we are already the owner (recursive lock)
  jccb(Assembler::notEqual, NO_COUNT);      // If not recursive, ZF = 0 at this point (fail)
  incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  xorq(rax, rax);                           // Set ZF = 1 (success) for recursive lock, denoting locking success
#endif // _LP64
  bind(DONE_LABEL);

  // ZFlag == 1 count in fast path
  // ZFlag == 0 count in slow path
  jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0

  bind(COUNT);
  // Count monitors in fast path
  increment(Address(thread, JavaThread::held_monitor_count_offset()));

  xorl(tmpReg, tmpReg); // Set ZF == 1

  bind(NO_COUNT);

  // At NO_COUNT the icc ZFlag is set as follows ...
  // fast_unlock uses the same protocol.
  // ZFlag == 1 -> Success
  // ZFlag == 0 -> Failure - force control through the slow path
}

// obj: object to unlock
// box: box address (displaced header location), killed. Must be EAX.
// tmp: killed, cannot be obj nor box.
//
// Some commentary on balanced locking:
//
// fast_lock and fast_unlock are emitted only for provably balanced lock sites.
// Methods that don't have provably balanced locking are forced to run in the
// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
// The interpreter provides two properties:
// I1: At return-time the interpreter automatically and quietly unlocks any
//     objects acquired by the current activation (frame). Recall that the
//     interpreter maintains an on-stack list of locks currently held by
//     a frame.
// I2: If a method attempts to unlock an object that is not held by the
//     frame, the interpreter throws IMSX.
//
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
// B() doesn't have provably balanced locking so it runs in the interpreter.
// Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
// is still locked by A().
//
// The only other source of unbalanced locking would be JNI. The "Java Native Interface:
// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
// should not be unlocked by "normal" java-level locking and vice-versa. The specification
// doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably given that the spec legislates the JNI case as undefined our implementation
// could reasonably *avoid* checking owner in fast_unlock().
// In the interest of performance we elide the m->Owner==Self check in unlock.
// A perfectly viable alternative is to elide the owner check except when
// Xcheck:jni is enabled.

void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg) {
  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
  assert(boxReg == rax, "");
  assert_different_registers(objReg, boxReg, tmpReg);

  Label DONE_LABEL, Stacked, COUNT, NO_COUNT;

  if (LockingMode == LM_LEGACY) {
    cmpptr(Address(boxReg, 0), NULL_WORD);                            // Examine the displaced header
    jcc   (Assembler::zero, COUNT);                                   // 0 indicates recursive stack-lock
  }
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // Examine the object's markword
  if (LockingMode != LM_MONITOR) {
    testptr(tmpReg, markWord::monitor_value);                         // Inflated?
    jcc(Assembler::zero, Stacked);
  }

  // It's inflated.

  // Despite our balanced locking property we still check that m->_owner == Self
  // as java routines or native JNI code called by this thread might
  // have released the lock.
  // Refer to the comments in synchronizer.cpp for how we might encode extra
  // state in _succ so we can avoid fetching EntryList|cxq.
  //
  // If there's no contention try a 1-0 exit. That is, exit without
  // a costly MEMBAR or CAS. See synchronizer.cpp for details on how
  // we detect and recover from the race that the 1-0 exit admits.
  //
  // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
  // before it STs null into _owner, releasing the lock. Updates
  // to data protected by the critical section must be visible before
  // we drop the lock (and thus before any other thread could acquire
  // the lock and observe the fields protected by the lock).
  // IA32's memory-model is SPO, so STs are ordered with respect to
  // each other and there's no need for an explicit barrier (fence).
  // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
#ifndef _LP64
  // Note that we could employ various encoding schemes to reduce
  // the number of loads below (currently 4) to just 2 or 3.
  // Refer to the comments in synchronizer.cpp.
  // In practice the chain of fetches doesn't seem to impact performance, however.
  xorptr(boxReg, boxReg);
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jccb  (Assembler::notZero, DONE_LABEL);
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  jccb  (Assembler::notZero, DONE_LABEL);
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
  jmpb  (DONE_LABEL);
#else // _LP64
  // It's inflated
  Label CheckSucc, LNotRecursive, LSuccess, LGoSlowPath;

  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
  jccb(Assembler::equal, LNotRecursive);

  // Recursive inflated unlock
  decq(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jmpb(LSuccess);

  bind(LNotRecursive);
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  jccb  (Assembler::notZero, CheckSucc);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
  jmpb  (DONE_LABEL);

  // Try to avoid passing control into the slow_path ...
  bind  (CheckSucc);

  // The following optional optimization can be elided if necessary
  // Effectively: if (succ == null) goto slow path
  // The code reduces the window for a race, however,
  // and thus benefits performance.
  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
  jccb  (Assembler::zero, LGoSlowPath);

  xorptr(boxReg, boxReg);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);

  // Memory barrier/fence
  // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
  // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
  // This is faster on Nehalem and AMD Shanghai/Barcelona.
  // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
  // We might also restructure (ST Owner=0;barrier;LD _Succ) to
  // (mov box,0; xchgq box, &m->Owner; LD _succ) .
  lock(); addl(Address(rsp, 0), 0);

  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
  jccb  (Assembler::notZero, LSuccess);

  // Rare inopportune interleaving - race.
  // The successor vanished in the small window above.
  // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
  // We need to ensure progress and succession.
  // Try to reacquire the lock.
  // If that fails then the new owner is responsible for succession and this
  // thread needs to take no further action and can exit via the fast path (success).
  // If the re-acquire succeeds then pass control into the slow path.
  // As implemented, this latter mode is horrible because we generated more
  // coherence traffic on the lock *and* artificially extended the critical section
  // length by virtue of passing control into the slow path.

  // box is really RAX -- the following CMPXCHG depends on that binding
  // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
  lock();
  cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // There's no successor so we tried to regrab the lock.
  // If that didn't work, then another thread grabbed the
  // lock so we're done (and exit was a success).
  jccb  (Assembler::notEqual, LSuccess);
  // Intentional fall-through into slow path

  bind  (LGoSlowPath);
  orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
  jmpb  (DONE_LABEL);

  bind  (LSuccess);
  testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
  jmpb  (DONE_LABEL);

#endif
  if (LockingMode == LM_LEGACY) {
    bind  (Stacked);
    movptr(tmpReg, Address (boxReg, 0));      // re-fetch
    lock();
    cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
    // Intentional fall-thru into DONE_LABEL
  }

  bind(DONE_LABEL);

  // ZFlag == 1 count in fast path
  // ZFlag == 0 count in slow path
  jccb(Assembler::notZero, NO_COUNT);

  bind(COUNT);
  // Count monitors in fast path
#ifndef _LP64
  get_thread(tmpReg);
  decrementl(Address(tmpReg, JavaThread::held_monitor_count_offset()));
#else // _LP64
  decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
#endif

  xorl(tmpReg, tmpReg); // Set ZF == 1

  bind(NO_COUNT);
}

void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register rax_reg,
                                              Register t, Register thread) {
  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert(rax_reg == rax, "Used for CAS");
  assert_different_registers(obj, box, rax_reg, t, thread);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. ZF value is irrelevant.
  Label locked;
  // Finish fast lock unsuccessfully. MUST jump with ZF == 0
  Label slow_path;

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds.
    movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0);
  }

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(rax_reg, obj, t);
    movl(rax_reg, Address(rax_reg, Klass::access_flags_offset()));
    testl(rax_reg, JVM_ACC_IS_VALUE_BASED_CLASS);
    jcc(Assembler::notZero, slow_path);
  }

  const Register mark = t;

  { // Lightweight Lock

    Label push;

    const Register top = UseObjectMonitorTable ? rax_reg : box;

    // Load the mark.
    movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Prefetch top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    // Check for monitor (0b10).
    testptr(mark, markWord::monitor_value);
    jcc(Assembler::notZero, inflated);

    // Check if lock-stack is full.
    cmpl(top, LockStack::end_offset() - 1);
    jcc(Assembler::greater, slow_path);

    // Check if recursive.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    jccb(Assembler::equal, push);

    // Try to lock. Transition lock bits 0b01 => 0b00
    movptr(rax_reg, mark);
    orptr(rax_reg, markWord::unlocked_value);
    andptr(mark, ~(int32_t)markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    jcc(Assembler::notEqual, slow_path);

    if (UseObjectMonitorTable) {
      // Need to reload top, clobbered by CAS.
      movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
    }
    bind(push);
    // After successful lock, push object on lock-stack.
    movptr(Address(thread, top), obj);
    addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
    jmpb(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    const Register monitor = t;

    if (!UseObjectMonitorTable) {
      assert(mark == monitor, "should be the same here");
    } else {
      // Uses ObjectMonitorTable. Look for the monitor in the om_cache.
      // Fetch ObjectMonitor* from the cache or take the slow-path.
      Label monitor_found;

      // Load cache address
      lea(t, Address(thread, JavaThread::om_cache_oops_offset()));

      const int num_unrolled = 2;
      for (int i = 0; i < num_unrolled; i++) {
        cmpptr(obj, Address(t));
        jccb(Assembler::equal, monitor_found);
        if (i + 1 != num_unrolled) {
          increment(t, in_bytes(OMCache::oop_to_oop_difference()));
        }
      }

      // Loop after unrolling, advance iterator.
      increment(t, in_bytes(OMCache::oop_to_oop_difference()));

      Label loop;

      // Search for obj in cache.
      bind(loop);

      // Check for match.
      cmpptr(obj, Address(t));
      jccb(Assembler::equal, monitor_found);

      // Search until null encountered, guaranteed _null_sentinel at end.
      cmpptr(Address(t), 1);
      jcc(Assembler::below, slow_path); // 0 check, but with ZF=0 when *t == 0
      increment(t, in_bytes(OMCache::oop_to_oop_difference()));
      jmpb(loop);

      // Cache hit.
      bind(monitor_found);
      movptr(monitor, Address(t, OMCache::oop_to_monitor_difference()));
    }
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag};
    const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag};

    Label monitor_locked;
    // Lock the monitor.

    // CAS owner (null => current thread).
    xorptr(rax_reg, rax_reg);
    lock(); cmpxchgptr(thread, owner_address);
    jccb(Assembler::equal, monitor_locked);

    // Check if recursive.
    cmpptr(thread, rax_reg);
    jccb(Assembler::notEqual, slow_path);

    // Recursive.
    increment(recursions_address);

    bind(monitor_locked);
    if (UseObjectMonitorTable) {
      // Cache the monitor for unlock
      movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor);
    }
  }

  bind(locked);
  increment(Address(thread, JavaThread::held_monitor_count_offset()));
  // Set ZF = 1
  xorl(rax_reg, rax_reg);

#ifdef ASSERT
  // Check that locked label is reached with ZF set.
  Label zf_correct;
  Label zf_bad_zero;
  jcc(Assembler::zero, zf_correct);
  jmp(zf_bad_zero);
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with ZF not set.
  jcc(Assembler::notZero, zf_correct);
  stop("Fast Lock ZF != 0");
  bind(zf_bad_zero);
  stop("Fast Lock ZF != 1");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}

void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register reg_rax, Register t, Register thread) {
  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert(reg_rax == rax, "Used for CAS");
  assert_different_registers(obj, reg_rax, t);

  // Handle inflated monitor.
  Label inflated, inflated_check_lock_stack;
  // Finish fast unlock successfully. MUST jump with ZF == 1
  Label unlocked;

  // Assume success.
  decrement(Address(thread, JavaThread::held_monitor_count_offset()));

  const Register mark = t;
  const Register monitor = t;
  const Register top = UseObjectMonitorTable ? t : reg_rax;
  const Register box = reg_rax;

  Label dummy;
  C2FastUnlockLightweightStub* stub = nullptr;

  if (!Compile::current()->output()->in_scratch_emit_size()) {
    stub = new (Compile::current()->comp_arena()) C2FastUnlockLightweightStub(obj, mark, reg_rax, thread);
    Compile::current()->output()->add_stub(stub);
  }

  Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();
  Label& check_successor = stub == nullptr ? dummy : stub->check_successor();
  Label& slow_path = stub == nullptr ? dummy : stub->slow_path();

  { // Lightweight Unlock

    // Load top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    if (!UseObjectMonitorTable) {
      // Prefetch mark.
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }

    // Check if obj is top of lock-stack.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    // Top of lock stack was not obj. Must be monitor.
    jcc(Assembler::notEqual, inflated_check_lock_stack);

    // Pop lock-stack.
    DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
    subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);

    // Check if recursive.
    cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
    jcc(Assembler::equal, unlocked);

    // We elide the monitor check, let the CAS fail instead.

    if (UseObjectMonitorTable) {
      // Load mark.
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }

    // Try to unlock. Transition lock bits 0b00 => 0b01
    movptr(reg_rax, mark);
    andptr(reg_rax, ~(int32_t)markWord::lock_mask);
    orptr(mark, markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    jcc(Assembler::notEqual, push_and_slow_path);
    jmp(unlocked);
  }


  { // Handle inflated monitor.
    bind(inflated_check_lock_stack);
#ifdef ASSERT
    Label check_done;
    subl(top, oopSize);
    cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
    jcc(Assembler::below, check_done);
    cmpptr(obj, Address(thread, top));
    jccb(Assembler::notEqual, inflated_check_lock_stack);
    stop("Fast Unlock lock on stack");
    bind(check_done);
    if (UseObjectMonitorTable) {
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }
    testptr(mark, markWord::monitor_value);
    jccb(Assembler::notZero, inflated);
    stop("Fast Unlock not monitor");
#endif

    bind(inflated);

    if (!UseObjectMonitorTable) {
      assert(mark == monitor, "should be the same here");
    } else {
      // Uses ObjectMonitorTable. Look for the monitor in our BasicLock on the stack.
      movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
      // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*)
      cmpptr(monitor, alignof(ObjectMonitor*));
      jcc(Assembler::below, slow_path);
    }
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ?
                                             0 : checked_cast<int>(markWord::monitor_value));
    const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag};
    const Address cxq_address{monitor, ObjectMonitor::cxq_offset() - monitor_tag};
    const Address EntryList_address{monitor, ObjectMonitor::EntryList_offset() - monitor_tag};
    const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag};

    Label recursive;

    // Check if recursive.
    cmpptr(recursions_address, 0);
    jccb(Assembler::notEqual, recursive);

    // Check if the entry lists are empty.
    movptr(reg_rax, cxq_address);
    orptr(reg_rax, EntryList_address);
    jcc(Assembler::notZero, check_successor);

    // Release lock.
    movptr(owner_address, NULL_WORD);
    jmpb(unlocked);

    // Recursive unlock.
    bind(recursive);
    decrement(recursions_address);
    xorl(t, t);
  }

  bind(unlocked);
  if (stub != nullptr) {
    bind(stub->unlocked_continuation());
  }

#ifdef ASSERT
  // Check that unlocked label is reached with ZF set.
  Label zf_correct;
  jcc(Assembler::zero, zf_correct);
  stop("Fast Unlock ZF != 1");
#endif

  if (stub != nullptr) {
    bind(stub->slow_path_continuation());
  }
#ifdef ASSERT
  // Check that stub->continuation() label is reached with ZF not set.
  jccb(Assembler::notZero, zf_correct);
  stop("Fast Unlock ZF != 0");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}

//-------------------------------------------------------------------------------------------
// Generic instructions support for use in .ad files C2 code generation

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVD) {
    andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
  } else {
    assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
  }
}

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
  if (opcode == Op_AbsVD) {
    vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
  } else {
    assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
    vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVF) {
    andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
  } else {
    assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
  if (opcode == Op_AbsVF) {
    vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
  } else {
    assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
    vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
  }
}

void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
  assert(opcode
         == Op_MinV || opcode == Op_MaxV, "sanity");
  assert(tmp == xnoreg || elem_bt == T_LONG, "unused");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      pminsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pminsw(dst, src);
    } else if (elem_bt == T_INT) {
      pminsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, dst);
      pcmpgtq(xmm0, src);
      blendvpd(dst, src);  // xmm0 as mask
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      pmaxsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pmaxsw(dst, src);
    } else if (elem_bt == T_INT) {
      pmaxsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, src);
      pcmpgtq(xmm0, dst);
      blendvpd(dst, src);  // xmm0 as mask
    }
  }
}

void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
                                 XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                 int vlen_enc) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      vpminsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpminsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpminsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpminsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src1, src2, dst, vlen_enc);
      }
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      vpmaxsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpmaxsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpmaxsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpmaxsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src2, src1, dst, vlen_enc);
      }
    }
  }
}

// Float/Double min max

void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
                                   XMMRegister dst, XMMRegister a, XMMRegister b,
                                   XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                   int vlen_enc) {
  assert(UseAVX > 0, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(a, tmp, atmp, btmp);
  assert_different_registers(b, tmp, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);

  /* Note on 'non-obvious' assembly sequence:
   *
   * While there are vminps/vmaxps instructions, there are two important differences between hardware
   * and Java on how they handle floats:
   *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
   *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
   *
   * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
   *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
   *     (only useful when signs differ, noop otherwise)
   *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
   *
   * Following pseudo code describes the algorithm for max[FD] (Min algorithm is on similar lines):
   *   btmp = (b < +0.0) ? a : b
   *   atmp = (b < +0.0) ? b : a
   *   Tmp  = Max_Float(atmp, btmp)
   *   Res  = (atmp == NaN) ? atmp : Tmp
   */

  void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
  void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
  void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
  XMMRegister mask;

  if (!is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vminps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (!is_double_word && !is_min) {
    mask = b;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vmaxps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vminpd;
    vcmp = &MacroAssembler::vcmppd;
  } else {
    assert(is_double_word && !is_min, "sanity");
    mask = b;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vmaxpd;
    vcmp = &MacroAssembler::vcmppd;
  }

  // Make sure EnableX86ECoreOpts isn't disabled on register overlaps
  XMMRegister maxmin, scratch;
  if (dst == btmp) {
    maxmin = btmp;
    scratch = tmp;
  } else {
    maxmin = tmp;
    scratch = btmp;
  }

  bool precompute_mask = EnableX86ECoreOpts && UseAVX > 1;
  if (precompute_mask && !is_double_word) {
    vpsrad(tmp, mask, 32, vlen_enc);
    mask = tmp;
  } else if (precompute_mask && is_double_word) {
    vpxor(tmp, tmp, tmp, vlen_enc);
    vpcmpgtq(tmp, tmp, mask, vlen_enc);
    mask = tmp;
  }

  (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
  (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
  (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
  (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
  (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
}

void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
                                    XMMRegister dst, XMMRegister a, XMMRegister b,
                                    KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
                                    int vlen_enc) {
  assert(UseAVX > 2, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(dst, a, atmp, btmp);
  assert_different_registers(dst, b, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);
  bool merge = true;

  if (!is_double_word && is_min) {
    evpmovd2m(ktmp, a, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
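    // Note: vminps alone returns its second operand for NaN inputs and for -0.0/+0.0
    // ties, which differs from Java's Math.min. The sign-mask blends above bias the
    // operands so the zero-tie case comes out right, and the unordered compare plus
    // masked move below re-inject the NaN when 'atmp' was NaN (roughly:
    // res = isnan(atmp) ? atmp : vminps(atmp, btmp)).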
    vminps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (!is_double_word && !is_min) {
    evpmovd2m(ktmp, b, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (is_double_word && is_min) {
    evpmovq2m(ktmp, a, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vminpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    evpmovq2m(ktmp, b, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  }
}

// Float/Double signum
void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
  assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");

  Label DONE_LABEL;

  if (opcode == Op_SignumF) {
    assert(UseSSE > 0, "required");
    ucomiss(dst, zero);
    jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
    jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
    movflt(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
  } else if (opcode == Op_SignumD) {
    assert(UseSSE > 1, "required");
    ucomisd(dst, zero);
    jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
    jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
    movdbl(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
  }

  bind(DONE_LABEL);
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
  if (sign) {
    pmovsxbw(dst, src);
  } else {
    pmovzxbw(dst, src);
  }
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbw(dst, src, vector_len);
  } else {
    vpmovzxbw(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbd(dst, src, vector_len);
  } else {
    vpmovzxbd(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxwd(dst, src, vector_len);
  } else {
    vpmovzxwd(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
                                     int shift, int vector_len) {
  if (opcode == Op_RotateLeftV) {
    if (etype == T_INT) {
      evprold(dst, src, shift, vector_len);
    } else {
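      // 64-bit lanes: same idea as the T_INT case above, using the EVEX vprolq form.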
      assert(etype == T_LONG, "expected type T_LONG");
      evprolq(dst, src, shift, vector_len);
    }
  } else {
    assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
    if (etype == T_INT) {
      evprord(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprorq(dst, src, shift, vector_len);
    }
  }
}

void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
                                     XMMRegister shift, int vector_len) {
  if (opcode == Op_RotateLeftV) {
    if (etype == T_INT) {
      evprolvd(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprolvq(dst, src, shift, vector_len);
    }
  } else {
    assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
    if (etype == T_INT) {
      evprorvd(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprorvq(dst, src, shift, vector_len);
    }
  }
}

void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
  if (opcode == Op_RShiftVI) {
    psrad(dst, shift);
  } else if (opcode == Op_LShiftVI) {
    pslld(dst, shift);
  } else {
    assert((opcode == Op_URShiftVI), "opcode should be Op_URShiftVI");
    psrld(dst, shift);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVI:  psrad(dst, shift); break;
    case Op_LShiftVI:  pslld(dst, shift); break;
    case Op_URShiftVI: psrld(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
  if (opcode == Op_RShiftVI) {
    vpsrad(dst, nds, shift, vector_len);
  } else if (opcode == Op_LShiftVI) {
    vpslld(dst, nds, shift, vector_len);
  } else {
    assert((opcode == Op_URShiftVI), "opcode should be Op_URShiftVI");
    vpsrld(dst, nds, shift, vector_len);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
    case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
    case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  psraw(dst, shift); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  psllw(dst, shift); break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: psrlw(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;

    default: assert(false,
"%s", NodeClassNames[opcode]); 1326 } 1327 } 1328 1329 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) { 1330 switch (opcode) { 1331 case Op_RShiftVL: psrlq(dst, shift); break; // using srl to implement sra on pre-avs512 systems 1332 case Op_LShiftVL: psllq(dst, shift); break; 1333 case Op_URShiftVL: psrlq(dst, shift); break; 1334 1335 default: assert(false, "%s", NodeClassNames[opcode]); 1336 } 1337 } 1338 1339 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) { 1340 if (opcode == Op_RShiftVL) { 1341 psrlq(dst, shift); // using srl to implement sra on pre-avs512 systems 1342 } else if (opcode == Op_LShiftVL) { 1343 psllq(dst, shift); 1344 } else { 1345 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL"); 1346 psrlq(dst, shift); 1347 } 1348 } 1349 1350 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1351 switch (opcode) { 1352 case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break; 1353 case Op_LShiftVL: vpsllq(dst, src, shift, vlen_enc); break; 1354 case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break; 1355 1356 default: assert(false, "%s", NodeClassNames[opcode]); 1357 } 1358 } 1359 1360 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 1361 if (opcode == Op_RShiftVL) { 1362 evpsraq(dst, nds, shift, vector_len); 1363 } else if (opcode == Op_LShiftVL) { 1364 vpsllq(dst, nds, shift, vector_len); 1365 } else { 1366 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL"); 1367 vpsrlq(dst, nds, shift, vector_len); 1368 } 1369 } 1370 1371 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1372 switch (opcode) { 1373 case Op_RShiftVB: // fall-through 1374 case Op_RShiftVS: // fall-through 1375 case Op_RShiftVI: vpsravd(dst, src, shift, vlen_enc); break; 1376 1377 case Op_LShiftVB: // fall-through 1378 case Op_LShiftVS: // fall-through 1379 case Op_LShiftVI: vpsllvd(dst, src, shift, vlen_enc); break; 1380 1381 case Op_URShiftVB: // fall-through 1382 case Op_URShiftVS: // fall-through 1383 case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break; 1384 1385 default: assert(false, "%s", NodeClassNames[opcode]); 1386 } 1387 } 1388 1389 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1390 switch (opcode) { 1391 case Op_RShiftVB: // fall-through 1392 case Op_RShiftVS: evpsravw(dst, src, shift, vlen_enc); break; 1393 1394 case Op_LShiftVB: // fall-through 1395 case Op_LShiftVS: evpsllvw(dst, src, shift, vlen_enc); break; 1396 1397 case Op_URShiftVB: // fall-through 1398 case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break; 1399 1400 default: assert(false, "%s", NodeClassNames[opcode]); 1401 } 1402 } 1403 1404 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) { 1405 assert(UseAVX >= 2, "required"); 1406 switch (opcode) { 1407 case Op_RShiftVL: { 1408 if (UseAVX > 2) { 1409 assert(tmp == xnoreg, "not used"); 1410 if (!VM_Version::supports_avx512vl()) { 1411 vlen_enc = Assembler::AVX_512bit; 1412 } 1413 evpsravq(dst, src, shift, vlen_enc); 1414 } else { 1415 vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask())); 1416 vpsrlvq(dst, src, shift, vlen_enc); 1417 vpsrlvq(tmp, tmp, shift, vlen_enc); 1418 vpxor(dst, dst, tmp, vlen_enc); 1419 vpsubq(dst, dst, 
               tmp, vlen_enc);
      }
      break;
    }
    case Op_LShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsllvq(dst, src, shift, vlen_enc);
      break;
    }
    case Op_URShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsrlvq(dst, src, shift, vlen_enc);
      break;
    }
    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

// Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst
void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
  assert(opcode == Op_LShiftVB ||
         opcode == Op_RShiftVB ||
         opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
  bool sign = (opcode != Op_URShiftVB);
  assert(vector_len == 0, "required");
  vextendbd(sign, dst, src, 1);
  vpmovzxbd(vtmp, shift, 1);
  varshiftd(opcode, dst, dst, vtmp, 1);
  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
  vextracti128_high(vtmp, dst);
  vpackusdw(dst, dst, vtmp, 0);
}

// Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst
void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
  assert(opcode == Op_LShiftVB ||
         opcode == Op_RShiftVB ||
         opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
  bool sign = (opcode != Op_URShiftVB);
  int ext_vector_len = vector_len + 1;
  vextendbw(sign, dst, src, ext_vector_len);
  vpmovzxbw(vtmp, shift, ext_vector_len);
  varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
  if (vector_len == 0) {
    vextracti128_high(vtmp, dst);
    vpackuswb(dst, dst, vtmp, vector_len);
  } else {
    vextracti64x4_high(vtmp, dst);
    vpackuswb(dst, dst, vtmp, vector_len);
    vpermq(dst, dst, 0xD8, vector_len);
  }
}

void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
  switch(typ) {
    case T_BYTE:
      pinsrb(dst, val, idx);
      break;
    case T_SHORT:
      pinsrw(dst, val, idx);
      break;
    case T_INT:
      pinsrd(dst, val, idx);
      break;
    case T_LONG:
      pinsrq(dst, val, idx);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}

void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
  switch(typ) {
    case T_BYTE:
      vpinsrb(dst, src, val, idx);
      break;
    case T_SHORT:
      vpinsrw(dst, src, val, idx);
      break;
    case T_INT:
      vpinsrd(dst, src, val, idx);
      break;
    case T_LONG:
      vpinsrq(dst, src, val, idx);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}

#ifdef _LP64
void C2_MacroAssembler::vgather8b_masked_offset(BasicType elem_bt,
                                                XMMRegister dst, Register base,
                                                Register idx_base,
                                                Register offset, Register mask,
                                                Register mask_idx, Register rtmp,
                                                int vlen_enc) {
  vpxor(dst, dst, dst, vlen_enc);
  if (elem_bt == T_SHORT) {
    for (int i = 0; i < 4; i++) {
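      // Scalar emulation of a masked gather: bit 'mask_idx' of 'mask' decides whether
      // lane i is loaded; when the bit is clear the lane keeps the zero written by the
      // vpxor above.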
src[offset + idx_base[i]] : 0 1524 Label skip_load; 1525 btq(mask, mask_idx); 1526 jccb(Assembler::carryClear, skip_load); 1527 movl(rtmp, Address(idx_base, i * 4)); 1528 if (offset != noreg) { 1529 addl(rtmp, offset); 1530 } 1531 pinsrw(dst, Address(base, rtmp, Address::times_2), i); 1532 bind(skip_load); 1533 incq(mask_idx); 1534 } 1535 } else { 1536 assert(elem_bt == T_BYTE, ""); 1537 for (int i = 0; i < 8; i++) { 1538 // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0 1539 Label skip_load; 1540 btq(mask, mask_idx); 1541 jccb(Assembler::carryClear, skip_load); 1542 movl(rtmp, Address(idx_base, i * 4)); 1543 if (offset != noreg) { 1544 addl(rtmp, offset); 1545 } 1546 pinsrb(dst, Address(base, rtmp), i); 1547 bind(skip_load); 1548 incq(mask_idx); 1549 } 1550 } 1551 } 1552 #endif // _LP64 1553 1554 void C2_MacroAssembler::vgather8b_offset(BasicType elem_bt, XMMRegister dst, 1555 Register base, Register idx_base, 1556 Register offset, Register rtmp, 1557 int vlen_enc) { 1558 vpxor(dst, dst, dst, vlen_enc); 1559 if (elem_bt == T_SHORT) { 1560 for (int i = 0; i < 4; i++) { 1561 // dst[i] = src[offset + idx_base[i]] 1562 movl(rtmp, Address(idx_base, i * 4)); 1563 if (offset != noreg) { 1564 addl(rtmp, offset); 1565 } 1566 pinsrw(dst, Address(base, rtmp, Address::times_2), i); 1567 } 1568 } else { 1569 assert(elem_bt == T_BYTE, ""); 1570 for (int i = 0; i < 8; i++) { 1571 // dst[i] = src[offset + idx_base[i]] 1572 movl(rtmp, Address(idx_base, i * 4)); 1573 if (offset != noreg) { 1574 addl(rtmp, offset); 1575 } 1576 pinsrb(dst, Address(base, rtmp), i); 1577 } 1578 } 1579 } 1580 1581 /* 1582 * Gather using hybrid algorithm, first partially unroll scalar loop 1583 * to accumulate values from gather indices into a quad-word(64bit) slice. 1584 * A slice may hold 8 bytes or 4 short values. This is followed by a vector 1585 * permutation to place the slice into appropriate vector lane 1586 * locations in destination vector. Following pseudo code describes the 1587 * algorithm in detail: 1588 * 1589 * DST_VEC = ZERO_VEC 1590 * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..} 1591 * TWO_VEC = {2, 2, 2, 2, 2, 2, 2, 2, 2..} 1592 * FOREACH_ITER: 1593 * TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES 1594 * TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX 1595 * DST_VEC = DST_VEC OR TEMP_PERM_VEC 1596 * PERM_INDEX = PERM_INDEX - TWO_VEC 1597 * 1598 * With each iteration, doubleword permute indices (0,1) corresponding 1599 * to gathered quadword gets right shifted by two lane positions. 
1600 * 1601 */ 1602 void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst, 1603 Register base, Register idx_base, 1604 Register offset, Register mask, 1605 XMMRegister xtmp1, XMMRegister xtmp2, 1606 XMMRegister temp_dst, Register rtmp, 1607 Register mask_idx, Register length, 1608 int vector_len, int vlen_enc) { 1609 Label GATHER8_LOOP; 1610 assert(is_subword_type(elem_ty), ""); 1611 movl(length, vector_len); 1612 vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...} 1613 vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...} 1614 vallones(xtmp2, vlen_enc); 1615 vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc); 1616 vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...} 1617 load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...} 1618 1619 bind(GATHER8_LOOP); 1620 // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES 1621 if (mask == noreg) { 1622 vgather8b_offset(elem_ty, temp_dst, base, idx_base, offset, rtmp, vlen_enc); 1623 } else { 1624 LP64_ONLY(vgather8b_masked_offset(elem_ty, temp_dst, base, idx_base, offset, mask, mask_idx, rtmp, vlen_enc)); 1625 } 1626 // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1) 1627 vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? vlen_enc : Assembler::AVX_256bit); 1628 // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2) 1629 vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc); 1630 // DST_VEC = DST_VEC OR TEMP_PERM_VEC 1631 vpor(dst, dst, temp_dst, vlen_enc); 1632 addptr(idx_base, 32 >> (type2aelembytes(elem_ty) - 1)); 1633 subl(length, 8 >> (type2aelembytes(elem_ty) - 1)); 1634 jcc(Assembler::notEqual, GATHER8_LOOP); 1635 } 1636 1637 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) { 1638 switch(typ) { 1639 case T_INT: 1640 vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len); 1641 break; 1642 case T_FLOAT: 1643 vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len); 1644 break; 1645 case T_LONG: 1646 vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len); 1647 break; 1648 case T_DOUBLE: 1649 vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len); 1650 break; 1651 default: 1652 assert(false,"Should not reach here."); 1653 break; 1654 } 1655 } 1656 1657 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) { 1658 switch(typ) { 1659 case T_INT: 1660 evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len); 1661 break; 1662 case T_FLOAT: 1663 evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len); 1664 break; 1665 case T_LONG: 1666 evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len); 1667 break; 1668 case T_DOUBLE: 1669 evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len); 1670 break; 1671 default: 1672 assert(false,"Should not reach here."); 1673 break; 1674 } 1675 } 1676 1677 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) { 1678 switch(typ) { 1679 case T_INT: 1680 evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len); 1681 break; 1682 case T_FLOAT: 1683 evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len); 1684 break; 1685 case T_LONG: 1686 evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len); 1687 break; 1688 case 
T_DOUBLE: 1689 evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len); 1690 break; 1691 default: 1692 assert(false,"Should not reach here."); 1693 break; 1694 } 1695 } 1696 1697 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) { 1698 if (vlen_in_bytes <= 16) { 1699 pxor (dst, dst); 1700 psubb(dst, src); 1701 switch (elem_bt) { 1702 case T_BYTE: /* nothing to do */ break; 1703 case T_SHORT: pmovsxbw(dst, dst); break; 1704 case T_INT: pmovsxbd(dst, dst); break; 1705 case T_FLOAT: pmovsxbd(dst, dst); break; 1706 case T_LONG: pmovsxbq(dst, dst); break; 1707 case T_DOUBLE: pmovsxbq(dst, dst); break; 1708 1709 default: assert(false, "%s", type2name(elem_bt)); 1710 } 1711 } else { 1712 assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, ""); 1713 int vlen_enc = vector_length_encoding(vlen_in_bytes); 1714 1715 vpxor (dst, dst, dst, vlen_enc); 1716 vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc); 1717 1718 switch (elem_bt) { 1719 case T_BYTE: /* nothing to do */ break; 1720 case T_SHORT: vpmovsxbw(dst, dst, vlen_enc); break; 1721 case T_INT: vpmovsxbd(dst, dst, vlen_enc); break; 1722 case T_FLOAT: vpmovsxbd(dst, dst, vlen_enc); break; 1723 case T_LONG: vpmovsxbq(dst, dst, vlen_enc); break; 1724 case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break; 1725 1726 default: assert(false, "%s", type2name(elem_bt)); 1727 } 1728 } 1729 } 1730 1731 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) { 1732 if (novlbwdq) { 1733 vpmovsxbd(xtmp, src, vlen_enc); 1734 evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()), 1735 Assembler::eq, true, vlen_enc, noreg); 1736 } else { 1737 vpxor(xtmp, xtmp, xtmp, vlen_enc); 1738 vpsubb(xtmp, xtmp, src, vlen_enc); 1739 evpmovb2m(dst, xtmp, vlen_enc); 1740 } 1741 } 1742 1743 void C2_MacroAssembler::load_vector(XMMRegister dst, Address src, int vlen_in_bytes) { 1744 switch (vlen_in_bytes) { 1745 case 4: movdl(dst, src); break; 1746 case 8: movq(dst, src); break; 1747 case 16: movdqu(dst, src); break; 1748 case 32: vmovdqu(dst, src); break; 1749 case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break; 1750 default: ShouldNotReachHere(); 1751 } 1752 } 1753 1754 void C2_MacroAssembler::load_vector(XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) { 1755 assert(rscratch != noreg || always_reachable(src), "missing"); 1756 1757 if (reachable(src)) { 1758 load_vector(dst, as_Address(src), vlen_in_bytes); 1759 } else { 1760 lea(rscratch, src); 1761 load_vector(dst, Address(rscratch, 0), vlen_in_bytes); 1762 } 1763 } 1764 1765 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) { 1766 int vlen_enc = vector_length_encoding(vlen); 1767 if (VM_Version::supports_avx()) { 1768 if (bt == T_LONG) { 1769 if (VM_Version::supports_avx2()) { 1770 vpbroadcastq(dst, src, vlen_enc); 1771 } else { 1772 vmovddup(dst, src, vlen_enc); 1773 } 1774 } else if (bt == T_DOUBLE) { 1775 if (vlen_enc != Assembler::AVX_128bit) { 1776 vbroadcastsd(dst, src, vlen_enc, noreg); 1777 } else { 1778 vmovddup(dst, src, vlen_enc); 1779 } 1780 } else { 1781 if (VM_Version::supports_avx2() && is_integral_type(bt)) { 1782 vpbroadcastd(dst, src, vlen_enc); 1783 } else { 1784 vbroadcastss(dst, src, vlen_enc); 1785 } 1786 } 1787 } else if (VM_Version::supports_sse3()) { 1788 movddup(dst, src); 1789 } else { 1790 movq(dst, 
src); 1791 if (vlen == 16) { 1792 punpcklqdq(dst, dst); 1793 } 1794 } 1795 } 1796 1797 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) { 1798 // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64. 1799 int offset = exact_log2(type2aelembytes(bt)) << 6; 1800 if (is_floating_point_type(bt)) { 1801 offset += 128; 1802 } 1803 ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset); 1804 load_vector(dst, addr, vlen_in_bytes); 1805 } 1806 1807 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles. 1808 1809 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 1810 int vector_len = Assembler::AVX_128bit; 1811 1812 switch (opcode) { 1813 case Op_AndReductionV: pand(dst, src); break; 1814 case Op_OrReductionV: por (dst, src); break; 1815 case Op_XorReductionV: pxor(dst, src); break; 1816 case Op_MinReductionV: 1817 switch (typ) { 1818 case T_BYTE: pminsb(dst, src); break; 1819 case T_SHORT: pminsw(dst, src); break; 1820 case T_INT: pminsd(dst, src); break; 1821 case T_LONG: assert(UseAVX > 2, "required"); 1822 vpminsq(dst, dst, src, Assembler::AVX_128bit); break; 1823 default: assert(false, "wrong type"); 1824 } 1825 break; 1826 case Op_MaxReductionV: 1827 switch (typ) { 1828 case T_BYTE: pmaxsb(dst, src); break; 1829 case T_SHORT: pmaxsw(dst, src); break; 1830 case T_INT: pmaxsd(dst, src); break; 1831 case T_LONG: assert(UseAVX > 2, "required"); 1832 vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break; 1833 default: assert(false, "wrong type"); 1834 } 1835 break; 1836 case Op_AddReductionVF: addss(dst, src); break; 1837 case Op_AddReductionVD: addsd(dst, src); break; 1838 case Op_AddReductionVI: 1839 switch (typ) { 1840 case T_BYTE: paddb(dst, src); break; 1841 case T_SHORT: paddw(dst, src); break; 1842 case T_INT: paddd(dst, src); break; 1843 default: assert(false, "wrong type"); 1844 } 1845 break; 1846 case Op_AddReductionVL: paddq(dst, src); break; 1847 case Op_MulReductionVF: mulss(dst, src); break; 1848 case Op_MulReductionVD: mulsd(dst, src); break; 1849 case Op_MulReductionVI: 1850 switch (typ) { 1851 case T_SHORT: pmullw(dst, src); break; 1852 case T_INT: pmulld(dst, src); break; 1853 default: assert(false, "wrong type"); 1854 } 1855 break; 1856 case Op_MulReductionVL: assert(UseAVX > 2, "required"); 1857 evpmullq(dst, dst, src, vector_len); break; 1858 default: assert(false, "wrong opcode"); 1859 } 1860 } 1861 1862 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 1863 int vector_len = Assembler::AVX_256bit; 1864 1865 switch (opcode) { 1866 case Op_AndReductionV: vpand(dst, src1, src2, vector_len); break; 1867 case Op_OrReductionV: vpor (dst, src1, src2, vector_len); break; 1868 case Op_XorReductionV: vpxor(dst, src1, src2, vector_len); break; 1869 case Op_MinReductionV: 1870 switch (typ) { 1871 case T_BYTE: vpminsb(dst, src1, src2, vector_len); break; 1872 case T_SHORT: vpminsw(dst, src1, src2, vector_len); break; 1873 case T_INT: vpminsd(dst, src1, src2, vector_len); break; 1874 case T_LONG: assert(UseAVX > 2, "required"); 1875 vpminsq(dst, src1, src2, vector_len); break; 1876 default: assert(false, "wrong type"); 1877 } 1878 break; 1879 case Op_MaxReductionV: 1880 switch (typ) { 1881 case T_BYTE: vpmaxsb(dst, src1, src2, vector_len); break; 1882 case T_SHORT: vpmaxsw(dst, src1, src2, vector_len); break; 1883 case T_INT: vpmaxsd(dst, 
src1, src2, vector_len); break; 1884 case T_LONG: assert(UseAVX > 2, "required"); 1885 vpmaxsq(dst, src1, src2, vector_len); break; 1886 default: assert(false, "wrong type"); 1887 } 1888 break; 1889 case Op_AddReductionVI: 1890 switch (typ) { 1891 case T_BYTE: vpaddb(dst, src1, src2, vector_len); break; 1892 case T_SHORT: vpaddw(dst, src1, src2, vector_len); break; 1893 case T_INT: vpaddd(dst, src1, src2, vector_len); break; 1894 default: assert(false, "wrong type"); 1895 } 1896 break; 1897 case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break; 1898 case Op_MulReductionVI: 1899 switch (typ) { 1900 case T_SHORT: vpmullw(dst, src1, src2, vector_len); break; 1901 case T_INT: vpmulld(dst, src1, src2, vector_len); break; 1902 default: assert(false, "wrong type"); 1903 } 1904 break; 1905 case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break; 1906 default: assert(false, "wrong opcode"); 1907 } 1908 } 1909 1910 void C2_MacroAssembler::reduce_fp(int opcode, int vlen, 1911 XMMRegister dst, XMMRegister src, 1912 XMMRegister vtmp1, XMMRegister vtmp2) { 1913 switch (opcode) { 1914 case Op_AddReductionVF: 1915 case Op_MulReductionVF: 1916 reduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 1917 break; 1918 1919 case Op_AddReductionVD: 1920 case Op_MulReductionVD: 1921 reduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 1922 break; 1923 1924 default: assert(false, "wrong opcode"); 1925 } 1926 } 1927 1928 void C2_MacroAssembler::reduceB(int opcode, int vlen, 1929 Register dst, Register src1, XMMRegister src2, 1930 XMMRegister vtmp1, XMMRegister vtmp2) { 1931 switch (vlen) { 1932 case 8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1933 case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1934 case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1935 case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1936 1937 default: assert(false, "wrong vector length"); 1938 } 1939 } 1940 1941 void C2_MacroAssembler::mulreduceB(int opcode, int vlen, 1942 Register dst, Register src1, XMMRegister src2, 1943 XMMRegister vtmp1, XMMRegister vtmp2) { 1944 switch (vlen) { 1945 case 8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1946 case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1947 case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1948 case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1949 1950 default: assert(false, "wrong vector length"); 1951 } 1952 } 1953 1954 void C2_MacroAssembler::reduceS(int opcode, int vlen, 1955 Register dst, Register src1, XMMRegister src2, 1956 XMMRegister vtmp1, XMMRegister vtmp2) { 1957 switch (vlen) { 1958 case 4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1959 case 8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1960 case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1961 case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1962 1963 default: assert(false, "wrong vector length"); 1964 } 1965 } 1966 1967 void C2_MacroAssembler::reduceI(int opcode, int vlen, 1968 Register dst, Register src1, XMMRegister src2, 1969 XMMRegister vtmp1, XMMRegister vtmp2) { 1970 switch (vlen) { 1971 case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1972 case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1973 case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1974 case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1975 1976 default: assert(false, "wrong vector 
length"); 1977 } 1978 } 1979 1980 #ifdef _LP64 1981 void C2_MacroAssembler::reduceL(int opcode, int vlen, 1982 Register dst, Register src1, XMMRegister src2, 1983 XMMRegister vtmp1, XMMRegister vtmp2) { 1984 switch (vlen) { 1985 case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1986 case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1987 case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1988 1989 default: assert(false, "wrong vector length"); 1990 } 1991 } 1992 #endif // _LP64 1993 1994 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 1995 switch (vlen) { 1996 case 2: 1997 assert(vtmp2 == xnoreg, ""); 1998 reduce2F(opcode, dst, src, vtmp1); 1999 break; 2000 case 4: 2001 assert(vtmp2 == xnoreg, ""); 2002 reduce4F(opcode, dst, src, vtmp1); 2003 break; 2004 case 8: 2005 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2006 break; 2007 case 16: 2008 reduce16F(opcode, dst, src, vtmp1, vtmp2); 2009 break; 2010 default: assert(false, "wrong vector length"); 2011 } 2012 } 2013 2014 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2015 switch (vlen) { 2016 case 2: 2017 assert(vtmp2 == xnoreg, ""); 2018 reduce2D(opcode, dst, src, vtmp1); 2019 break; 2020 case 4: 2021 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2022 break; 2023 case 8: 2024 reduce8D(opcode, dst, src, vtmp1, vtmp2); 2025 break; 2026 default: assert(false, "wrong vector length"); 2027 } 2028 } 2029 2030 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2031 if (opcode == Op_AddReductionVI) { 2032 if (vtmp1 != src2) { 2033 movdqu(vtmp1, src2); 2034 } 2035 phaddd(vtmp1, vtmp1); 2036 } else { 2037 pshufd(vtmp1, src2, 0x1); 2038 reduce_operation_128(T_INT, opcode, vtmp1, src2); 2039 } 2040 movdl(vtmp2, src1); 2041 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2042 movdl(dst, vtmp1); 2043 } 2044 2045 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2046 if (opcode == Op_AddReductionVI) { 2047 if (vtmp1 != src2) { 2048 movdqu(vtmp1, src2); 2049 } 2050 phaddd(vtmp1, src2); 2051 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2052 } else { 2053 pshufd(vtmp2, src2, 0xE); 2054 reduce_operation_128(T_INT, opcode, vtmp2, src2); 2055 reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2056 } 2057 } 2058 2059 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2060 if (opcode == Op_AddReductionVI) { 2061 vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit); 2062 vextracti128_high(vtmp2, vtmp1); 2063 vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit); 2064 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2065 } else { 2066 vextracti128_high(vtmp1, src2); 2067 reduce_operation_128(T_INT, opcode, vtmp1, src2); 2068 reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2069 } 2070 } 2071 2072 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2073 vextracti64x4_high(vtmp2, src2); 2074 reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2); 2075 reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2076 } 2077 2078 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2079 
pshufd(vtmp2, src2, 0x1); 2080 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2081 movdqu(vtmp1, vtmp2); 2082 psrldq(vtmp1, 2); 2083 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2084 movdqu(vtmp2, vtmp1); 2085 psrldq(vtmp2, 1); 2086 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2087 movdl(vtmp2, src1); 2088 pmovsxbd(vtmp1, vtmp1); 2089 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2090 pextrb(dst, vtmp1, 0x0); 2091 movsbl(dst, dst); 2092 } 2093 2094 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2095 pshufd(vtmp1, src2, 0xE); 2096 reduce_operation_128(T_BYTE, opcode, vtmp1, src2); 2097 reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2098 } 2099 2100 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2101 vextracti128_high(vtmp2, src2); 2102 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2103 reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2104 } 2105 2106 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2107 vextracti64x4_high(vtmp1, src2); 2108 reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2); 2109 reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2110 } 2111 2112 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2113 pmovsxbw(vtmp2, src2); 2114 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2115 } 2116 2117 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2118 if (UseAVX > 1) { 2119 int vector_len = Assembler::AVX_256bit; 2120 vpmovsxbw(vtmp1, src2, vector_len); 2121 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2122 } else { 2123 pmovsxbw(vtmp2, src2); 2124 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2125 pshufd(vtmp2, src2, 0x1); 2126 pmovsxbw(vtmp2, src2); 2127 reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2128 } 2129 } 2130 2131 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2132 if (UseAVX > 2 && VM_Version::supports_avx512bw()) { 2133 int vector_len = Assembler::AVX_512bit; 2134 vpmovsxbw(vtmp1, src2, vector_len); 2135 reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2136 } else { 2137 assert(UseAVX >= 2,"Should not reach here."); 2138 mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); 2139 vextracti128_high(vtmp2, src2); 2140 mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2141 } 2142 } 2143 2144 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2145 mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); 2146 vextracti64x4_high(vtmp2, src2); 2147 mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2148 } 2149 2150 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2151 if (opcode == Op_AddReductionVI) { 2152 if (vtmp1 != src2) { 2153 movdqu(vtmp1, src2); 2154 } 2155 phaddw(vtmp1, vtmp1); 2156 phaddw(vtmp1, vtmp1); 2157 } else { 2158 pshufd(vtmp2, src2, 0x1); 2159 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2160 movdqu(vtmp1, vtmp2); 2161 psrldq(vtmp1, 2); 2162 reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2); 2163 } 2164 movdl(vtmp2, 
src1); 2165 pmovsxwd(vtmp1, vtmp1); 2166 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2167 pextrw(dst, vtmp1, 0x0); 2168 movswl(dst, dst); 2169 } 2170 2171 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2172 if (opcode == Op_AddReductionVI) { 2173 if (vtmp1 != src2) { 2174 movdqu(vtmp1, src2); 2175 } 2176 phaddw(vtmp1, src2); 2177 } else { 2178 pshufd(vtmp1, src2, 0xE); 2179 reduce_operation_128(T_SHORT, opcode, vtmp1, src2); 2180 } 2181 reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2182 } 2183 2184 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2185 if (opcode == Op_AddReductionVI) { 2186 int vector_len = Assembler::AVX_256bit; 2187 vphaddw(vtmp2, src2, src2, vector_len); 2188 vpermq(vtmp2, vtmp2, 0xD8, vector_len); 2189 } else { 2190 vextracti128_high(vtmp2, src2); 2191 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2192 } 2193 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2194 } 2195 2196 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2197 int vector_len = Assembler::AVX_256bit; 2198 vextracti64x4_high(vtmp1, src2); 2199 reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2); 2200 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2201 } 2202 2203 #ifdef _LP64 2204 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2205 pshufd(vtmp2, src2, 0xE); 2206 reduce_operation_128(T_LONG, opcode, vtmp2, src2); 2207 movdq(vtmp1, src1); 2208 reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2); 2209 movdq(dst, vtmp1); 2210 } 2211 2212 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2213 vextracti128_high(vtmp1, src2); 2214 reduce_operation_128(T_LONG, opcode, vtmp1, src2); 2215 reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2216 } 2217 2218 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2219 vextracti64x4_high(vtmp2, src2); 2220 reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2); 2221 reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2222 } 2223 2224 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) { 2225 mov64(temp, -1L); 2226 bzhiq(temp, temp, len); 2227 kmovql(dst, temp); 2228 } 2229 #endif // _LP64 2230 2231 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2232 reduce_operation_128(T_FLOAT, opcode, dst, src); 2233 pshufd(vtmp, src, 0x1); 2234 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2235 } 2236 2237 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2238 reduce2F(opcode, dst, src, vtmp); 2239 pshufd(vtmp, src, 0x2); 2240 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2241 pshufd(vtmp, src, 0x3); 2242 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2243 } 2244 2245 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2246 reduce4F(opcode, dst, src, vtmp2); 2247 vextractf128_high(vtmp2, src); 2248 reduce4F(opcode, dst, vtmp2, vtmp1); 2249 } 2250 2251 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 
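  // Outline: the 512-bit source is reduced in two halves. The low 256 bits
  // (together with the scalar accumulator already in dst) go through reduce8F
  // first; the high 256 bits are then extracted and folded into dst by a second
  // 8-lane reduction.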
2252 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2253 vextracti64x4_high(vtmp1, src); 2254 reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2); 2255 } 2256 2257 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2258 reduce_operation_128(T_DOUBLE, opcode, dst, src); 2259 pshufd(vtmp, src, 0xE); 2260 reduce_operation_128(T_DOUBLE, opcode, dst, vtmp); 2261 } 2262 2263 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2264 reduce2D(opcode, dst, src, vtmp2); 2265 vextractf128_high(vtmp2, src); 2266 reduce2D(opcode, dst, vtmp2, vtmp1); 2267 } 2268 2269 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2270 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2271 vextracti64x4_high(vtmp1, src); 2272 reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2); 2273 } 2274 2275 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) { 2276 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2277 } 2278 2279 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) { 2280 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2281 } 2282 2283 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask, 2284 int vec_enc) { 2285 switch(elem_bt) { 2286 case T_INT: 2287 case T_FLOAT: 2288 vmaskmovps(dst, src, mask, vec_enc); 2289 break; 2290 case T_LONG: 2291 case T_DOUBLE: 2292 vmaskmovpd(dst, src, mask, vec_enc); 2293 break; 2294 default: 2295 fatal("Unsupported type %s", type2name(elem_bt)); 2296 break; 2297 } 2298 } 2299 2300 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask, 2301 int vec_enc) { 2302 switch(elem_bt) { 2303 case T_INT: 2304 case T_FLOAT: 2305 vmaskmovps(dst, src, mask, vec_enc); 2306 break; 2307 case T_LONG: 2308 case T_DOUBLE: 2309 vmaskmovpd(dst, src, mask, vec_enc); 2310 break; 2311 default: 2312 fatal("Unsupported type %s", type2name(elem_bt)); 2313 break; 2314 } 2315 } 2316 2317 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid, 2318 XMMRegister dst, XMMRegister src, 2319 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2320 XMMRegister xmm_0, XMMRegister xmm_1) { 2321 const int permconst[] = {1, 14}; 2322 XMMRegister wsrc = src; 2323 XMMRegister wdst = xmm_0; 2324 XMMRegister wtmp = (xmm_1 == xnoreg) ? 
xmm_0: xmm_1; 2325 2326 int vlen_enc = Assembler::AVX_128bit; 2327 if (vlen == 16) { 2328 vlen_enc = Assembler::AVX_256bit; 2329 } 2330 2331 for (int i = log2(vlen) - 1; i >=0; i--) { 2332 if (i == 0 && !is_dst_valid) { 2333 wdst = dst; 2334 } 2335 if (i == 3) { 2336 vextracti64x4_high(wtmp, wsrc); 2337 } else if (i == 2) { 2338 vextracti128_high(wtmp, wsrc); 2339 } else { // i = [0,1] 2340 vpermilps(wtmp, wsrc, permconst[i], vlen_enc); 2341 } 2342 vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2343 wsrc = wdst; 2344 vlen_enc = Assembler::AVX_128bit; 2345 } 2346 if (is_dst_valid) { 2347 vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2348 } 2349 } 2350 2351 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src, 2352 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2353 XMMRegister xmm_0, XMMRegister xmm_1) { 2354 XMMRegister wsrc = src; 2355 XMMRegister wdst = xmm_0; 2356 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1; 2357 int vlen_enc = Assembler::AVX_128bit; 2358 if (vlen == 8) { 2359 vlen_enc = Assembler::AVX_256bit; 2360 } 2361 for (int i = log2(vlen) - 1; i >=0; i--) { 2362 if (i == 0 && !is_dst_valid) { 2363 wdst = dst; 2364 } 2365 if (i == 1) { 2366 vextracti128_high(wtmp, wsrc); 2367 } else if (i == 2) { 2368 vextracti64x4_high(wtmp, wsrc); 2369 } else { 2370 assert(i == 0, "%d", i); 2371 vpermilpd(wtmp, wsrc, 1, vlen_enc); 2372 } 2373 vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2374 wsrc = wdst; 2375 vlen_enc = Assembler::AVX_128bit; 2376 } 2377 if (is_dst_valid) { 2378 vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2379 } 2380 } 2381 2382 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) { 2383 switch (bt) { 2384 case T_BYTE: pextrb(dst, src, idx); break; 2385 case T_SHORT: pextrw(dst, src, idx); break; 2386 case T_INT: pextrd(dst, src, idx); break; 2387 case T_LONG: pextrq(dst, src, idx); break; 2388 2389 default: 2390 assert(false,"Should not reach here."); 2391 break; 2392 } 2393 } 2394 2395 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) { 2396 int esize = type2aelembytes(typ); 2397 int elem_per_lane = 16/esize; 2398 int lane = elemindex / elem_per_lane; 2399 int eindex = elemindex % elem_per_lane; 2400 2401 if (lane >= 2) { 2402 assert(UseAVX > 2, "required"); 2403 vextractf32x4(dst, src, lane & 3); 2404 return dst; 2405 } else if (lane > 0) { 2406 assert(UseAVX > 0, "required"); 2407 vextractf128(dst, src, lane); 2408 return dst; 2409 } else { 2410 return src; 2411 } 2412 } 2413 2414 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) { 2415 if (typ == T_BYTE) { 2416 movsbl(dst, dst); 2417 } else if (typ == T_SHORT) { 2418 movswl(dst, dst); 2419 } 2420 } 2421 2422 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) { 2423 int esize = type2aelembytes(typ); 2424 int elem_per_lane = 16/esize; 2425 int eindex = elemindex % elem_per_lane; 2426 assert(is_integral_type(typ),"required"); 2427 2428 if (eindex == 0) { 2429 if (typ == T_LONG) { 2430 movq(dst, src); 2431 } else { 2432 movdl(dst, src); 2433 movsxl(typ, dst); 2434 } 2435 } else { 2436 extract(typ, dst, src, eindex); 2437 movsxl(typ, dst); 2438 } 2439 } 2440 2441 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) { 
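  // Note: the caller is expected to have already selected the relevant 128-bit
  // lane (e.g. via get_lane() above), so 'eindex' below is the element position
  // within that lane. For T_FLOAT the element is shuffled into position 0 and
  // the upper 96 bits are masked off; for T_DOUBLE the lane is byte-shifted so
  // the element lands in position 0 and movq clears the upper half.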
2442 int esize = type2aelembytes(typ); 2443 int elem_per_lane = 16/esize; 2444 int eindex = elemindex % elem_per_lane; 2445 assert((typ == T_FLOAT || typ == T_DOUBLE),"required"); 2446 2447 if (eindex == 0) { 2448 movq(dst, src); 2449 } else { 2450 if (typ == T_FLOAT) { 2451 if (UseAVX == 0) { 2452 movdqu(dst, src); 2453 shufps(dst, dst, eindex); 2454 } else { 2455 vshufps(dst, src, src, eindex, Assembler::AVX_128bit); 2456 } 2457 } else { 2458 if (UseAVX == 0) { 2459 movdqu(dst, src); 2460 psrldq(dst, eindex*esize); 2461 } else { 2462 vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit); 2463 } 2464 movq(dst, dst); 2465 } 2466 } 2467 // Zero upper bits 2468 if (typ == T_FLOAT) { 2469 if (UseAVX == 0) { 2470 assert(vtmp != xnoreg, "required."); 2471 movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg); 2472 pand(dst, vtmp); 2473 } else { 2474 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg); 2475 } 2476 } 2477 } 2478 2479 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) { 2480 switch(typ) { 2481 case T_BYTE: 2482 case T_BOOLEAN: 2483 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2484 break; 2485 case T_SHORT: 2486 case T_CHAR: 2487 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2488 break; 2489 case T_INT: 2490 case T_FLOAT: 2491 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2492 break; 2493 case T_LONG: 2494 case T_DOUBLE: 2495 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2496 break; 2497 default: 2498 assert(false,"Should not reach here."); 2499 break; 2500 } 2501 } 2502 2503 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) { 2504 assert(rscratch != noreg || always_reachable(src2), "missing"); 2505 2506 switch(typ) { 2507 case T_BOOLEAN: 2508 case T_BYTE: 2509 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2510 break; 2511 case T_CHAR: 2512 case T_SHORT: 2513 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2514 break; 2515 case T_INT: 2516 case T_FLOAT: 2517 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2518 break; 2519 case T_LONG: 2520 case T_DOUBLE: 2521 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2522 break; 2523 default: 2524 assert(false,"Should not reach here."); 2525 break; 2526 } 2527 } 2528 2529 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) { 2530 switch(typ) { 2531 case T_BYTE: 2532 evpblendmb(dst, kmask, src1, src2, merge, vector_len); 2533 break; 2534 case T_SHORT: 2535 evpblendmw(dst, kmask, src1, src2, merge, vector_len); 2536 break; 2537 case T_INT: 2538 case T_FLOAT: 2539 evpblendmd(dst, kmask, src1, src2, merge, vector_len); 2540 break; 2541 case T_LONG: 2542 case T_DOUBLE: 2543 evpblendmq(dst, kmask, src1, src2, merge, vector_len); 2544 break; 2545 default: 2546 assert(false,"Should not reach here."); 2547 break; 2548 } 2549 } 2550 2551 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) { 2552 
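  // Rough idea: produce PTEST-style flags for a vector of vlen_in_bytes bytes.
  // 32-byte inputs are tested directly with a 256-bit vptest/vtestps. Inputs
  // shorter than 16 bytes first replicate their significant dwords with pshufd
  // so the unused upper part of the 128-bit register cannot affect the result;
  // 16-byte inputs are tested as-is. vtestps is used when the element size is
  // at least 4 bytes and AVX is available, otherwise ptest.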
assert(vlen_in_bytes <= 32, ""); 2553 int esize = type2aelembytes(bt); 2554 if (vlen_in_bytes == 32) { 2555 assert(vtmp == xnoreg, "required."); 2556 if (esize >= 4) { 2557 vtestps(src1, src2, AVX_256bit); 2558 } else { 2559 vptest(src1, src2, AVX_256bit); 2560 } 2561 return; 2562 } 2563 if (vlen_in_bytes < 16) { 2564 // Duplicate the lower part to fill the whole register, 2565 // Don't need to do so for src2 2566 assert(vtmp != xnoreg, "required"); 2567 int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04; 2568 pshufd(vtmp, src1, shuffle_imm); 2569 } else { 2570 assert(vtmp == xnoreg, "required"); 2571 vtmp = src1; 2572 } 2573 if (esize >= 4 && VM_Version::supports_avx()) { 2574 vtestps(vtmp, src2, AVX_128bit); 2575 } else { 2576 ptest(vtmp, src2); 2577 } 2578 } 2579 2580 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { 2581 assert(UseAVX >= 2, "required"); 2582 #ifdef ASSERT 2583 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2584 bool is_bw_supported = VM_Version::supports_avx512bw(); 2585 if (is_bw && !is_bw_supported) { 2586 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2587 assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16), 2588 "XMM register should be 0-15"); 2589 } 2590 #endif // ASSERT 2591 switch (elem_bt) { 2592 case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return; 2593 case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return; 2594 case T_INT: vpaddd(dst, src1, src2, vlen_enc); return; 2595 case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return; 2596 case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return; 2597 case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return; 2598 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2599 } 2600 } 2601 2602 #ifdef _LP64 2603 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) { 2604 assert(UseAVX >= 2, "required"); 2605 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2606 bool is_vl = vlen_enc != Assembler::AVX_512bit; 2607 if ((UseAVX > 2) && 2608 (!is_bw || VM_Version::supports_avx512bw()) && 2609 (!is_vl || VM_Version::supports_avx512vl())) { 2610 switch (elem_bt) { 2611 case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return; 2612 case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return; 2613 case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return; 2614 case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return; 2615 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2616 } 2617 } else { 2618 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2619 assert((dst->encoding() < 16),"XMM register should be 0-15"); 2620 switch (elem_bt) { 2621 case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return; 2622 case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return; 2623 case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return; 2624 case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return; 2625 case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return; 2626 case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return; 2627 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2628 } 2629 } 2630 } 2631 #endif 2632 2633 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) { 2634 switch (to_elem_bt) { 2635 case T_SHORT: 2636 vpmovsxbw(dst, src, vlen_enc); 2637 
break; 2638 case T_INT: 2639 vpmovsxbd(dst, src, vlen_enc); 2640 break; 2641 case T_FLOAT: 2642 vpmovsxbd(dst, src, vlen_enc); 2643 vcvtdq2ps(dst, dst, vlen_enc); 2644 break; 2645 case T_LONG: 2646 vpmovsxbq(dst, src, vlen_enc); 2647 break; 2648 case T_DOUBLE: { 2649 int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit; 2650 vpmovsxbd(dst, src, mid_vlen_enc); 2651 vcvtdq2pd(dst, dst, vlen_enc); 2652 break; 2653 } 2654 default: 2655 fatal("Unsupported type %s", type2name(to_elem_bt)); 2656 break; 2657 } 2658 } 2659 2660 //------------------------------------------------------------------------------------------- 2661 2662 // IndexOf for constant substrings with size >= 8 chars 2663 // which don't need to be loaded through stack. 2664 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2, 2665 Register cnt1, Register cnt2, 2666 int int_cnt2, Register result, 2667 XMMRegister vec, Register tmp, 2668 int ae) { 2669 ShortBranchVerifier sbv(this); 2670 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2671 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2672 2673 // This method uses the pcmpestri instruction with bound registers 2674 // inputs: 2675 // xmm - substring 2676 // rax - substring length (elements count) 2677 // mem - scanned string 2678 // rdx - string length (elements count) 2679 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2680 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2681 // outputs: 2682 // rcx - matched index in string 2683 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2684 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2685 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2686 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2687 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1; 2688 2689 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, 2690 RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR, 2691 MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE; 2692 2693 // Note, inline_string_indexOf() generates checks: 2694 // if (substr.count > string.count) return -1; 2695 // if (substr.count == 0) return 0; 2696 assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars"); 2697 2698 // Load substring. 2699 if (ae == StrIntrinsicNode::UL) { 2700 pmovzxbw(vec, Address(str2, 0)); 2701 } else { 2702 movdqu(vec, Address(str2, 0)); 2703 } 2704 movl(cnt2, int_cnt2); 2705 movptr(result, str1); // string addr 2706 2707 if (int_cnt2 > stride) { 2708 jmpb(SCAN_TO_SUBSTR); 2709 2710 // Reload substr for rescan, this code 2711 // is executed only for large substrings (> 8 chars) 2712 bind(RELOAD_SUBSTR); 2713 if (ae == StrIntrinsicNode::UL) { 2714 pmovzxbw(vec, Address(str2, 0)); 2715 } else { 2716 movdqu(vec, Address(str2, 0)); 2717 } 2718 negptr(cnt2); // Jumped here with negative cnt2, convert to positive 2719 2720 bind(RELOAD_STR); 2721 // We came here after the beginning of the substring was 2722 // matched but the rest of it was not so we need to search 2723 // again. Start from the next element after the previous match. 2724 2725 // cnt2 is number of substring reminding elements and 2726 // cnt1 is number of string reminding elements when cmp failed. 
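  // (Equivalently: int_cnt2 - cnt2 substring elements were consumed from the
  //  string during the failed compare; the restore below adds them back before
  //  stepping to the next candidate position.)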
2727 // Restored cnt1 = cnt1 - cnt2 + int_cnt2 2728 subl(cnt1, cnt2); 2729 addl(cnt1, int_cnt2); 2730 movl(cnt2, int_cnt2); // Now restore cnt2 2731 2732 decrementl(cnt1); // Shift to next element 2733 cmpl(cnt1, cnt2); 2734 jcc(Assembler::negative, RET_NOT_FOUND); // Left less then substring 2735 2736 addptr(result, (1<<scale1)); 2737 2738 } // (int_cnt2 > 8) 2739 2740 // Scan string for start of substr in 16-byte vectors 2741 bind(SCAN_TO_SUBSTR); 2742 pcmpestri(vec, Address(result, 0), mode); 2743 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 2744 subl(cnt1, stride); 2745 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 2746 cmpl(cnt1, cnt2); 2747 jccb(Assembler::negative, RET_NOT_FOUND); // Left less then substring 2748 addptr(result, 16); 2749 jmpb(SCAN_TO_SUBSTR); 2750 2751 // Found a potential substr 2752 bind(FOUND_CANDIDATE); 2753 // Matched whole vector if first element matched (tmp(rcx) == 0). 2754 if (int_cnt2 == stride) { 2755 jccb(Assembler::overflow, RET_FOUND); // OF == 1 2756 } else { // int_cnt2 > 8 2757 jccb(Assembler::overflow, FOUND_SUBSTR); 2758 } 2759 // After pcmpestri tmp(rcx) contains matched element index 2760 // Compute start addr of substr 2761 lea(result, Address(result, tmp, scale1)); 2762 2763 // Make sure string is still long enough 2764 subl(cnt1, tmp); 2765 cmpl(cnt1, cnt2); 2766 if (int_cnt2 == stride) { 2767 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 2768 } else { // int_cnt2 > 8 2769 jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD); 2770 } 2771 // Left less then substring. 2772 2773 bind(RET_NOT_FOUND); 2774 movl(result, -1); 2775 jmp(EXIT); 2776 2777 if (int_cnt2 > stride) { 2778 // This code is optimized for the case when whole substring 2779 // is matched if its head is matched. 2780 bind(MATCH_SUBSTR_HEAD); 2781 pcmpestri(vec, Address(result, 0), mode); 2782 // Reload only string if does not match 2783 jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0 2784 2785 Label CONT_SCAN_SUBSTR; 2786 // Compare the rest of substring (> 8 chars). 2787 bind(FOUND_SUBSTR); 2788 // First 8 chars are already matched. 
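  // The next two instructions turn cnt2 into a negative offset from the end of
  // the substring (cnt2 = stride - int_cnt2). SCAN_SUBSTR walks it up toward
  // zero in stride-sized steps, and the tail-relative addressing used at
  // CONT_SCAN_SUBSTR (tail offset + cnt2) keeps each compare window inside the
  // substring; when fewer than 'stride' elements remain, cnt2 is clamped to
  // -stride so the last window ends exactly at the substring tail.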
2789 negptr(cnt2); 2790 addptr(cnt2, stride); 2791 2792 bind(SCAN_SUBSTR); 2793 subl(cnt1, stride); 2794 cmpl(cnt2, -stride); // Do not read beyond substring 2795 jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR); 2796 // Back-up strings to avoid reading beyond substring: 2797 // cnt1 = cnt1 - cnt2 + 8 2798 addl(cnt1, cnt2); // cnt2 is negative 2799 addl(cnt1, stride); 2800 movl(cnt2, stride); negptr(cnt2); 2801 bind(CONT_SCAN_SUBSTR); 2802 if (int_cnt2 < (int)G) { 2803 int tail_off1 = int_cnt2<<scale1; 2804 int tail_off2 = int_cnt2<<scale2; 2805 if (ae == StrIntrinsicNode::UL) { 2806 pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2)); 2807 } else { 2808 movdqu(vec, Address(str2, cnt2, scale2, tail_off2)); 2809 } 2810 pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode); 2811 } else { 2812 // calculate index in register to avoid integer overflow (int_cnt2*2) 2813 movl(tmp, int_cnt2); 2814 addptr(tmp, cnt2); 2815 if (ae == StrIntrinsicNode::UL) { 2816 pmovzxbw(vec, Address(str2, tmp, scale2, 0)); 2817 } else { 2818 movdqu(vec, Address(str2, tmp, scale2, 0)); 2819 } 2820 pcmpestri(vec, Address(result, tmp, scale1, 0), mode); 2821 } 2822 // Need to reload strings pointers if not matched whole vector 2823 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 2824 addptr(cnt2, stride); 2825 jcc(Assembler::negative, SCAN_SUBSTR); 2826 // Fall through if found full substring 2827 2828 } // (int_cnt2 > 8) 2829 2830 bind(RET_FOUND); 2831 // Found result if we matched full small substring. 2832 // Compute substr offset 2833 subptr(result, str1); 2834 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 2835 shrl(result, 1); // index 2836 } 2837 bind(EXIT); 2838 2839 } // string_indexofC8 2840 2841 // Small strings are loaded through stack if they cross page boundary. 2842 void C2_MacroAssembler::string_indexof(Register str1, Register str2, 2843 Register cnt1, Register cnt2, 2844 int int_cnt2, Register result, 2845 XMMRegister vec, Register tmp, 2846 int ae) { 2847 ShortBranchVerifier sbv(this); 2848 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2849 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2850 2851 // 2852 // int_cnt2 is length of small (< 8 chars) constant substring 2853 // or (-1) for non constant substring in which case its length 2854 // is in cnt2 register. 2855 // 2856 // Note, inline_string_indexOf() generates checks: 2857 // if (substr.count > string.count) return -1; 2858 // if (substr.count == 0) return 0; 2859 // 2860 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2861 assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0"); 2862 // This method uses the pcmpestri instruction with bound registers 2863 // inputs: 2864 // xmm - substring 2865 // rax - substring length (elements count) 2866 // mem - scanned string 2867 // rdx - string length (elements count) 2868 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2869 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2870 // outputs: 2871 // rcx - matched index in string 2872 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2873 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2874 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2875 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? 
Address::times_1 : scale1; 2876 2877 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR, 2878 RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR, 2879 FOUND_CANDIDATE; 2880 2881 { //======================================================== 2882 // We don't know where these strings are located 2883 // and we can't read beyond them. Load them through stack. 2884 Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR; 2885 2886 movptr(tmp, rsp); // save old SP 2887 2888 if (int_cnt2 > 0) { // small (< 8 chars) constant substring 2889 if (int_cnt2 == (1>>scale2)) { // One byte 2890 assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding"); 2891 load_unsigned_byte(result, Address(str2, 0)); 2892 movdl(vec, result); // move 32 bits 2893 } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes 2894 // Not enough header space in 32-bit VM: 12+3 = 15. 2895 movl(result, Address(str2, -1)); 2896 shrl(result, 8); 2897 movdl(vec, result); // move 32 bits 2898 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char 2899 load_unsigned_short(result, Address(str2, 0)); 2900 movdl(vec, result); // move 32 bits 2901 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars 2902 movdl(vec, Address(str2, 0)); // move 32 bits 2903 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars 2904 movq(vec, Address(str2, 0)); // move 64 bits 2905 } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7}) 2906 // Array header size is 12 bytes in 32-bit VM 2907 // + 6 bytes for 3 chars == 18 bytes, 2908 // enough space to load vec and shift. 2909 assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity"); 2910 if (ae == StrIntrinsicNode::UL) { 2911 int tail_off = int_cnt2-8; 2912 pmovzxbw(vec, Address(str2, tail_off)); 2913 psrldq(vec, -2*tail_off); 2914 } 2915 else { 2916 int tail_off = int_cnt2*(1<<scale2); 2917 movdqu(vec, Address(str2, tail_off-16)); 2918 psrldq(vec, 16-tail_off); 2919 } 2920 } 2921 } else { // not constant substring 2922 cmpl(cnt2, stride); 2923 jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough 2924 2925 // We can read beyond string if srt+16 does not cross page boundary 2926 // since heaps are aligned and mapped by pages. 2927 assert(os::vm_page_size() < (int)G, "default page should be small"); 2928 movl(result, str2); // We need only low 32 bits 2929 andl(result, ((int)os::vm_page_size()-1)); 2930 cmpl(result, ((int)os::vm_page_size()-16)); 2931 jccb(Assembler::belowEqual, CHECK_STR); 2932 2933 // Move small strings to stack to allow load 16 bytes into vec. 2934 subptr(rsp, 16); 2935 int stk_offset = wordSize-(1<<scale2); 2936 push(cnt2); 2937 2938 bind(COPY_SUBSTR); 2939 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) { 2940 load_unsigned_byte(result, Address(str2, cnt2, scale2, -1)); 2941 movb(Address(rsp, cnt2, scale2, stk_offset), result); 2942 } else if (ae == StrIntrinsicNode::UU) { 2943 load_unsigned_short(result, Address(str2, cnt2, scale2, -2)); 2944 movw(Address(rsp, cnt2, scale2, stk_offset), result); 2945 } 2946 decrement(cnt2); 2947 jccb(Assembler::notZero, COPY_SUBSTR); 2948 2949 pop(cnt2); 2950 movptr(str2, rsp); // New substring address 2951 } // non constant 2952 2953 bind(CHECK_STR); 2954 cmpl(cnt1, stride); 2955 jccb(Assembler::aboveEqual, BIG_STRINGS); 2956 2957 // Check cross page boundary. 
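  // (Same page-offset trick as for str2 above: with a page size of e.g. 4096, an
  //  address whose offset within the page is <= 4096-16 can be read with a
  //  16-byte load without touching the next page, so such strings are scanned in
  //  place rather than copied to the stack.)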
2958 movl(result, str1); // We need only low 32 bits 2959 andl(result, ((int)os::vm_page_size()-1)); 2960 cmpl(result, ((int)os::vm_page_size()-16)); 2961 jccb(Assembler::belowEqual, BIG_STRINGS); 2962 2963 subptr(rsp, 16); 2964 int stk_offset = -(1<<scale1); 2965 if (int_cnt2 < 0) { // not constant 2966 push(cnt2); 2967 stk_offset += wordSize; 2968 } 2969 movl(cnt2, cnt1); 2970 2971 bind(COPY_STR); 2972 if (ae == StrIntrinsicNode::LL) { 2973 load_unsigned_byte(result, Address(str1, cnt2, scale1, -1)); 2974 movb(Address(rsp, cnt2, scale1, stk_offset), result); 2975 } else { 2976 load_unsigned_short(result, Address(str1, cnt2, scale1, -2)); 2977 movw(Address(rsp, cnt2, scale1, stk_offset), result); 2978 } 2979 decrement(cnt2); 2980 jccb(Assembler::notZero, COPY_STR); 2981 2982 if (int_cnt2 < 0) { // not constant 2983 pop(cnt2); 2984 } 2985 movptr(str1, rsp); // New string address 2986 2987 bind(BIG_STRINGS); 2988 // Load substring. 2989 if (int_cnt2 < 0) { // -1 2990 if (ae == StrIntrinsicNode::UL) { 2991 pmovzxbw(vec, Address(str2, 0)); 2992 } else { 2993 movdqu(vec, Address(str2, 0)); 2994 } 2995 push(cnt2); // substr count 2996 push(str2); // substr addr 2997 push(str1); // string addr 2998 } else { 2999 // Small (< 8 chars) constant substrings are loaded already. 3000 movl(cnt2, int_cnt2); 3001 } 3002 push(tmp); // original SP 3003 3004 } // Finished loading 3005 3006 //======================================================== 3007 // Start search 3008 // 3009 3010 movptr(result, str1); // string addr 3011 3012 if (int_cnt2 < 0) { // Only for non constant substring 3013 jmpb(SCAN_TO_SUBSTR); 3014 3015 // SP saved at sp+0 3016 // String saved at sp+1*wordSize 3017 // Substr saved at sp+2*wordSize 3018 // Substr count saved at sp+3*wordSize 3019 3020 // Reload substr for rescan, this code 3021 // is executed only for large substrings (> 8 chars) 3022 bind(RELOAD_SUBSTR); 3023 movptr(str2, Address(rsp, 2*wordSize)); 3024 movl(cnt2, Address(rsp, 3*wordSize)); 3025 if (ae == StrIntrinsicNode::UL) { 3026 pmovzxbw(vec, Address(str2, 0)); 3027 } else { 3028 movdqu(vec, Address(str2, 0)); 3029 } 3030 // We came here after the beginning of the substring was 3031 // matched but the rest of it was not so we need to search 3032 // again. Start from the next element after the previous match. 3033 subptr(str1, result); // Restore counter 3034 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3035 shrl(str1, 1); 3036 } 3037 addl(cnt1, str1); 3038 decrementl(cnt1); // Shift to next element 3039 cmpl(cnt1, cnt2); 3040 jcc(Assembler::negative, RET_NOT_FOUND); // Left less then substring 3041 3042 addptr(result, (1<<scale1)); 3043 } // non constant 3044 3045 // Scan string for start of substr in 16-byte vectors 3046 bind(SCAN_TO_SUBSTR); 3047 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 3048 pcmpestri(vec, Address(result, 0), mode); 3049 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 3050 subl(cnt1, stride); 3051 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 3052 cmpl(cnt1, cnt2); 3053 jccb(Assembler::negative, RET_NOT_FOUND); // Left less then substring 3054 addptr(result, 16); 3055 3056 bind(ADJUST_STR); 3057 cmpl(cnt1, stride); // Do not read beyond string 3058 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 3059 // Back-up string to avoid reading beyond string. 
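  // Fewer than 'stride' elements are left, so re-position 'result' so that the
  // final 16-byte window ends exactly at the end of the string. The window may
  // overlap data that was already scanned; re-scanning the overlap is harmless,
  // the only requirement is never to read past the string.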
3060 lea(result, Address(result, cnt1, scale1, -16)); 3061 movl(cnt1, stride); 3062 jmpb(SCAN_TO_SUBSTR); 3063 3064 // Found a potential substr 3065 bind(FOUND_CANDIDATE); 3066 // After pcmpestri tmp(rcx) contains matched element index 3067 3068 // Make sure string is still long enough 3069 subl(cnt1, tmp); 3070 cmpl(cnt1, cnt2); 3071 jccb(Assembler::greaterEqual, FOUND_SUBSTR); 3072 // Left less then substring. 3073 3074 bind(RET_NOT_FOUND); 3075 movl(result, -1); 3076 jmp(CLEANUP); 3077 3078 bind(FOUND_SUBSTR); 3079 // Compute start addr of substr 3080 lea(result, Address(result, tmp, scale1)); 3081 if (int_cnt2 > 0) { // Constant substring 3082 // Repeat search for small substring (< 8 chars) 3083 // from new point without reloading substring. 3084 // Have to check that we don't read beyond string. 3085 cmpl(tmp, stride-int_cnt2); 3086 jccb(Assembler::greater, ADJUST_STR); 3087 // Fall through if matched whole substring. 3088 } else { // non constant 3089 assert(int_cnt2 == -1, "should be != 0"); 3090 3091 addl(tmp, cnt2); 3092 // Found result if we matched whole substring. 3093 cmpl(tmp, stride); 3094 jcc(Assembler::lessEqual, RET_FOUND); 3095 3096 // Repeat search for small substring (<= 8 chars) 3097 // from new point 'str1' without reloading substring. 3098 cmpl(cnt2, stride); 3099 // Have to check that we don't read beyond string. 3100 jccb(Assembler::lessEqual, ADJUST_STR); 3101 3102 Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG; 3103 // Compare the rest of substring (> 8 chars). 3104 movptr(str1, result); 3105 3106 cmpl(tmp, cnt2); 3107 // First 8 chars are already matched. 3108 jccb(Assembler::equal, CHECK_NEXT); 3109 3110 bind(SCAN_SUBSTR); 3111 pcmpestri(vec, Address(str1, 0), mode); 3112 // Need to reload strings pointers if not matched whole vector 3113 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 3114 3115 bind(CHECK_NEXT); 3116 subl(cnt2, stride); 3117 jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring 3118 addptr(str1, 16); 3119 if (ae == StrIntrinsicNode::UL) { 3120 addptr(str2, 8); 3121 } else { 3122 addptr(str2, 16); 3123 } 3124 subl(cnt1, stride); 3125 cmpl(cnt2, stride); // Do not read beyond substring 3126 jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR); 3127 // Back-up strings to avoid reading beyond substring. 
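  // Fewer than 'stride' substring elements remain, so both pointers are pulled
  // back so that the next compare window ends exactly at the end of the
  // substring (and at the matching position in the string); the counters are
  // adjusted accordingly (cnt1 = cnt1 - cnt2 + stride, cnt2 = stride).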
3128 3129 if (ae == StrIntrinsicNode::UL) { 3130 lea(str2, Address(str2, cnt2, scale2, -8)); 3131 lea(str1, Address(str1, cnt2, scale1, -16)); 3132 } else { 3133 lea(str2, Address(str2, cnt2, scale2, -16)); 3134 lea(str1, Address(str1, cnt2, scale1, -16)); 3135 } 3136 subl(cnt1, cnt2); 3137 movl(cnt2, stride); 3138 addl(cnt1, stride); 3139 bind(CONT_SCAN_SUBSTR); 3140 if (ae == StrIntrinsicNode::UL) { 3141 pmovzxbw(vec, Address(str2, 0)); 3142 } else { 3143 movdqu(vec, Address(str2, 0)); 3144 } 3145 jmp(SCAN_SUBSTR); 3146 3147 bind(RET_FOUND_LONG); 3148 movptr(str1, Address(rsp, wordSize)); 3149 } // non constant 3150 3151 bind(RET_FOUND); 3152 // Compute substr offset 3153 subptr(result, str1); 3154 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3155 shrl(result, 1); // index 3156 } 3157 bind(CLEANUP); 3158 pop(rsp); // restore SP 3159 3160 } // string_indexof 3161 3162 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3163 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3164 ShortBranchVerifier sbv(this); 3165 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3166 3167 int stride = 8; 3168 3169 Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP, 3170 SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP, 3171 RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT, 3172 FOUND_SEQ_CHAR, DONE_LABEL; 3173 3174 movptr(result, str1); 3175 if (UseAVX >= 2) { 3176 cmpl(cnt1, stride); 3177 jcc(Assembler::less, SCAN_TO_CHAR); 3178 cmpl(cnt1, 2*stride); 3179 jcc(Assembler::less, SCAN_TO_8_CHAR_INIT); 3180 movdl(vec1, ch); 3181 vpbroadcastw(vec1, vec1, Assembler::AVX_256bit); 3182 vpxor(vec2, vec2); 3183 movl(tmp, cnt1); 3184 andl(tmp, 0xFFFFFFF0); //vector count (in chars) 3185 andl(cnt1,0x0000000F); //tail count (in chars) 3186 3187 bind(SCAN_TO_16_CHAR_LOOP); 3188 vmovdqu(vec3, Address(result, 0)); 3189 vpcmpeqw(vec3, vec3, vec1, 1); 3190 vptest(vec2, vec3); 3191 jcc(Assembler::carryClear, FOUND_CHAR); 3192 addptr(result, 32); 3193 subl(tmp, 2*stride); 3194 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); 3195 jmp(SCAN_TO_8_CHAR); 3196 bind(SCAN_TO_8_CHAR_INIT); 3197 movdl(vec1, ch); 3198 pshuflw(vec1, vec1, 0x00); 3199 pshufd(vec1, vec1, 0); 3200 pxor(vec2, vec2); 3201 } 3202 bind(SCAN_TO_8_CHAR); 3203 cmpl(cnt1, stride); 3204 jcc(Assembler::less, SCAN_TO_CHAR); 3205 if (UseAVX < 2) { 3206 movdl(vec1, ch); 3207 pshuflw(vec1, vec1, 0x00); 3208 pshufd(vec1, vec1, 0); 3209 pxor(vec2, vec2); 3210 } 3211 movl(tmp, cnt1); 3212 andl(tmp, 0xFFFFFFF8); //vector count (in chars) 3213 andl(cnt1,0x00000007); //tail count (in chars) 3214 3215 bind(SCAN_TO_8_CHAR_LOOP); 3216 movdqu(vec3, Address(result, 0)); 3217 pcmpeqw(vec3, vec1); 3218 ptest(vec2, vec3); 3219 jcc(Assembler::carryClear, FOUND_CHAR); 3220 addptr(result, 16); 3221 subl(tmp, stride); 3222 jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP); 3223 bind(SCAN_TO_CHAR); 3224 testl(cnt1, cnt1); 3225 jcc(Assembler::zero, RET_NOT_FOUND); 3226 bind(SCAN_TO_CHAR_LOOP); 3227 load_unsigned_short(tmp, Address(result, 0)); 3228 cmpl(ch, tmp); 3229 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3230 addptr(result, 2); 3231 subl(cnt1, 1); 3232 jccb(Assembler::zero, RET_NOT_FOUND); 3233 jmp(SCAN_TO_CHAR_LOOP); 3234 3235 bind(RET_NOT_FOUND); 3236 movl(result, -1); 3237 jmpb(DONE_LABEL); 3238 3239 bind(FOUND_CHAR); 3240 if (UseAVX >= 2) { 3241 vpmovmskb(tmp, vec3); 3242 } else { 3243 pmovmskb(tmp, vec3); 3244 } 3245 bsfl(ch, tmp); 3246 addptr(result, ch); 3247 3248 bind(FOUND_SEQ_CHAR); 3249 
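// At FOUND_SEQ_CHAR 'result' holds the address of the matching char; the
// subtraction and shift below convert it into a char index relative to str1.
// A plain-Java sketch of what this intrinsic computes (illustrative only,
// not the actual library source):
//
//   static int indexOfChar(char[] value, int count, int ch) {
//     for (int i = 0; i < count; i++) {
//       if (value[i] == ch) return i;
//     }
//     return -1;
//   }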
subptr(result, str1); 3250 shrl(result, 1); 3251 3252 bind(DONE_LABEL); 3253 } // string_indexof_char 3254 3255 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3256 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3257 ShortBranchVerifier sbv(this); 3258 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3259 3260 int stride = 16; 3261 3262 Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP, 3263 SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP, 3264 RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT, 3265 FOUND_SEQ_CHAR, DONE_LABEL; 3266 3267 movptr(result, str1); 3268 if (UseAVX >= 2) { 3269 cmpl(cnt1, stride); 3270 jcc(Assembler::less, SCAN_TO_CHAR_INIT); 3271 cmpl(cnt1, stride*2); 3272 jcc(Assembler::less, SCAN_TO_16_CHAR_INIT); 3273 movdl(vec1, ch); 3274 vpbroadcastb(vec1, vec1, Assembler::AVX_256bit); 3275 vpxor(vec2, vec2); 3276 movl(tmp, cnt1); 3277 andl(tmp, 0xFFFFFFE0); //vector count (in chars) 3278 andl(cnt1,0x0000001F); //tail count (in chars) 3279 3280 bind(SCAN_TO_32_CHAR_LOOP); 3281 vmovdqu(vec3, Address(result, 0)); 3282 vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit); 3283 vptest(vec2, vec3); 3284 jcc(Assembler::carryClear, FOUND_CHAR); 3285 addptr(result, 32); 3286 subl(tmp, stride*2); 3287 jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP); 3288 jmp(SCAN_TO_16_CHAR); 3289 3290 bind(SCAN_TO_16_CHAR_INIT); 3291 movdl(vec1, ch); 3292 pxor(vec2, vec2); 3293 pshufb(vec1, vec2); 3294 } 3295 3296 bind(SCAN_TO_16_CHAR); 3297 cmpl(cnt1, stride); 3298 jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left 3299 if (UseAVX < 2) { 3300 movdl(vec1, ch); 3301 pxor(vec2, vec2); 3302 pshufb(vec1, vec2); 3303 } 3304 movl(tmp, cnt1); 3305 andl(tmp, 0xFFFFFFF0); //vector count (in bytes) 3306 andl(cnt1,0x0000000F); //tail count (in bytes) 3307 3308 bind(SCAN_TO_16_CHAR_LOOP); 3309 movdqu(vec3, Address(result, 0)); 3310 pcmpeqb(vec3, vec1); 3311 ptest(vec2, vec3); 3312 jcc(Assembler::carryClear, FOUND_CHAR); 3313 addptr(result, 16); 3314 subl(tmp, stride); 3315 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items... 
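// Note on the ptest/vptest + jcc(carryClear, FOUND_CHAR) pattern used by the
// vector loops above (here and in string_indexof_char): the first operand
// (vec2) is kept all-zero, so PTEST leaves ZF set and computes
// CF = ((vec3 & ~vec2) == 0) = (vec3 == 0). CF therefore stays set while no
// lane of the pcmpeq result matched and is cleared as soon as any lane is
// non-zero, which is exactly the FOUND_CHAR exit condition.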
3316 3317 bind(SCAN_TO_CHAR_INIT); 3318 testl(cnt1, cnt1); 3319 jcc(Assembler::zero, RET_NOT_FOUND); 3320 bind(SCAN_TO_CHAR_LOOP); 3321 load_unsigned_byte(tmp, Address(result, 0)); 3322 cmpl(ch, tmp); 3323 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3324 addptr(result, 1); 3325 subl(cnt1, 1); 3326 jccb(Assembler::zero, RET_NOT_FOUND); 3327 jmp(SCAN_TO_CHAR_LOOP); 3328 3329 bind(RET_NOT_FOUND); 3330 movl(result, -1); 3331 jmpb(DONE_LABEL); 3332 3333 bind(FOUND_CHAR); 3334 if (UseAVX >= 2) { 3335 vpmovmskb(tmp, vec3); 3336 } else { 3337 pmovmskb(tmp, vec3); 3338 } 3339 bsfl(ch, tmp); 3340 addptr(result, ch); 3341 3342 bind(FOUND_SEQ_CHAR); 3343 subptr(result, str1); 3344 3345 bind(DONE_LABEL); 3346 } // stringL_indexof_char 3347 3348 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) { 3349 switch (eltype) { 3350 case T_BOOLEAN: return sizeof(jboolean); 3351 case T_BYTE: return sizeof(jbyte); 3352 case T_SHORT: return sizeof(jshort); 3353 case T_CHAR: return sizeof(jchar); 3354 case T_INT: return sizeof(jint); 3355 default: 3356 ShouldNotReachHere(); 3357 return -1; 3358 } 3359 } 3360 3361 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) { 3362 switch (eltype) { 3363 // T_BOOLEAN used as surrogate for unsigned byte 3364 case T_BOOLEAN: movzbl(dst, src); break; 3365 case T_BYTE: movsbl(dst, src); break; 3366 case T_SHORT: movswl(dst, src); break; 3367 case T_CHAR: movzwl(dst, src); break; 3368 case T_INT: movl(dst, src); break; 3369 default: 3370 ShouldNotReachHere(); 3371 } 3372 } 3373 3374 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) { 3375 load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8); 3376 } 3377 3378 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) { 3379 load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8); 3380 } 3381 3382 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) { 3383 const int vlen = Assembler::AVX_256bit; 3384 switch (eltype) { 3385 case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3386 case T_BYTE: vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3387 case T_SHORT: vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3388 case T_CHAR: vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3389 case T_INT: 3390 // do nothing 3391 break; 3392 default: 3393 ShouldNotReachHere(); 3394 } 3395 } 3396 3397 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result, 3398 Register index, Register tmp2, Register tmp3, XMMRegister vnext, 3399 XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3, 3400 XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3, 3401 XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3, 3402 BasicType eltype) { 3403 ShortBranchVerifier sbv(this); 3404 assert(UseAVX >= 2, "AVX2 intrinsics are required"); 3405 assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3); 3406 assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3); 3407 3408 Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN, 3409 SHORT_UNROLLED_LOOP_EXIT, 3410 UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME, 3411 UNROLLED_VECTOR_LOOP_BEGIN, 3412 END; 3413 switch (eltype) { 3414 case T_BOOLEAN: 
BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break; 3415 case T_CHAR: BLOCK_COMMENT("arrays_hashcode(char) {"); break; 3416 case T_BYTE: BLOCK_COMMENT("arrays_hashcode(byte) {"); break; 3417 case T_SHORT: BLOCK_COMMENT("arrays_hashcode(short) {"); break; 3418 case T_INT: BLOCK_COMMENT("arrays_hashcode(int) {"); break; 3419 default: BLOCK_COMMENT("arrays_hashcode {"); break; 3420 } 3421 3422 // For "renaming" for readibility of the code 3423 const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 }, 3424 vresult[] = { vresult0, vresult1, vresult2, vresult3 }, 3425 vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 }; 3426 3427 const int elsize = arrays_hashcode_elsize(eltype); 3428 3429 /* 3430 if (cnt1 >= 2) { 3431 if (cnt1 >= 32) { 3432 UNROLLED VECTOR LOOP 3433 } 3434 UNROLLED SCALAR LOOP 3435 } 3436 SINGLE SCALAR 3437 */ 3438 3439 cmpl(cnt1, 32); 3440 jcc(Assembler::less, SHORT_UNROLLED_BEGIN); 3441 3442 // cnt1 >= 32 && generate_vectorized_loop 3443 xorl(index, index); 3444 3445 // vresult = IntVector.zero(I256); 3446 for (int idx = 0; idx < 4; idx++) { 3447 vpxor(vresult[idx], vresult[idx]); 3448 } 3449 // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]); 3450 Register bound = tmp2; 3451 Register next = tmp3; 3452 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint)))); 3453 movl(next, Address(tmp2, 0)); 3454 movdl(vnext, next); 3455 vpbroadcastd(vnext, vnext, Assembler::AVX_256bit); 3456 3457 // index = 0; 3458 // bound = cnt1 & ~(32 - 1); 3459 movl(bound, cnt1); 3460 andl(bound, ~(32 - 1)); 3461 // for (; index < bound; index += 32) { 3462 bind(UNROLLED_VECTOR_LOOP_BEGIN); 3463 // result *= next; 3464 imull(result, next); 3465 // loop fission to upfront the cost of fetching from memory, OOO execution 3466 // can then hopefully do a better job of prefetching 3467 for (int idx = 0; idx < 4; idx++) { 3468 arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype); 3469 } 3470 // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7]; 3471 for (int idx = 0; idx < 4; idx++) { 3472 vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit); 3473 arrays_hashcode_elvcast(vtmp[idx], eltype); 3474 vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit); 3475 } 3476 // index += 32; 3477 addl(index, 32); 3478 // index < bound; 3479 cmpl(index, bound); 3480 jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN); 3481 // } 3482 3483 lea(ary1, Address(ary1, bound, Address::times(elsize))); 3484 subl(cnt1, bound); 3485 // release bound 3486 3487 // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1); 3488 for (int idx = 0; idx < 4; idx++) { 3489 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint)))); 3490 arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT); 3491 vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit); 3492 } 3493 // result += vresult.reduceLanes(ADD); 3494 for (int idx = 0; idx < 4; idx++) { 3495 reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]); 3496 } 3497 3498 // } else if (cnt1 < 32) { 3499 3500 bind(SHORT_UNROLLED_BEGIN); 3501 // int i = 1; 3502 movl(index, 1); 3503 cmpl(index, cnt1); 3504 jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT); 3505 3506 // for (; i < cnt1 ; i += 2) { 3507 bind(SHORT_UNROLLED_LOOP_BEGIN); 3508 movl(tmp3, 961); 3509 imull(result, tmp3); 3510 
arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3511 movl(tmp3, tmp2); 3512 shll(tmp3, 5); 3513 subl(tmp3, tmp2); 3514 addl(result, tmp3); 3515 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype); 3516 addl(result, tmp3); 3517 addl(index, 2); 3518 cmpl(index, cnt1); 3519 jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN); 3520 3521 // } 3522 // if (i >= cnt1) { 3523 bind(SHORT_UNROLLED_LOOP_EXIT); 3524 jccb(Assembler::greater, END); 3525 movl(tmp2, result); 3526 shll(result, 5); 3527 subl(result, tmp2); 3528 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3529 addl(result, tmp3); 3530 // } 3531 bind(END); 3532 3533 BLOCK_COMMENT("} // arrays_hashcode"); 3534 3535 } // arrays_hashcode 3536 3537 // helper function for string_compare 3538 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2, 3539 Address::ScaleFactor scale, Address::ScaleFactor scale1, 3540 Address::ScaleFactor scale2, Register index, int ae) { 3541 if (ae == StrIntrinsicNode::LL) { 3542 load_unsigned_byte(elem1, Address(str1, index, scale, 0)); 3543 load_unsigned_byte(elem2, Address(str2, index, scale, 0)); 3544 } else if (ae == StrIntrinsicNode::UU) { 3545 load_unsigned_short(elem1, Address(str1, index, scale, 0)); 3546 load_unsigned_short(elem2, Address(str2, index, scale, 0)); 3547 } else { 3548 load_unsigned_byte(elem1, Address(str1, index, scale1, 0)); 3549 load_unsigned_short(elem2, Address(str2, index, scale2, 0)); 3550 } 3551 } 3552 3553 // Compare strings, used for char[] and byte[]. 3554 void C2_MacroAssembler::string_compare(Register str1, Register str2, 3555 Register cnt1, Register cnt2, Register result, 3556 XMMRegister vec1, int ae, KRegister mask) { 3557 ShortBranchVerifier sbv(this); 3558 Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL; 3559 Label COMPARE_WIDE_VECTORS_LOOP_FAILED; // used only _LP64 && AVX3 3560 int stride, stride2, adr_stride, adr_stride1, adr_stride2; 3561 int stride2x2 = 0x40; 3562 Address::ScaleFactor scale = Address::no_scale; 3563 Address::ScaleFactor scale1 = Address::no_scale; 3564 Address::ScaleFactor scale2 = Address::no_scale; 3565 3566 if (ae != StrIntrinsicNode::LL) { 3567 stride2x2 = 0x20; 3568 } 3569 3570 if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) { 3571 shrl(cnt2, 1); 3572 } 3573 // Compute the minimum of the string lengths and the 3574 // difference of the string lengths (stack). 3575 // Do the conditional move stuff 3576 movl(result, cnt1); 3577 subl(cnt1, cnt2); 3578 push(cnt1); 3579 cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2) 3580 3581 // Is the minimum length zero? 
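// For orientation, the comparison contract implemented below, shown for the
// UU case as a plain-Java sketch (illustrative only; the byte[] and mixed
// encodings are analogous):
//
//   static int compare(char[] a, int alen, char[] b, int blen) {
//     int min = Math.min(alen, blen);
//     for (int i = 0; i < min; i++) {
//       if (a[i] != b[i]) return a[i] - b[i];   // first mismatch decides
//     }
//     return alen - blen;                       // otherwise length difference
//   }
//
// The length difference was pushed above; it is popped at LENGTH_DIFF_LABEL
// (strings equal up to min) or discarded at POP_LABEL (a mismatching element
// decided first). The zero-minimum case is handled first, just below.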
3582 testl(cnt2, cnt2); 3583 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3584 if (ae == StrIntrinsicNode::LL) { 3585 // Load first bytes 3586 load_unsigned_byte(result, Address(str1, 0)); // result = str1[0] 3587 load_unsigned_byte(cnt1, Address(str2, 0)); // cnt1 = str2[0] 3588 } else if (ae == StrIntrinsicNode::UU) { 3589 // Load first characters 3590 load_unsigned_short(result, Address(str1, 0)); 3591 load_unsigned_short(cnt1, Address(str2, 0)); 3592 } else { 3593 load_unsigned_byte(result, Address(str1, 0)); 3594 load_unsigned_short(cnt1, Address(str2, 0)); 3595 } 3596 subl(result, cnt1); 3597 jcc(Assembler::notZero, POP_LABEL); 3598 3599 if (ae == StrIntrinsicNode::UU) { 3600 // Divide length by 2 to get number of chars 3601 shrl(cnt2, 1); 3602 } 3603 cmpl(cnt2, 1); 3604 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3605 3606 // Check if the strings start at the same location and setup scale and stride 3607 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3608 cmpptr(str1, str2); 3609 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3610 if (ae == StrIntrinsicNode::LL) { 3611 scale = Address::times_1; 3612 stride = 16; 3613 } else { 3614 scale = Address::times_2; 3615 stride = 8; 3616 } 3617 } else { 3618 scale1 = Address::times_1; 3619 scale2 = Address::times_2; 3620 // scale not used 3621 stride = 8; 3622 } 3623 3624 if (UseAVX >= 2 && UseSSE42Intrinsics) { 3625 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR; 3626 Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR; 3627 Label COMPARE_WIDE_VECTORS_LOOP_AVX2; 3628 Label COMPARE_TAIL_LONG; 3629 Label COMPARE_WIDE_VECTORS_LOOP_AVX3; // used only _LP64 && AVX3 3630 3631 int pcmpmask = 0x19; 3632 if (ae == StrIntrinsicNode::LL) { 3633 pcmpmask &= ~0x01; 3634 } 3635 3636 // Setup to compare 16-chars (32-bytes) vectors, 3637 // start from first character again because it has aligned address. 3638 if (ae == StrIntrinsicNode::LL) { 3639 stride2 = 32; 3640 } else { 3641 stride2 = 16; 3642 } 3643 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3644 adr_stride = stride << scale; 3645 } else { 3646 adr_stride1 = 8; //stride << scale1; 3647 adr_stride2 = 16; //stride << scale2; 3648 } 3649 3650 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3651 // rax and rdx are used by pcmpestri as elements counters 3652 movl(result, cnt2); 3653 andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count 3654 jcc(Assembler::zero, COMPARE_TAIL_LONG); 3655 3656 // fast path : compare first 2 8-char vectors. 
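// The pcmpestri immediate used below (pcmpmask = 0x19, or 0x18 for LL)
// selects unsigned words (or bytes), "equal each" aggregation and negative
// polarity, reporting the least-significant index in rcx: CF is set when any
// element pair differs, so jcc(below, ...) branches to the mismatch handler
// with rcx holding the first differing element index.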
3657 bind(COMPARE_16_CHARS); 3658 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3659 movdqu(vec1, Address(str1, 0)); 3660 } else { 3661 pmovzxbw(vec1, Address(str1, 0)); 3662 } 3663 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3664 jccb(Assembler::below, COMPARE_INDEX_CHAR); 3665 3666 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3667 movdqu(vec1, Address(str1, adr_stride)); 3668 pcmpestri(vec1, Address(str2, adr_stride), pcmpmask); 3669 } else { 3670 pmovzxbw(vec1, Address(str1, adr_stride1)); 3671 pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask); 3672 } 3673 jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS); 3674 addl(cnt1, stride); 3675 3676 // Compare the characters at index in cnt1 3677 bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character 3678 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3679 subl(result, cnt2); 3680 jmp(POP_LABEL); 3681 3682 // Setup the registers to start vector comparison loop 3683 bind(COMPARE_WIDE_VECTORS); 3684 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3685 lea(str1, Address(str1, result, scale)); 3686 lea(str2, Address(str2, result, scale)); 3687 } else { 3688 lea(str1, Address(str1, result, scale1)); 3689 lea(str2, Address(str2, result, scale2)); 3690 } 3691 subl(result, stride2); 3692 subl(cnt2, stride2); 3693 jcc(Assembler::zero, COMPARE_WIDE_TAIL); 3694 negptr(result); 3695 3696 // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest) 3697 bind(COMPARE_WIDE_VECTORS_LOOP); 3698 3699 #ifdef _LP64 3700 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 3701 cmpl(cnt2, stride2x2); 3702 jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2); 3703 testl(cnt2, stride2x2-1); // cnt2 holds the vector count 3704 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // means we cannot subtract by 0x40 3705 3706 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 3707 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3708 evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit); 3709 evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3710 } else { 3711 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit); 3712 evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3713 } 3714 kortestql(mask, mask); 3715 jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare 3716 addptr(result, stride2x2); // update since we already compared at this addr 3717 subl(cnt2, stride2x2); // and sub the size too 3718 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3); 3719 3720 vpxor(vec1, vec1); 3721 jmpb(COMPARE_WIDE_TAIL); 3722 }//if (VM_Version::supports_avx512vlbw()) 3723 #endif // _LP64 3724 3725 3726 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3727 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3728 vmovdqu(vec1, Address(str1, result, scale)); 3729 vpxor(vec1, Address(str2, result, scale)); 3730 } else { 3731 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit); 3732 vpxor(vec1, Address(str2, result, scale2)); 3733 } 3734 vptest(vec1, vec1); 3735 jcc(Assembler::notZero, VECTOR_NOT_EQUAL); 3736 addptr(result, stride2); 3737 subl(cnt2, stride2); 3738 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP); 3739 // clean upper bits of YMM registers 
3740 vpxor(vec1, vec1); 3741 3742 // compare wide vectors tail 3743 bind(COMPARE_WIDE_TAIL); 3744 testptr(result, result); 3745 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3746 3747 movl(result, stride2); 3748 movl(cnt2, result); 3749 negptr(result); 3750 jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3751 3752 // Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors. 3753 bind(VECTOR_NOT_EQUAL); 3754 // clean upper bits of YMM registers 3755 vpxor(vec1, vec1); 3756 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3757 lea(str1, Address(str1, result, scale)); 3758 lea(str2, Address(str2, result, scale)); 3759 } else { 3760 lea(str1, Address(str1, result, scale1)); 3761 lea(str2, Address(str2, result, scale2)); 3762 } 3763 jmp(COMPARE_16_CHARS); 3764 3765 // Compare tail chars, length between 1 to 15 chars 3766 bind(COMPARE_TAIL_LONG); 3767 movl(cnt2, result); 3768 cmpl(cnt2, stride); 3769 jcc(Assembler::less, COMPARE_SMALL_STR); 3770 3771 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3772 movdqu(vec1, Address(str1, 0)); 3773 } else { 3774 pmovzxbw(vec1, Address(str1, 0)); 3775 } 3776 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3777 jcc(Assembler::below, COMPARE_INDEX_CHAR); 3778 subptr(cnt2, stride); 3779 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3780 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3781 lea(str1, Address(str1, result, scale)); 3782 lea(str2, Address(str2, result, scale)); 3783 } else { 3784 lea(str1, Address(str1, result, scale1)); 3785 lea(str2, Address(str2, result, scale2)); 3786 } 3787 negptr(cnt2); 3788 jmpb(WHILE_HEAD_LABEL); 3789 3790 bind(COMPARE_SMALL_STR); 3791 } else if (UseSSE42Intrinsics) { 3792 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL; 3793 int pcmpmask = 0x19; 3794 // Setup to compare 8-char (16-byte) vectors, 3795 // start from first character again because it has aligned address. 
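// The vector loops in this method use the usual negative-index idiom: the
// string pointers are pre-advanced past the region to compare and then
// addressed with a negative index register that is stepped toward zero
// (negptr below), so Address(str, result, scale) walks forward through the
// data. Roughly (illustrative):
//
//   for (int i = -count; /* work remains */; i += stride) {
//     compareVector(a, aEnd + i, b, bEnd + i);   // pseudo-helper
//   }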
3796 movl(result, cnt2); 3797 andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count 3798 if (ae == StrIntrinsicNode::LL) { 3799 pcmpmask &= ~0x01; 3800 } 3801 jcc(Assembler::zero, COMPARE_TAIL); 3802 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3803 lea(str1, Address(str1, result, scale)); 3804 lea(str2, Address(str2, result, scale)); 3805 } else { 3806 lea(str1, Address(str1, result, scale1)); 3807 lea(str2, Address(str2, result, scale2)); 3808 } 3809 negptr(result); 3810 3811 // pcmpestri 3812 // inputs: 3813 // vec1- substring 3814 // rax - negative string length (elements count) 3815 // mem - scanned string 3816 // rdx - string length (elements count) 3817 // pcmpmask - cmp mode: 11000 (string compare with negated result) 3818 // + 00 (unsigned bytes) or + 01 (unsigned shorts) 3819 // outputs: 3820 // rcx - first mismatched element index 3821 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3822 3823 bind(COMPARE_WIDE_VECTORS); 3824 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3825 movdqu(vec1, Address(str1, result, scale)); 3826 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 3827 } else { 3828 pmovzxbw(vec1, Address(str1, result, scale1)); 3829 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 3830 } 3831 // After pcmpestri cnt1(rcx) contains mismatched element index 3832 3833 jccb(Assembler::below, VECTOR_NOT_EQUAL); // CF==1 3834 addptr(result, stride); 3835 subptr(cnt2, stride); 3836 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 3837 3838 // compare wide vectors tail 3839 testptr(result, result); 3840 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3841 3842 movl(cnt2, stride); 3843 movl(result, stride); 3844 negptr(result); 3845 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3846 movdqu(vec1, Address(str1, result, scale)); 3847 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 3848 } else { 3849 pmovzxbw(vec1, Address(str1, result, scale1)); 3850 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 3851 } 3852 jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL); 3853 3854 // Mismatched characters in the vectors 3855 bind(VECTOR_NOT_EQUAL); 3856 addptr(cnt1, result); 3857 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3858 subl(result, cnt2); 3859 jmpb(POP_LABEL); 3860 3861 bind(COMPARE_TAIL); // limit is zero 3862 movl(cnt2, result); 3863 // Fallthru to tail compare 3864 } 3865 // Shift str2 and str1 to the end of the arrays, negate min 3866 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3867 lea(str1, Address(str1, cnt2, scale)); 3868 lea(str2, Address(str2, cnt2, scale)); 3869 } else { 3870 lea(str1, Address(str1, cnt2, scale1)); 3871 lea(str2, Address(str2, cnt2, scale2)); 3872 } 3873 decrementl(cnt2); // first character was compared already 3874 negptr(cnt2); 3875 3876 // Compare the rest of the elements 3877 bind(WHILE_HEAD_LABEL); 3878 load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae); 3879 subl(result, cnt1); 3880 jccb(Assembler::notZero, POP_LABEL); 3881 increment(cnt2); 3882 jccb(Assembler::notZero, WHILE_HEAD_LABEL); 3883 3884 // Strings are equal up to min length. Return the length difference. 
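// Worked example of the adjustment below (illustrative): for UU the counts
// pushed above were byte lengths, so with str1 = 10 chars and str2 = 7 chars
// the stored difference is 20 - 14 = 6; the arithmetic shift at
// LENGTH_DIFF_LABEL turns that into the char difference 3 (sarl rather than
// shrl, so a negative difference keeps its sign).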
3885 bind(LENGTH_DIFF_LABEL); 3886 pop(result); 3887 if (ae == StrIntrinsicNode::UU) { 3888 // Divide diff by 2 to get number of chars 3889 sarl(result, 1); 3890 } 3891 jmpb(DONE_LABEL); 3892 3893 #ifdef _LP64 3894 if (VM_Version::supports_avx512vlbw()) { 3895 3896 bind(COMPARE_WIDE_VECTORS_LOOP_FAILED); 3897 3898 kmovql(cnt1, mask); 3899 notq(cnt1); 3900 bsfq(cnt2, cnt1); 3901 if (ae != StrIntrinsicNode::LL) { 3902 // Divide diff by 2 to get number of chars 3903 sarl(cnt2, 1); 3904 } 3905 addq(result, cnt2); 3906 if (ae == StrIntrinsicNode::LL) { 3907 load_unsigned_byte(cnt1, Address(str2, result)); 3908 load_unsigned_byte(result, Address(str1, result)); 3909 } else if (ae == StrIntrinsicNode::UU) { 3910 load_unsigned_short(cnt1, Address(str2, result, scale)); 3911 load_unsigned_short(result, Address(str1, result, scale)); 3912 } else { 3913 load_unsigned_short(cnt1, Address(str2, result, scale2)); 3914 load_unsigned_byte(result, Address(str1, result, scale1)); 3915 } 3916 subl(result, cnt1); 3917 jmpb(POP_LABEL); 3918 }//if (VM_Version::supports_avx512vlbw()) 3919 #endif // _LP64 3920 3921 // Discard the stored length difference 3922 bind(POP_LABEL); 3923 pop(cnt1); 3924 3925 // That's it 3926 bind(DONE_LABEL); 3927 if(ae == StrIntrinsicNode::UL) { 3928 negl(result); 3929 } 3930 3931 } 3932 3933 // Search for Non-ASCII character (Negative byte value) in a byte array, 3934 // return the index of the first such character, otherwise the length 3935 // of the array segment searched. 3936 // ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java 3937 // @IntrinsicCandidate 3938 // public static int countPositives(byte[] ba, int off, int len) { 3939 // for (int i = off; i < off + len; i++) { 3940 // if (ba[i] < 0) { 3941 // return i - off; 3942 // } 3943 // } 3944 // return len; 3945 // } 3946 void C2_MacroAssembler::count_positives(Register ary1, Register len, 3947 Register result, Register tmp1, 3948 XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) { 3949 // rsi: byte array 3950 // rcx: len 3951 // rax: result 3952 ShortBranchVerifier sbv(this); 3953 assert_different_registers(ary1, len, result, tmp1); 3954 assert_different_registers(vec1, vec2); 3955 Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE; 3956 3957 movl(result, len); // copy 3958 // len == 0 3959 testl(len, len); 3960 jcc(Assembler::zero, DONE); 3961 3962 if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512 3963 VM_Version::supports_avx512vlbw() && 3964 VM_Version::supports_bmi2()) { 3965 3966 Label test_64_loop, test_tail, BREAK_LOOP; 3967 movl(tmp1, len); 3968 vpxor(vec2, vec2, vec2, Assembler::AVX_512bit); 3969 3970 andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F 3971 andl(len, 0xffffffc0); // vector count (in chars) 3972 jccb(Assembler::zero, test_tail); 3973 3974 lea(ary1, Address(ary1, len, Address::times_1)); 3975 negptr(len); 3976 3977 bind(test_64_loop); 3978 // Check whether our 64 elements of size byte contain negatives 3979 evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit); 3980 kortestql(mask1, mask1); 3981 jcc(Assembler::notZero, BREAK_LOOP); 3982 3983 addptr(len, 64); 3984 jccb(Assembler::notZero, test_64_loop); 3985 3986 bind(test_tail); 3987 // bail out when there is nothing to be done 3988 testl(tmp1, -1); 3989 jcc(Assembler::zero, DONE); 3990 3991 3992 // check the tail for absense of negatives 3993 // ~(~0 << len) applied up to two times (for 32-bit scenario) 3994 #ifdef _LP64 3995 { 3996 
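// Build a k-register mask with exactly tmp1 low bits set, i.e. ~(~0L << tmp1)
// as announced above. Worked example (illustrative): for a tail of tmp1 == 5
// bytes,
//   ~0L << 5 = ...1111100000
//   ~( .. )  =       0b11111
// so only the five tail bytes take part in the masked compare (evpcmpgtb)
// and the ktestq that follow.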
Register tmp3_aliased = len; 3997 mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF); 3998 shlxq(tmp3_aliased, tmp3_aliased, tmp1); 3999 notq(tmp3_aliased); 4000 kmovql(mask2, tmp3_aliased); 4001 } 4002 #else 4003 Label k_init; 4004 jmp(k_init); 4005 4006 // We could not read 64-bits from a general purpose register thus we move 4007 // data required to compose 64 1's to the instruction stream 4008 // We emit 64 byte wide series of elements from 0..63 which later on would 4009 // be used as a compare targets with tail count contained in tmp1 register. 4010 // Result would be a k register having tmp1 consecutive number or 1 4011 // counting from least significant bit. 4012 address tmp = pc(); 4013 emit_int64(0x0706050403020100); 4014 emit_int64(0x0F0E0D0C0B0A0908); 4015 emit_int64(0x1716151413121110); 4016 emit_int64(0x1F1E1D1C1B1A1918); 4017 emit_int64(0x2726252423222120); 4018 emit_int64(0x2F2E2D2C2B2A2928); 4019 emit_int64(0x3736353433323130); 4020 emit_int64(0x3F3E3D3C3B3A3938); 4021 4022 bind(k_init); 4023 lea(len, InternalAddress(tmp)); 4024 // create mask to test for negative byte inside a vector 4025 evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit); 4026 evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit); 4027 4028 #endif 4029 evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit); 4030 ktestq(mask1, mask2); 4031 jcc(Assembler::zero, DONE); 4032 4033 // do a full check for negative registers in the tail 4034 movl(len, tmp1); // tmp1 holds low 6-bit from original len; 4035 // ary1 already pointing to the right place 4036 jmpb(TAIL_START); 4037 4038 bind(BREAK_LOOP); 4039 // At least one byte in the last 64 byte block was negative. 4040 // Set up to look at the last 64 bytes as if they were a tail 4041 lea(ary1, Address(ary1, len, Address::times_1)); 4042 addptr(result, len); 4043 // Ignore the very last byte: if all others are positive, 4044 // it must be negative, so we can skip right to the 2+1 byte 4045 // end comparison at this point 4046 orl(result, 63); 4047 movl(len, 63); 4048 // Fallthru to tail compare 4049 } else { 4050 4051 if (UseAVX >= 2 && UseSSE >= 2) { 4052 // With AVX2, use 32-byte vector compare 4053 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 4054 4055 // Compare 32-byte vectors 4056 testl(len, 0xffffffe0); // vector count (in bytes) 4057 jccb(Assembler::zero, TAIL_START); 4058 4059 andl(len, 0xffffffe0); 4060 lea(ary1, Address(ary1, len, Address::times_1)); 4061 negptr(len); 4062 4063 movl(tmp1, 0x80808080); // create mask to test for Unicode chars in vector 4064 movdl(vec2, tmp1); 4065 vpbroadcastd(vec2, vec2, Assembler::AVX_256bit); 4066 4067 bind(COMPARE_WIDE_VECTORS); 4068 vmovdqu(vec1, Address(ary1, len, Address::times_1)); 4069 vptest(vec1, vec2); 4070 jccb(Assembler::notZero, BREAK_LOOP); 4071 addptr(len, 32); 4072 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4073 4074 testl(result, 0x0000001f); // any bytes remaining? 4075 jcc(Assembler::zero, DONE); 4076 4077 // Quick test using the already prepared vector mask 4078 movl(len, result); 4079 andl(len, 0x0000001f); 4080 vmovdqu(vec1, Address(ary1, len, Address::times_1, -32)); 4081 vptest(vec1, vec2); 4082 jcc(Assembler::zero, DONE); 4083 // There are zeros, jump to the tail to determine exactly where 4084 jmpb(TAIL_START); 4085 4086 bind(BREAK_LOOP); 4087 // At least one byte in the last 32-byte vector is negative. 
4088 // Set up to look at the last 32 bytes as if they were a tail 4089 lea(ary1, Address(ary1, len, Address::times_1)); 4090 addptr(result, len); 4091 // Ignore the very last byte: if all others are positive, 4092 // it must be negative, so we can skip right to the 2+1 byte 4093 // end comparison at this point 4094 orl(result, 31); 4095 movl(len, 31); 4096 // Fallthru to tail compare 4097 } else if (UseSSE42Intrinsics) { 4098 // With SSE4.2, use double quad vector compare 4099 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 4100 4101 // Compare 16-byte vectors 4102 testl(len, 0xfffffff0); // vector count (in bytes) 4103 jcc(Assembler::zero, TAIL_START); 4104 4105 andl(len, 0xfffffff0); 4106 lea(ary1, Address(ary1, len, Address::times_1)); 4107 negptr(len); 4108 4109 movl(tmp1, 0x80808080); 4110 movdl(vec2, tmp1); 4111 pshufd(vec2, vec2, 0); 4112 4113 bind(COMPARE_WIDE_VECTORS); 4114 movdqu(vec1, Address(ary1, len, Address::times_1)); 4115 ptest(vec1, vec2); 4116 jccb(Assembler::notZero, BREAK_LOOP); 4117 addptr(len, 16); 4118 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4119 4120 testl(result, 0x0000000f); // len is zero, any bytes remaining? 4121 jcc(Assembler::zero, DONE); 4122 4123 // Quick test using the already prepared vector mask 4124 movl(len, result); 4125 andl(len, 0x0000000f); // tail count (in bytes) 4126 movdqu(vec1, Address(ary1, len, Address::times_1, -16)); 4127 ptest(vec1, vec2); 4128 jcc(Assembler::zero, DONE); 4129 jmpb(TAIL_START); 4130 4131 bind(BREAK_LOOP); 4132 // At least one byte in the last 16-byte vector is negative. 4133 // Set up and look at the last 16 bytes as if they were a tail 4134 lea(ary1, Address(ary1, len, Address::times_1)); 4135 addptr(result, len); 4136 // Ignore the very last byte: if all others are positive, 4137 // it must be negative, so we can skip right to the 2+1 byte 4138 // end comparison at this point 4139 orl(result, 15); 4140 movl(len, 15); 4141 // Fallthru to tail compare 4142 } 4143 } 4144 4145 bind(TAIL_START); 4146 // Compare 4-byte vectors 4147 andl(len, 0xfffffffc); // vector count (in bytes) 4148 jccb(Assembler::zero, COMPARE_CHAR); 4149 4150 lea(ary1, Address(ary1, len, Address::times_1)); 4151 negptr(len); 4152 4153 bind(COMPARE_VECTORS); 4154 movl(tmp1, Address(ary1, len, Address::times_1)); 4155 andl(tmp1, 0x80808080); 4156 jccb(Assembler::notZero, TAIL_ADJUST); 4157 addptr(len, 4); 4158 jccb(Assembler::notZero, COMPARE_VECTORS); 4159 4160 // Compare trailing char (final 2-3 bytes), if any 4161 bind(COMPARE_CHAR); 4162 4163 testl(result, 0x2); // tail char 4164 jccb(Assembler::zero, COMPARE_BYTE); 4165 load_unsigned_short(tmp1, Address(ary1, 0)); 4166 andl(tmp1, 0x00008080); 4167 jccb(Assembler::notZero, CHAR_ADJUST); 4168 lea(ary1, Address(ary1, 2)); 4169 4170 bind(COMPARE_BYTE); 4171 testl(result, 0x1); // tail byte 4172 jccb(Assembler::zero, DONE); 4173 load_unsigned_byte(tmp1, Address(ary1, 0)); 4174 testl(tmp1, 0x00000080); 4175 jccb(Assembler::zero, DONE); 4176 subptr(result, 1); 4177 jmpb(DONE); 4178 4179 bind(TAIL_ADJUST); 4180 // there are negative bits in the last 4 byte block. 4181 // Adjust result and check the next three bytes 4182 addptr(result, len); 4183 orl(result, 3); 4184 lea(ary1, Address(ary1, len, Address::times_1)); 4185 jmpb(COMPARE_CHAR); 4186 4187 bind(CHAR_ADJUST); 4188 // We are looking at a char + optional byte tail, and found that one 4189 // of the bytes in the char is negative. Adjust the result, check the 4190 // first byte and readjust if needed. 
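// Worked example (illustrative): for a 6-byte array whose trailing char
// (bytes 4 and 5) holds the first negative byte, 'result' arrives here as 6;
// the andl below rounds it down to 4, and it is bumped to 5 only when the
// low-address byte (tested via 0x80, little-endian) is positive and byte 5
// is the negative one.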
4191 andl(result, 0xfffffffc); 4192 testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first 4193 jccb(Assembler::notZero, DONE); 4194 addptr(result, 1); 4195 4196 // That's it 4197 bind(DONE); 4198 if (UseAVX >= 2 && UseSSE >= 2) { 4199 // clean upper bits of YMM registers 4200 vpxor(vec1, vec1); 4201 vpxor(vec2, vec2); 4202 } 4203 } 4204 4205 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings. 4206 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2, 4207 Register limit, Register result, Register chr, 4208 XMMRegister vec1, XMMRegister vec2, bool is_char, 4209 KRegister mask, bool expand_ary2) { 4210 // for expand_ary2, limit is the (smaller) size of the second array. 4211 ShortBranchVerifier sbv(this); 4212 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE; 4213 4214 assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)), 4215 "Expansion only implemented for AVX2"); 4216 4217 int length_offset = arrayOopDesc::length_offset_in_bytes(); 4218 int base_offset = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE); 4219 4220 Address::ScaleFactor scaleFactor = expand_ary2 ? Address::times_2 : Address::times_1; 4221 int scaleIncr = expand_ary2 ? 8 : 16; 4222 4223 if (is_array_equ) { 4224 // Check the input args 4225 cmpoop(ary1, ary2); 4226 jcc(Assembler::equal, TRUE_LABEL); 4227 4228 // Need additional checks for arrays_equals. 4229 testptr(ary1, ary1); 4230 jcc(Assembler::zero, FALSE_LABEL); 4231 testptr(ary2, ary2); 4232 jcc(Assembler::zero, FALSE_LABEL); 4233 4234 // Check the lengths 4235 movl(limit, Address(ary1, length_offset)); 4236 cmpl(limit, Address(ary2, length_offset)); 4237 jcc(Assembler::notEqual, FALSE_LABEL); 4238 } 4239 4240 // count == 0 4241 testl(limit, limit); 4242 jcc(Assembler::zero, TRUE_LABEL); 4243 4244 if (is_array_equ) { 4245 // Load array address 4246 lea(ary1, Address(ary1, base_offset)); 4247 lea(ary2, Address(ary2, base_offset)); 4248 } 4249 4250 if (is_array_equ && is_char) { 4251 // arrays_equals when used for char[]. 
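// char[] elements are two bytes each, so the element count is converted to a
// byte count up front; e.g. a 10-char array is compared as 20 bytes by the
// vector loops below (illustrative arithmetic).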
4252 shll(limit, 1); // byte count != 0 4253 } 4254 movl(result, limit); // copy 4255 4256 if (UseAVX >= 2) { 4257 // With AVX2, use 32-byte vector compare 4258 Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16; 4259 4260 // Compare 32-byte vectors 4261 if (expand_ary2) { 4262 andl(result, 0x0000000f); // tail count (in bytes) 4263 andl(limit, 0xfffffff0); // vector count (in bytes) 4264 jcc(Assembler::zero, COMPARE_TAIL); 4265 } else { 4266 andl(result, 0x0000001f); // tail count (in bytes) 4267 andl(limit, 0xffffffe0); // vector count (in bytes) 4268 jcc(Assembler::zero, COMPARE_TAIL_16); 4269 } 4270 4271 lea(ary1, Address(ary1, limit, scaleFactor)); 4272 lea(ary2, Address(ary2, limit, Address::times_1)); 4273 negptr(limit); 4274 4275 #ifdef _LP64 4276 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 4277 Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3; 4278 4279 cmpl(limit, -64); 4280 jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2); 4281 4282 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 4283 4284 evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit); 4285 evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit); 4286 kortestql(mask, mask); 4287 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4288 addptr(limit, 64); // update since we already compared at this addr 4289 cmpl(limit, -64); 4290 jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3); 4291 4292 // At this point we may still need to compare -limit+result bytes. 4293 // We could execute the next two instruction and just continue via non-wide path: 4294 // cmpl(limit, 0); 4295 // jcc(Assembler::equal, COMPARE_TAIL); // true 4296 // But since we stopped at the points ary{1,2}+limit which are 4297 // not farther than 64 bytes from the ends of arrays ary{1,2}+result 4298 // (|limit| <= 32 and result < 32), 4299 // we may just compare the last 64 bytes. 
4300 // 4301 addptr(result, -64); // it is safe, bc we just came from this area 4302 evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit); 4303 evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit); 4304 kortestql(mask, mask); 4305 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4306 4307 jmp(TRUE_LABEL); 4308 4309 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 4310 4311 }//if (VM_Version::supports_avx512vlbw()) 4312 #endif //_LP64 4313 bind(COMPARE_WIDE_VECTORS); 4314 vmovdqu(vec1, Address(ary1, limit, scaleFactor)); 4315 if (expand_ary2) { 4316 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit); 4317 } else { 4318 vmovdqu(vec2, Address(ary2, limit, Address::times_1)); 4319 } 4320 vpxor(vec1, vec2); 4321 4322 vptest(vec1, vec1); 4323 jcc(Assembler::notZero, FALSE_LABEL); 4324 addptr(limit, scaleIncr * 2); 4325 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4326 4327 testl(result, result); 4328 jcc(Assembler::zero, TRUE_LABEL); 4329 4330 vmovdqu(vec1, Address(ary1, result, scaleFactor, -32)); 4331 if (expand_ary2) { 4332 vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit); 4333 } else { 4334 vmovdqu(vec2, Address(ary2, result, Address::times_1, -32)); 4335 } 4336 vpxor(vec1, vec2); 4337 4338 vptest(vec1, vec1); 4339 jcc(Assembler::notZero, FALSE_LABEL); 4340 jmp(TRUE_LABEL); 4341 4342 bind(COMPARE_TAIL_16); // limit is zero 4343 movl(limit, result); 4344 4345 // Compare 16-byte chunks 4346 andl(result, 0x0000000f); // tail count (in bytes) 4347 andl(limit, 0xfffffff0); // vector count (in bytes) 4348 jcc(Assembler::zero, COMPARE_TAIL); 4349 4350 lea(ary1, Address(ary1, limit, scaleFactor)); 4351 lea(ary2, Address(ary2, limit, Address::times_1)); 4352 negptr(limit); 4353 4354 bind(COMPARE_WIDE_VECTORS_16); 4355 movdqu(vec1, Address(ary1, limit, scaleFactor)); 4356 if (expand_ary2) { 4357 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit); 4358 } else { 4359 movdqu(vec2, Address(ary2, limit, Address::times_1)); 4360 } 4361 pxor(vec1, vec2); 4362 4363 ptest(vec1, vec1); 4364 jcc(Assembler::notZero, FALSE_LABEL); 4365 addptr(limit, scaleIncr); 4366 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16); 4367 4368 bind(COMPARE_TAIL); // limit is zero 4369 movl(limit, result); 4370 // Fallthru to tail compare 4371 } else if (UseSSE42Intrinsics) { 4372 // With SSE4.2, use double quad vector compare 4373 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 4374 4375 // Compare 16-byte vectors 4376 andl(result, 0x0000000f); // tail count (in bytes) 4377 andl(limit, 0xfffffff0); // vector count (in bytes) 4378 jcc(Assembler::zero, COMPARE_TAIL); 4379 4380 lea(ary1, Address(ary1, limit, Address::times_1)); 4381 lea(ary2, Address(ary2, limit, Address::times_1)); 4382 negptr(limit); 4383 4384 bind(COMPARE_WIDE_VECTORS); 4385 movdqu(vec1, Address(ary1, limit, Address::times_1)); 4386 movdqu(vec2, Address(ary2, limit, Address::times_1)); 4387 pxor(vec1, vec2); 4388 4389 ptest(vec1, vec1); 4390 jcc(Assembler::notZero, FALSE_LABEL); 4391 addptr(limit, 16); 4392 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4393 4394 testl(result, result); 4395 jcc(Assembler::zero, TRUE_LABEL); 4396 4397 movdqu(vec1, Address(ary1, result, Address::times_1, -16)); 4398 movdqu(vec2, Address(ary2, result, Address::times_1, -16)); 4399 pxor(vec1, vec2); 4400 4401 ptest(vec1, vec1); 4402 jccb(Assembler::notZero, FALSE_LABEL); 4403 jmpb(TRUE_LABEL); 4404 4405 bind(COMPARE_TAIL); // limit is zero 4406 
movl(limit, result); 4407 // Fallthru to tail compare 4408 } 4409 4410 // Compare 4-byte vectors 4411 if (expand_ary2) { 4412 testl(result, result); 4413 jccb(Assembler::zero, TRUE_LABEL); 4414 } else { 4415 andl(limit, 0xfffffffc); // vector count (in bytes) 4416 jccb(Assembler::zero, COMPARE_CHAR); 4417 } 4418 4419 lea(ary1, Address(ary1, limit, scaleFactor)); 4420 lea(ary2, Address(ary2, limit, Address::times_1)); 4421 negptr(limit); 4422 4423 bind(COMPARE_VECTORS); 4424 if (expand_ary2) { 4425 // There are no "vector" operations for bytes to shorts 4426 movzbl(chr, Address(ary2, limit, Address::times_1)); 4427 cmpw(Address(ary1, limit, Address::times_2), chr); 4428 jccb(Assembler::notEqual, FALSE_LABEL); 4429 addptr(limit, 1); 4430 jcc(Assembler::notZero, COMPARE_VECTORS); 4431 jmp(TRUE_LABEL); 4432 } else { 4433 movl(chr, Address(ary1, limit, Address::times_1)); 4434 cmpl(chr, Address(ary2, limit, Address::times_1)); 4435 jccb(Assembler::notEqual, FALSE_LABEL); 4436 addptr(limit, 4); 4437 jcc(Assembler::notZero, COMPARE_VECTORS); 4438 } 4439 4440 // Compare trailing char (final 2 bytes), if any 4441 bind(COMPARE_CHAR); 4442 testl(result, 0x2); // tail char 4443 jccb(Assembler::zero, COMPARE_BYTE); 4444 load_unsigned_short(chr, Address(ary1, 0)); 4445 load_unsigned_short(limit, Address(ary2, 0)); 4446 cmpl(chr, limit); 4447 jccb(Assembler::notEqual, FALSE_LABEL); 4448 4449 if (is_array_equ && is_char) { 4450 bind(COMPARE_BYTE); 4451 } else { 4452 lea(ary1, Address(ary1, 2)); 4453 lea(ary2, Address(ary2, 2)); 4454 4455 bind(COMPARE_BYTE); 4456 testl(result, 0x1); // tail byte 4457 jccb(Assembler::zero, TRUE_LABEL); 4458 load_unsigned_byte(chr, Address(ary1, 0)); 4459 load_unsigned_byte(limit, Address(ary2, 0)); 4460 cmpl(chr, limit); 4461 jccb(Assembler::notEqual, FALSE_LABEL); 4462 } 4463 bind(TRUE_LABEL); 4464 movl(result, 1); // return true 4465 jmpb(DONE); 4466 4467 bind(FALSE_LABEL); 4468 xorl(result, result); // return false 4469 4470 // That's it 4471 bind(DONE); 4472 if (UseAVX >= 2) { 4473 // clean upper bits of YMM registers 4474 vpxor(vec1, vec1); 4475 vpxor(vec2, vec2); 4476 } 4477 } 4478 4479 #ifdef _LP64 4480 4481 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) { 4482 #define __ masm. 
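// Out-of-line slow path for convertF2I below. cvttss2si/cvttsd2si return the
// "integer indefinite" value (0x80000000 / 0x8000000000000000L) for NaN and
// out-of-range inputs, so the fast path branches here when it sees that value
// and the fixup stub re-establishes the Java narrowing semantics, roughly
// (illustrative sketch, not the stub's actual source):
//
//   static int f2i(float f) {
//     if (Float.isNaN(f)) return 0;
//     if (f <= Integer.MIN_VALUE) return Integer.MIN_VALUE;   // incl. -Inf
//     if (f >= Integer.MAX_VALUE) return Integer.MAX_VALUE;   // incl. +Inf
//     return (int) f;                  // in range: truncate toward zero
//   }
//
// The argument is passed to the stub on the stack and the fixed-up result is
// popped back into dst.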
4483 Register dst = stub.data<0>(); 4484 XMMRegister src = stub.data<1>(); 4485 address target = stub.data<2>(); 4486 __ bind(stub.entry()); 4487 __ subptr(rsp, 8); 4488 __ movdbl(Address(rsp), src); 4489 __ call(RuntimeAddress(target)); 4490 __ pop(dst); 4491 __ jmp(stub.continuation()); 4492 #undef __ 4493 } 4494 4495 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) { 4496 assert(dst_bt == T_INT || dst_bt == T_LONG, ""); 4497 assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, ""); 4498 4499 address slowpath_target; 4500 if (dst_bt == T_INT) { 4501 if (src_bt == T_FLOAT) { 4502 cvttss2sil(dst, src); 4503 cmpl(dst, 0x80000000); 4504 slowpath_target = StubRoutines::x86::f2i_fixup(); 4505 } else { 4506 cvttsd2sil(dst, src); 4507 cmpl(dst, 0x80000000); 4508 slowpath_target = StubRoutines::x86::d2i_fixup(); 4509 } 4510 } else { 4511 if (src_bt == T_FLOAT) { 4512 cvttss2siq(dst, src); 4513 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4514 slowpath_target = StubRoutines::x86::f2l_fixup(); 4515 } else { 4516 cvttsd2siq(dst, src); 4517 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4518 slowpath_target = StubRoutines::x86::d2l_fixup(); 4519 } 4520 } 4521 4522 auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, 23, convertF2I_slowpath); 4523 jcc(Assembler::equal, stub->entry()); 4524 bind(stub->continuation()); 4525 } 4526 4527 #endif // _LP64 4528 4529 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4530 XMMRegister src1, int imm8, bool merge, int vlen_enc) { 4531 switch(ideal_opc) { 4532 case Op_LShiftVS: 4533 Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break; 4534 case Op_LShiftVI: 4535 Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break; 4536 case Op_LShiftVL: 4537 Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break; 4538 case Op_RShiftVS: 4539 Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break; 4540 case Op_RShiftVI: 4541 Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break; 4542 case Op_RShiftVL: 4543 Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break; 4544 case Op_URShiftVS: 4545 Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break; 4546 case Op_URShiftVI: 4547 Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break; 4548 case Op_URShiftVL: 4549 Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break; 4550 case Op_RotateRightV: 4551 evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4552 case Op_RotateLeftV: 4553 evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4554 default: 4555 fatal("Unsupported masked operation"); break; 4556 } 4557 } 4558 4559 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4560 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc, 4561 bool is_varshift) { 4562 switch (ideal_opc) { 4563 case Op_AddVB: 4564 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4565 case Op_AddVS: 4566 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4567 case Op_AddVI: 4568 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4569 case Op_AddVL: 4570 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4571 case Op_AddVF: 4572 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4573 case Op_AddVD: 4574 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4575 case Op_SubVB: 4576 
evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4577 case Op_SubVS: 4578 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4579 case Op_SubVI: 4580 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4581 case Op_SubVL: 4582 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4583 case Op_SubVF: 4584 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4585 case Op_SubVD: 4586 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4587 case Op_MulVS: 4588 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4589 case Op_MulVI: 4590 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4591 case Op_MulVL: 4592 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4593 case Op_MulVF: 4594 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4595 case Op_MulVD: 4596 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4597 case Op_DivVF: 4598 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4599 case Op_DivVD: 4600 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4601 case Op_SqrtVF: 4602 evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break; 4603 case Op_SqrtVD: 4604 evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break; 4605 case Op_AbsVB: 4606 evpabsb(dst, mask, src2, merge, vlen_enc); break; 4607 case Op_AbsVS: 4608 evpabsw(dst, mask, src2, merge, vlen_enc); break; 4609 case Op_AbsVI: 4610 evpabsd(dst, mask, src2, merge, vlen_enc); break; 4611 case Op_AbsVL: 4612 evpabsq(dst, mask, src2, merge, vlen_enc); break; 4613 case Op_FmaVF: 4614 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4615 case Op_FmaVD: 4616 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4617 case Op_VectorRearrange: 4618 evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break; 4619 case Op_LShiftVS: 4620 evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4621 case Op_LShiftVI: 4622 evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4623 case Op_LShiftVL: 4624 evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4625 case Op_RShiftVS: 4626 evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4627 case Op_RShiftVI: 4628 evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4629 case Op_RShiftVL: 4630 evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4631 case Op_URShiftVS: 4632 evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4633 case Op_URShiftVI: 4634 evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4635 case Op_URShiftVL: 4636 evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4637 case Op_RotateLeftV: 4638 evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4639 case Op_RotateRightV: 4640 evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4641 case Op_MaxV: 4642 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4643 case Op_MinV: 4644 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4645 case Op_XorV: 4646 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4647 case Op_OrV: 4648 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4649 case Op_AndV: 4650 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4651 default: 4652 fatal("Unsupported masked operation"); break; 4653 } 4654 } 4655 4656 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4657 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4658 switch (ideal_opc) { 4659 case Op_AddVB: 
4660 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4661 case Op_AddVS: 4662 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4663 case Op_AddVI: 4664 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4665 case Op_AddVL: 4666 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4667 case Op_AddVF: 4668 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4669 case Op_AddVD: 4670 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4671 case Op_SubVB: 4672 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4673 case Op_SubVS: 4674 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4675 case Op_SubVI: 4676 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4677 case Op_SubVL: 4678 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4679 case Op_SubVF: 4680 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4681 case Op_SubVD: 4682 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4683 case Op_MulVS: 4684 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4685 case Op_MulVI: 4686 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4687 case Op_MulVL: 4688 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4689 case Op_MulVF: 4690 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4691 case Op_MulVD: 4692 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4693 case Op_DivVF: 4694 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4695 case Op_DivVD: 4696 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4697 case Op_FmaVF: 4698 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4699 case Op_FmaVD: 4700 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4701 case Op_MaxV: 4702 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4703 case Op_MinV: 4704 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4705 case Op_XorV: 4706 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4707 case Op_OrV: 4708 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4709 case Op_AndV: 4710 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4711 default: 4712 fatal("Unsupported masked operation"); break; 4713 } 4714 } 4715 4716 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst, 4717 KRegister src1, KRegister src2) { 4718 BasicType etype = T_ILLEGAL; 4719 switch(mask_len) { 4720 case 2: 4721 case 4: 4722 case 8: etype = T_BYTE; break; 4723 case 16: etype = T_SHORT; break; 4724 case 32: etype = T_INT; break; 4725 case 64: etype = T_LONG; break; 4726 default: fatal("Unsupported type"); break; 4727 } 4728 assert(etype != T_ILLEGAL, ""); 4729 switch(ideal_opc) { 4730 case Op_AndVMask: 4731 kand(etype, dst, src1, src2); break; 4732 case Op_OrVMask: 4733 kor(etype, dst, src1, src2); break; 4734 case Op_XorVMask: 4735 kxor(etype, dst, src1, src2); break; 4736 default: 4737 fatal("Unsupported masked operation"); break; 4738 } 4739 } 4740 4741 /* 4742 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 4743 * If src is NaN, the result is 0. 4744 * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE, 4745 * the result is equal to the value of Integer.MIN_VALUE. 4746 * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE, 4747 * the result is equal to the value of Integer.MAX_VALUE. 
4748 */ 4749 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4750 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 4751 Register rscratch, AddressLiteral float_sign_flip, 4752 int vec_enc) { 4753 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4754 Label done; 4755 vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch); 4756 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 4757 vptest(xtmp2, xtmp2, vec_enc); 4758 jccb(Assembler::equal, done); 4759 4760 vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc); 4761 vpxor(xtmp1, xtmp1, xtmp4, vec_enc); 4762 4763 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 4764 vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc); 4765 vblendvps(dst, dst, xtmp4, xtmp3, vec_enc); 4766 4767 // Recompute the mask for remaining special value. 4768 vpxor(xtmp2, xtmp2, xtmp3, vec_enc); 4769 // Extract SRC values corresponding to TRUE mask lanes. 4770 vpand(xtmp4, xtmp2, src, vec_enc); 4771 // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special 4772 // values are set. 4773 vpxor(xtmp3, xtmp2, xtmp4, vec_enc); 4774 4775 vblendvps(dst, dst, xtmp1, xtmp3, vec_enc); 4776 bind(done); 4777 } 4778 4779 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4780 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4781 Register rscratch, AddressLiteral float_sign_flip, 4782 int vec_enc) { 4783 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4784 Label done; 4785 evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 4786 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 4787 kortestwl(ktmp1, ktmp1); 4788 jccb(Assembler::equal, done); 4789 4790 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4791 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4792 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 4793 4794 kxorwl(ktmp1, ktmp1, ktmp2); 4795 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4796 vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4797 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 4798 bind(done); 4799 } 4800 4801 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4802 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4803 Register rscratch, AddressLiteral double_sign_flip, 4804 int vec_enc) { 4805 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 4806 4807 Label done; 4808 evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 4809 Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc); 4810 kortestwl(ktmp1, ktmp1); 4811 jccb(Assembler::equal, done); 4812 4813 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4814 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4815 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 4816 4817 kxorwl(ktmp1, ktmp1, ktmp2); 4818 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4819 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4820 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); 4821 bind(done); 4822 } 4823 4824 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4825 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4826 Register rscratch, AddressLiteral float_sign_flip, 4827 int vec_enc) { 4828 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4829 Label done; 4830 evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, 
rscratch); 4831 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 4832 kortestwl(ktmp1, ktmp1); 4833 jccb(Assembler::equal, done); 4834 4835 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4836 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4837 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 4838 4839 kxorwl(ktmp1, ktmp1, ktmp2); 4840 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4841 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4842 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 4843 bind(done); 4844 } 4845 4846 /* 4847 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 4848 * If src is NaN, the result is 0. 4849 * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE, 4850 * the result is equal to the value of Long.MIN_VALUE. 4851 * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE, 4852 * the result is equal to the value of Long.MAX_VALUE. 4853 */ 4854 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4855 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4856 Register rscratch, AddressLiteral double_sign_flip, 4857 int vec_enc) { 4858 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 4859 4860 Label done; 4861 evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 4862 evpcmpeqq(ktmp1, xtmp1, dst, vec_enc); 4863 kortestwl(ktmp1, ktmp1); 4864 jccb(Assembler::equal, done); 4865 4866 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4867 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4868 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 4869 4870 kxorwl(ktmp1, ktmp1, ktmp2); 4871 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4872 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4873 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); 4874 bind(done); 4875 } 4876 4877 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero, 4878 XMMRegister xtmp, int index, int vec_enc) { 4879 assert(vec_enc < Assembler::AVX_512bit, ""); 4880 if (vec_enc == Assembler::AVX_256bit) { 4881 vextractf128_high(xtmp, src); 4882 vshufps(dst, src, xtmp, index, vec_enc); 4883 } else { 4884 vshufps(dst, src, zero, index, vec_enc); 4885 } 4886 } 4887 4888 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 4889 XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch, 4890 AddressLiteral float_sign_flip, int src_vec_enc) { 4891 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4892 4893 Label done; 4894 // Compare the destination lanes with float_sign_flip 4895 // value to get mask for all special values. 4896 movdqu(xtmp1, float_sign_flip, rscratch); 4897 vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit); 4898 ptest(xtmp2, xtmp2); 4899 jccb(Assembler::equal, done); 4900 4901 // Flip float_sign_flip to get max integer value. 4902 vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit); 4903 pxor(xtmp1, xtmp4); 4904 4905 // Set destination lanes corresponding to unordered source lanes to zero. 4906 vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc); 4907 vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc); 4908 4909 // Shuffle mask vector and pack the lower doubleword from each quadword lane.
4910 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc); 4911 vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit); 4912 4913 // Recompute the mask for remaining special value. 4914 pxor(xtmp2, xtmp3); 4915 // Extract mask corresponding to non-negative source lanes. 4916 vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc); 4917 4918 // Shuffle mask vector and pack the lower doubleword from each quadword lane. 4919 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc); 4920 pand(xtmp3, xtmp2); 4921 4922 // Replace destination lanes holding the special value (0x80000000) with max int 4923 // if the corresponding source lane holds a +ve value. 4924 vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit); 4925 bind(done); 4926 } 4927 4928 4929 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero, 4930 XMMRegister xtmp, Register rscratch, int vec_enc) { 4931 switch(to_elem_bt) { 4932 case T_SHORT: 4933 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing"); 4934 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch); 4935 vpackusdw(dst, dst, zero, vec_enc); 4936 if (vec_enc == Assembler::AVX_256bit) { 4937 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc); 4938 } 4939 break; 4940 case T_BYTE: 4941 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing"); 4942 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch); 4943 vpackusdw(dst, dst, zero, vec_enc); 4944 if (vec_enc == Assembler::AVX_256bit) { 4945 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc); 4946 } 4947 vpackuswb(dst, dst, zero, vec_enc); 4948 break; 4949 default: assert(false, "%s", type2name(to_elem_bt)); 4950 } 4951 } 4952 4953 /* 4954 * Algorithm for vector D2L and F2I conversions:- 4955 * a) Perform vector D2L/F2I cast. 4956 * b) Choose fast path if none of the result vector lanes contains the 0x80000000 value. 4957 * A 0x80000000 lane signifies that the source value could be any of the special floating point 4958 * values (NaN, -Inf, Inf, Max, -Min). 4959 * c) Set destination to zero if the source is a NaN value. 4960 * d) Replace 0x80000000 with MaxInt if the source lane contains a +ve value.
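 *
 * A scalar sketch of steps a)-d) for the F2I case (illustrative only; hw_cvttss2si is a
 * hypothetical stand-in for the raw x86 conversion, which yields 0x80000000 for NaN and
 * out-of-range inputs):
 *
 *   int32_t f2i(float f) {
 *     int32_t r = hw_cvttss2si(f);                 // a) raw cast
 *     if (r != (int32_t) 0x80000000) return r;     // b) fast path, no special value produced
 *     if (f != f)                    return 0;     // c) NaN -> 0
 *     return (f > 0.0f) ? INT32_MAX : INT32_MIN;   // d) +ve special value -> MaxInt
 *   }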
4961 */ 4962 4963 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4964 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 4965 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 4966 int to_elem_sz = type2aelembytes(to_elem_bt); 4967 assert(to_elem_sz <= 4, ""); 4968 vcvttps2dq(dst, src, vec_enc); 4969 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc); 4970 if (to_elem_sz < 4) { 4971 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 4972 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc); 4973 } 4974 } 4975 4976 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4977 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip, 4978 Register rscratch, int vec_enc) { 4979 int to_elem_sz = type2aelembytes(to_elem_bt); 4980 assert(to_elem_sz <= 4, ""); 4981 vcvttps2dq(dst, src, vec_enc); 4982 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc); 4983 switch(to_elem_bt) { 4984 case T_INT: 4985 break; 4986 case T_SHORT: 4987 evpmovdw(dst, dst, vec_enc); 4988 break; 4989 case T_BYTE: 4990 evpmovdb(dst, dst, vec_enc); 4991 break; 4992 default: assert(false, "%s", type2name(to_elem_bt)); 4993 } 4994 } 4995 4996 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 4997 KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip, 4998 Register rscratch, int vec_enc) { 4999 evcvttps2qq(dst, src, vec_enc); 5000 vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc); 5001 } 5002 5003 // Handling for downcasting from double to integer or sub-word types on AVX2. 5004 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5005 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, 5006 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 5007 int to_elem_sz = type2aelembytes(to_elem_bt); 5008 assert(to_elem_sz < 8, ""); 5009 vcvttpd2dq(dst, src, vec_enc); 5010 vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch, 5011 float_sign_flip, vec_enc); 5012 if (to_elem_sz < 4) { 5013 // xtmp4 holds all zero lanes. 
5014 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit); 5015 } 5016 } 5017 5018 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, 5019 XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, 5020 KRegister ktmp2, AddressLiteral sign_flip, 5021 Register rscratch, int vec_enc) { 5022 if (VM_Version::supports_avx512dq()) { 5023 evcvttpd2qq(dst, src, vec_enc); 5024 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 5025 switch(to_elem_bt) { 5026 case T_LONG: 5027 break; 5028 case T_INT: 5029 evpmovsqd(dst, dst, vec_enc); 5030 break; 5031 case T_SHORT: 5032 evpmovsqd(dst, dst, vec_enc); 5033 evpmovdw(dst, dst, vec_enc); 5034 break; 5035 case T_BYTE: 5036 evpmovsqd(dst, dst, vec_enc); 5037 evpmovdb(dst, dst, vec_enc); 5038 break; 5039 default: assert(false, "%s", type2name(to_elem_bt)); 5040 } 5041 } else { 5042 assert(type2aelembytes(to_elem_bt) <= 4, ""); 5043 vcvttpd2dq(dst, src, vec_enc); 5044 vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 5045 switch(to_elem_bt) { 5046 case T_INT: 5047 break; 5048 case T_SHORT: 5049 evpmovdw(dst, dst, vec_enc); 5050 break; 5051 case T_BYTE: 5052 evpmovdb(dst, dst, vec_enc); 5053 break; 5054 default: assert(false, "%s", type2name(to_elem_bt)); 5055 } 5056 } 5057 } 5058 5059 #ifdef _LP64 5060 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src, 5061 AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5062 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 5063 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5064 // and re-instantiate original MXCSR.RC mode after that. 5065 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5066 5067 mov64(tmp, julong_cast(0.5L)); 5068 evpbroadcastq(xtmp1, tmp, vec_enc); 5069 vaddpd(xtmp1, src , xtmp1, vec_enc); 5070 evcvtpd2qq(dst, xtmp1, vec_enc); 5071 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 5072 double_sign_flip, vec_enc);; 5073 5074 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5075 } 5076 5077 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src, 5078 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5079 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 5080 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5081 // and re-instantiate original MXCSR.RC mode after that. 
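  // For example, under round-toward--inf: 2.5f + 0.5f = 3.0f converts to 3, and -2.5f + 0.5f = -2.0f
  // converts to -2, matching Math.round()'s half-up semantics. (Illustrative note only.)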
5082 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5083 5084 movl(tmp, jint_cast(0.5)); 5085 movq(xtmp1, tmp); 5086 vbroadcastss(xtmp1, xtmp1, vec_enc); 5087 vaddps(xtmp1, src , xtmp1, vec_enc); 5088 vcvtps2dq(dst, xtmp1, vec_enc); 5089 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 5090 float_sign_flip, vec_enc); 5091 5092 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5093 } 5094 5095 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src, 5096 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5097 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) { 5098 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5099 // and re-instantiate original MXCSR.RC mode after that. 5100 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5101 5102 movl(tmp, jint_cast(0.5)); 5103 movq(xtmp1, tmp); 5104 vbroadcastss(xtmp1, xtmp1, vec_enc); 5105 vaddps(xtmp1, src , xtmp1, vec_enc); 5106 vcvtps2dq(dst, xtmp1, vec_enc); 5107 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc); 5108 5109 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5110 } 5111 #endif // _LP64 5112 5113 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 5114 BasicType from_elem_bt, BasicType to_elem_bt) { 5115 switch (from_elem_bt) { 5116 case T_BYTE: 5117 switch (to_elem_bt) { 5118 case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break; 5119 case T_INT: vpmovzxbd(dst, src, vlen_enc); break; 5120 case T_LONG: vpmovzxbq(dst, src, vlen_enc); break; 5121 default: ShouldNotReachHere(); 5122 } 5123 break; 5124 case T_SHORT: 5125 switch (to_elem_bt) { 5126 case T_INT: vpmovzxwd(dst, src, vlen_enc); break; 5127 case T_LONG: vpmovzxwq(dst, src, vlen_enc); break; 5128 default: ShouldNotReachHere(); 5129 } 5130 break; 5131 case T_INT: 5132 assert(to_elem_bt == T_LONG, ""); 5133 vpmovzxdq(dst, src, vlen_enc); 5134 break; 5135 default: 5136 ShouldNotReachHere(); 5137 } 5138 } 5139 5140 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 5141 BasicType from_elem_bt, BasicType to_elem_bt) { 5142 switch (from_elem_bt) { 5143 case T_BYTE: 5144 switch (to_elem_bt) { 5145 case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break; 5146 case T_INT: vpmovsxbd(dst, src, vlen_enc); break; 5147 case T_LONG: vpmovsxbq(dst, src, vlen_enc); break; 5148 default: ShouldNotReachHere(); 5149 } 5150 break; 5151 case T_SHORT: 5152 switch (to_elem_bt) { 5153 case T_INT: vpmovsxwd(dst, src, vlen_enc); break; 5154 case T_LONG: vpmovsxwq(dst, src, vlen_enc); break; 5155 default: ShouldNotReachHere(); 5156 } 5157 break; 5158 case T_INT: 5159 assert(to_elem_bt == T_LONG, ""); 5160 vpmovsxdq(dst, src, vlen_enc); 5161 break; 5162 default: 5163 ShouldNotReachHere(); 5164 } 5165 } 5166 5167 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src, 5168 BasicType dst_bt, BasicType src_bt, int vlen) { 5169 int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen); 5170 assert(vlen_enc != AVX_512bit, ""); 5171 5172 int dst_bt_size = type2aelembytes(dst_bt); 5173 int src_bt_size = type2aelembytes(src_bt); 5174 if (dst_bt_size > src_bt_size) { 5175 switch (dst_bt_size / src_bt_size) { 5176 case 2: vpmovsxbw(dst, src, vlen_enc); break; 5177 case 4: vpmovsxbd(dst, src, 
vlen_enc); break; 5178 case 8: vpmovsxbq(dst, src, vlen_enc); break; 5179 default: ShouldNotReachHere(); 5180 } 5181 } else { 5182 assert(dst_bt_size < src_bt_size, ""); 5183 switch (src_bt_size / dst_bt_size) { 5184 case 2: { 5185 if (vlen_enc == AVX_128bit) { 5186 vpacksswb(dst, src, src, vlen_enc); 5187 } else { 5188 vpacksswb(dst, src, src, vlen_enc); 5189 vpermq(dst, dst, 0x08, vlen_enc); 5190 } 5191 break; 5192 } 5193 case 4: { 5194 if (vlen_enc == AVX_128bit) { 5195 vpackssdw(dst, src, src, vlen_enc); 5196 vpacksswb(dst, dst, dst, vlen_enc); 5197 } else { 5198 vpackssdw(dst, src, src, vlen_enc); 5199 vpermq(dst, dst, 0x08, vlen_enc); 5200 vpacksswb(dst, dst, dst, AVX_128bit); 5201 } 5202 break; 5203 } 5204 case 8: { 5205 if (vlen_enc == AVX_128bit) { 5206 vpshufd(dst, src, 0x08, vlen_enc); 5207 vpackssdw(dst, dst, dst, vlen_enc); 5208 vpacksswb(dst, dst, dst, vlen_enc); 5209 } else { 5210 vpshufd(dst, src, 0x08, vlen_enc); 5211 vpermq(dst, dst, 0x08, vlen_enc); 5212 vpackssdw(dst, dst, dst, AVX_128bit); 5213 vpacksswb(dst, dst, dst, AVX_128bit); 5214 } 5215 break; 5216 } 5217 default: ShouldNotReachHere(); 5218 } 5219 } 5220 } 5221 5222 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3, 5223 bool merge, BasicType bt, int vlen_enc) { 5224 if (bt == T_INT) { 5225 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5226 } else { 5227 assert(bt == T_LONG, ""); 5228 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5229 } 5230 } 5231 5232 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3, 5233 bool merge, BasicType bt, int vlen_enc) { 5234 if (bt == T_INT) { 5235 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5236 } else { 5237 assert(bt == T_LONG, ""); 5238 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5239 } 5240 } 5241 5242 #ifdef _LP64 5243 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1, 5244 Register rtmp2, XMMRegister xtmp, int mask_len, 5245 int vec_enc) { 5246 int index = 0; 5247 int vindex = 0; 5248 mov64(rtmp1, 0x0101010101010101L); 5249 pdepq(rtmp1, src, rtmp1); 5250 if (mask_len > 8) { 5251 movq(rtmp2, src); 5252 vpxor(xtmp, xtmp, xtmp, vec_enc); 5253 movq(xtmp, rtmp1); 5254 } 5255 movq(dst, rtmp1); 5256 5257 mask_len -= 8; 5258 while (mask_len > 0) { 5259 assert ((mask_len & 0x7) == 0, "mask must be multiple of 8"); 5260 index++; 5261 if ((index % 2) == 0) { 5262 pxor(xtmp, xtmp); 5263 } 5264 mov64(rtmp1, 0x0101010101010101L); 5265 shrq(rtmp2, 8); 5266 pdepq(rtmp1, rtmp2, rtmp1); 5267 pinsrq(xtmp, rtmp1, index % 2); 5268 vindex = index / 2; 5269 if (vindex) { 5270 // Write entire 16 byte vector when both 64 bit 5271 // lanes are update to save redundant instructions. 
5272 if (index % 2) { 5273 vinsertf128(dst, dst, xtmp, vindex); 5274 } 5275 } else { 5276 vmovdqu(dst, xtmp); 5277 } 5278 mask_len -= 8; 5279 } 5280 } 5281 5282 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) { 5283 switch(opc) { 5284 case Op_VectorMaskTrueCount: 5285 popcntq(dst, tmp); 5286 break; 5287 case Op_VectorMaskLastTrue: 5288 if (VM_Version::supports_lzcnt()) { 5289 lzcntq(tmp, tmp); 5290 movl(dst, 63); 5291 subl(dst, tmp); 5292 } else { 5293 movl(dst, -1); 5294 bsrq(tmp, tmp); 5295 cmov32(Assembler::notZero, dst, tmp); 5296 } 5297 break; 5298 case Op_VectorMaskFirstTrue: 5299 if (VM_Version::supports_bmi1()) { 5300 if (masklen < 32) { 5301 orl(tmp, 1 << masklen); 5302 tzcntl(dst, tmp); 5303 } else if (masklen == 32) { 5304 tzcntl(dst, tmp); 5305 } else { 5306 assert(masklen == 64, ""); 5307 tzcntq(dst, tmp); 5308 } 5309 } else { 5310 if (masklen < 32) { 5311 orl(tmp, 1 << masklen); 5312 bsfl(dst, tmp); 5313 } else { 5314 assert(masklen == 32 || masklen == 64, ""); 5315 movl(dst, masklen); 5316 if (masklen == 32) { 5317 bsfl(tmp, tmp); 5318 } else { 5319 bsfq(tmp, tmp); 5320 } 5321 cmov32(Assembler::notZero, dst, tmp); 5322 } 5323 } 5324 break; 5325 case Op_VectorMaskToLong: 5326 assert(dst == tmp, "Dst and tmp should be the same for toLong operations"); 5327 break; 5328 default: assert(false, "Unhandled mask operation"); 5329 } 5330 } 5331 5332 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp, 5333 int masklen, int masksize, int vec_enc) { 5334 assert(VM_Version::supports_popcnt(), ""); 5335 5336 if(VM_Version::supports_avx512bw()) { 5337 kmovql(tmp, mask); 5338 } else { 5339 assert(masklen <= 16, ""); 5340 kmovwl(tmp, mask); 5341 } 5342 5343 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5344 // operations needs to be clipped. 5345 if (masksize < 16 && opc != Op_VectorMaskFirstTrue) { 5346 andq(tmp, (1 << masklen) - 1); 5347 } 5348 5349 vector_mask_operation_helper(opc, dst, tmp, masklen); 5350 } 5351 5352 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp, 5353 Register tmp, int masklen, BasicType bt, int vec_enc) { 5354 assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) || 5355 (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), ""); 5356 assert(VM_Version::supports_popcnt(), ""); 5357 5358 bool need_clip = false; 5359 switch(bt) { 5360 case T_BOOLEAN: 5361 // While masks of other types contain 0, -1; boolean masks contain lane values of 0, 1 5362 vpxor(xtmp, xtmp, xtmp, vec_enc); 5363 vpsubb(xtmp, xtmp, mask, vec_enc); 5364 vpmovmskb(tmp, xtmp, vec_enc); 5365 need_clip = masklen < 16; 5366 break; 5367 case T_BYTE: 5368 vpmovmskb(tmp, mask, vec_enc); 5369 need_clip = masklen < 16; 5370 break; 5371 case T_SHORT: 5372 vpacksswb(xtmp, mask, mask, vec_enc); 5373 if (masklen >= 16) { 5374 vpermpd(xtmp, xtmp, 8, vec_enc); 5375 } 5376 vpmovmskb(tmp, xtmp, Assembler::AVX_128bit); 5377 need_clip = masklen < 16; 5378 break; 5379 case T_INT: 5380 case T_FLOAT: 5381 vmovmskps(tmp, mask, vec_enc); 5382 need_clip = masklen < 4; 5383 break; 5384 case T_LONG: 5385 case T_DOUBLE: 5386 vmovmskpd(tmp, mask, vec_enc); 5387 need_clip = masklen < 2; 5388 break; 5389 default: assert(false, "Unhandled type, %s", type2name(bt)); 5390 } 5391 5392 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5393 // operations needs to be clipped. 
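  // For example, an 8-lane T_BYTE mask is read via vpmovmskb from a 128-bit register and still
  // yields 16 bits; the upper 8 bits come from lanes beyond the logical vector length and may be
  // stale, so they are cleared below before the popcnt/bsf/lzcnt step.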
5394 if (need_clip && opc != Op_VectorMaskFirstTrue) { 5395 // need_clip implies masklen < 32 5396 andq(tmp, (1 << masklen) - 1); 5397 } 5398 5399 vector_mask_operation_helper(opc, dst, tmp, masklen); 5400 } 5401 5402 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1, 5403 Register rtmp2, int mask_len) { 5404 kmov(rtmp1, src); 5405 andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len))); 5406 mov64(rtmp2, -1L); 5407 pextq(rtmp2, rtmp2, rtmp1); 5408 kmov(dst, rtmp2); 5409 } 5410 5411 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src, 5412 XMMRegister mask, Register rtmp, Register rscratch, 5413 XMMRegister permv, XMMRegister xtmp, BasicType bt, 5414 int vec_enc) { 5415 assert(type2aelembytes(bt) >= 4, ""); 5416 assert(opcode == Op_CompressV || opcode == Op_ExpandV, ""); 5417 address compress_perm_table = nullptr; 5418 address expand_perm_table = nullptr; 5419 if (type2aelembytes(bt) == 8) { 5420 compress_perm_table = StubRoutines::x86::compress_perm_table64(); 5421 expand_perm_table = StubRoutines::x86::expand_perm_table64(); 5422 vmovmskpd(rtmp, mask, vec_enc); 5423 } else { 5424 compress_perm_table = StubRoutines::x86::compress_perm_table32(); 5425 expand_perm_table = StubRoutines::x86::expand_perm_table32(); 5426 vmovmskps(rtmp, mask, vec_enc); 5427 } 5428 shlq(rtmp, 5); // for 32 byte permute row. 5429 if (opcode == Op_CompressV) { 5430 lea(rscratch, ExternalAddress(compress_perm_table)); 5431 } else { 5432 lea(rscratch, ExternalAddress(expand_perm_table)); 5433 } 5434 addptr(rtmp, rscratch); 5435 vmovdqu(permv, Address(rtmp)); 5436 vpermps(dst, permv, src, Assembler::AVX_256bit); 5437 vpxor(xtmp, xtmp, xtmp, vec_enc); 5438 // Blend the result with zero vector using permute mask, each column entry 5439 // in a permute table row contains either a valid permute index or a -1 (default) 5440 // value, this can potentially be used as a blending mask after 5441 // compressing/expanding the source vector lanes. 
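  // For example, for 32-bit lanes under Op_CompressV with mask 0b1010, the selected row is
  // presumably {1, 3, -1, -1, -1, -1, -1, -1}: vpermps above gathers source lanes 1 and 3 into
  // positions 0 and 1, and the -1 entries (sign bit set) make the blend below zero the rest.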
5442 vblendvps(dst, dst, xtmp, permv, vec_enc, false, permv); 5443 } 5444 5445 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask, 5446 bool merge, BasicType bt, int vec_enc) { 5447 if (opcode == Op_CompressV) { 5448 switch(bt) { 5449 case T_BYTE: 5450 evpcompressb(dst, mask, src, merge, vec_enc); 5451 break; 5452 case T_CHAR: 5453 case T_SHORT: 5454 evpcompressw(dst, mask, src, merge, vec_enc); 5455 break; 5456 case T_INT: 5457 evpcompressd(dst, mask, src, merge, vec_enc); 5458 break; 5459 case T_FLOAT: 5460 evcompressps(dst, mask, src, merge, vec_enc); 5461 break; 5462 case T_LONG: 5463 evpcompressq(dst, mask, src, merge, vec_enc); 5464 break; 5465 case T_DOUBLE: 5466 evcompresspd(dst, mask, src, merge, vec_enc); 5467 break; 5468 default: 5469 fatal("Unsupported type %s", type2name(bt)); 5470 break; 5471 } 5472 } else { 5473 assert(opcode == Op_ExpandV, ""); 5474 switch(bt) { 5475 case T_BYTE: 5476 evpexpandb(dst, mask, src, merge, vec_enc); 5477 break; 5478 case T_CHAR: 5479 case T_SHORT: 5480 evpexpandw(dst, mask, src, merge, vec_enc); 5481 break; 5482 case T_INT: 5483 evpexpandd(dst, mask, src, merge, vec_enc); 5484 break; 5485 case T_FLOAT: 5486 evexpandps(dst, mask, src, merge, vec_enc); 5487 break; 5488 case T_LONG: 5489 evpexpandq(dst, mask, src, merge, vec_enc); 5490 break; 5491 case T_DOUBLE: 5492 evexpandpd(dst, mask, src, merge, vec_enc); 5493 break; 5494 default: 5495 fatal("Unsupported type %s", type2name(bt)); 5496 break; 5497 } 5498 } 5499 } 5500 #endif 5501 5502 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5503 KRegister ktmp1, int vec_enc) { 5504 if (opcode == Op_SignumVD) { 5505 vsubpd(dst, zero, one, vec_enc); 5506 // if src < 0 ? -1 : 1 5507 evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5508 evblendmpd(dst, ktmp1, one, dst, true, vec_enc); 5509 // if src == NaN, -0.0 or 0.0 return src. 5510 evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5511 evblendmpd(dst, ktmp1, dst, src, true, vec_enc); 5512 } else { 5513 assert(opcode == Op_SignumVF, ""); 5514 vsubps(dst, zero, one, vec_enc); 5515 // if src < 0 ? -1 : 1 5516 evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5517 evblendmps(dst, ktmp1, one, dst, true, vec_enc); 5518 // if src == NaN, -0.0 or 0.0 return src. 5519 evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5520 evblendmps(dst, ktmp1, dst, src, true, vec_enc); 5521 } 5522 } 5523 5524 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5525 XMMRegister xtmp1, int vec_enc) { 5526 if (opcode == Op_SignumVD) { 5527 vsubpd(dst, zero, one, vec_enc); 5528 // if src < 0 ? -1 : 1 5529 vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1); 5530 // if src == NaN, -0.0 or 0.0 return src. 5531 vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc); 5532 vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1); 5533 } else { 5534 assert(opcode == Op_SignumVF, ""); 5535 vsubps(dst, zero, one, vec_enc); 5536 // if src < 0 ? -1 : 1 5537 vblendvps(dst, one, dst, src, vec_enc, true, xtmp1); 5538 // if src == NaN, -0.0 or 0.0 return src. 
5539 vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc); 5540 vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1); 5541 } 5542 } 5543 5544 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) { 5545 if (VM_Version::supports_avx512bw()) { 5546 if (mask_len > 32) { 5547 kmovql(dst, src); 5548 } else { 5549 kmovdl(dst, src); 5550 if (mask_len != 32) { 5551 kshiftrdl(dst, dst, 32 - mask_len); 5552 } 5553 } 5554 } else { 5555 assert(mask_len <= 16, ""); 5556 kmovwl(dst, src); 5557 if (mask_len != 16) { 5558 kshiftrwl(dst, dst, 16 - mask_len); 5559 } 5560 } 5561 } 5562 5563 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) { 5564 int lane_size = type2aelembytes(bt); 5565 bool is_LP64 = LP64_ONLY(true) NOT_LP64(false); 5566 if ((is_LP64 || lane_size < 8) && 5567 ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) || 5568 (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) { 5569 movptr(rtmp, imm32); 5570 switch(lane_size) { 5571 case 1 : evpbroadcastb(dst, rtmp, vec_enc); break; 5572 case 2 : evpbroadcastw(dst, rtmp, vec_enc); break; 5573 case 4 : evpbroadcastd(dst, rtmp, vec_enc); break; 5574 case 8 : evpbroadcastq(dst, rtmp, vec_enc); break; 5575 default : fatal("Unsupported lane size %d", lane_size); 5576 break; 5577 } 5578 } else { 5579 movptr(rtmp, imm32); 5580 LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp)); 5581 switch(lane_size) { 5582 case 1 : vpbroadcastb(dst, dst, vec_enc); break; 5583 case 2 : vpbroadcastw(dst, dst, vec_enc); break; 5584 case 4 : vpbroadcastd(dst, dst, vec_enc); break; 5585 case 8 : vpbroadcastq(dst, dst, vec_enc); break; 5586 default : fatal("Unsupported lane size %d", lane_size); 5587 break; 5588 } 5589 } 5590 } 5591 5592 // 5593 // Following is lookup table based popcount computation algorithm:- 5594 // Index Bit set count 5595 // [ 0000 -> 0, 5596 // 0001 -> 1, 5597 // 0010 -> 1, 5598 // 0011 -> 2, 5599 // 0100 -> 1, 5600 // 0101 -> 2, 5601 // 0110 -> 2, 5602 // 0111 -> 3, 5603 // 1000 -> 1, 5604 // 1001 -> 2, 5605 // 1010 -> 2, 5606 // 1011 -> 3, 5607 // 1100 -> 2, 5608 // 1101 -> 3, 5609 // 1110 -> 3, 1111 -> 4 ] 5610 // a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as 5611 // shuffle indices for lookup table access. 5612 // b. Right shift each byte of vector lane by 4 positions. 5613 // c. Count the number of 1s in 4 MSB bits of each byte. These bits are used as 5614 // shuffle indices for lookup table access. 5615 // d. Add the bitset count of upper and lower 4 bits of each byte. 5616 // e. Unpack double words to quad words and compute sum of absolute differences of bitset 5617 // count of all the bytes of a quadword. 5618 // f. Perform step e. for upper 128bit vector lane. 5619 // g. Pack the bitset count of quadwords back to double words. 5620 // h. Unpacking and packing operations are not needed for 64bit vector lane.
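//
// For reference, a scalar sketch of steps a. to d. for a single byte (illustrative only; the
// vector code below performs the same two lookups per byte lane with vpshufb):
//
//   static const uint8_t POPC_LUT[16] = {0,1,1,2, 1,2,2,3, 1,2,2,3, 2,3,3,4};
//   uint8_t popcount_byte(uint8_t b) {
//     return POPC_LUT[b & 0x0F] + POPC_LUT[(b >> 4) & 0x0F];
//   }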
5621 5622 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5623 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5624 assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), ""); 5625 vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc); 5626 vpsrlw(dst, src, 4, vec_enc); 5627 vpand(dst, dst, xtmp1, vec_enc); 5628 vpand(xtmp1, src, xtmp1, vec_enc); 5629 vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg); 5630 vpshufb(xtmp1, xtmp2, xtmp1, vec_enc); 5631 vpshufb(dst, xtmp2, dst, vec_enc); 5632 vpaddb(dst, dst, xtmp1, vec_enc); 5633 } 5634 5635 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5636 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5637 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5638 // Following code is as per steps e,f,g and h of above algorithm. 5639 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5640 vpunpckhdq(dst, xtmp1, xtmp2, vec_enc); 5641 vpsadbw(dst, dst, xtmp2, vec_enc); 5642 vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc); 5643 vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc); 5644 vpackuswb(dst, xtmp1, dst, vec_enc); 5645 } 5646 5647 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5648 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5649 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5650 // Add the popcount of upper and lower bytes of word. 5651 vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc); 5652 vpsrlw(dst, xtmp1, 8, vec_enc); 5653 vpand(xtmp1, xtmp1, xtmp2, vec_enc); 5654 vpaddw(dst, dst, xtmp1, vec_enc); 5655 } 5656 5657 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5658 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5659 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5660 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5661 vpsadbw(dst, xtmp1, xtmp2, vec_enc); 5662 } 5663 5664 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5665 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5666 switch(bt) { 5667 case T_LONG: 5668 vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5669 break; 5670 case T_INT: 5671 vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5672 break; 5673 case T_CHAR: 5674 case T_SHORT: 5675 vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5676 break; 5677 case T_BYTE: 5678 case T_BOOLEAN: 5679 vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5680 break; 5681 default: 5682 fatal("Unsupported type %s", type2name(bt)); 5683 break; 5684 } 5685 } 5686 5687 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src, 5688 KRegister mask, bool merge, int vec_enc) { 5689 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 5690 switch(bt) { 5691 case T_LONG: 5692 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5693 evpopcntq(dst, mask, src, merge, vec_enc); 5694 break; 5695 case T_INT: 5696 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5697 evpopcntd(dst, mask, src, merge, vec_enc); 5698 break; 5699 case T_CHAR: 5700 case T_SHORT: 5701 assert(VM_Version::supports_avx512_bitalg(), ""); 5702 evpopcntw(dst, mask, src, merge, vec_enc); 5703 break; 5704 case T_BYTE: 5705 case T_BOOLEAN: 5706 assert(VM_Version::supports_avx512_bitalg(), ""); 5707 evpopcntb(dst, mask, 
src, merge, vec_enc); 5708 break; 5709 default: 5710 fatal("Unsupported type %s", type2name(bt)); 5711 break; 5712 } 5713 } 5714 5715 #ifndef _LP64 5716 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) { 5717 assert(VM_Version::supports_avx512bw(), ""); 5718 kmovdl(tmp, src); 5719 kunpckdql(dst, tmp, tmp); 5720 } 5721 #endif 5722 5723 // Bit reversal algorithm first reverses the bits of each byte followed by 5724 // a byte level reversal for multi-byte primitive types (short/int/long). 5725 // Algorithm performs a lookup table access to get reverse bit sequence 5726 // corresponding to a 4 bit value. Thus a reverse bit sequence for a byte 5727 // is obtained by swapping the reverse bit sequences of upper and lower 5728 // nibble of a byte. 5729 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5730 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5731 if (VM_Version::supports_avx512vlbw()) { 5732 5733 // Get the reverse bit sequence of lower nibble of each byte. 5734 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg); 5735 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 5736 evpandq(dst, xtmp2, src, vec_enc); 5737 vpshufb(dst, xtmp1, dst, vec_enc); 5738 vpsllq(dst, dst, 4, vec_enc); 5739 5740 // Get the reverse bit sequence of upper nibble of each byte. 5741 vpandn(xtmp2, xtmp2, src, vec_enc); 5742 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 5743 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 5744 5745 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 5746 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 5747 evporq(xtmp2, dst, xtmp2, vec_enc); 5748 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 5749 5750 } else if(vec_enc == Assembler::AVX_512bit) { 5751 // Shift based bit reversal. 5752 assert(bt == T_LONG || bt == T_INT, ""); 5753 5754 // Swap lower and upper nibble of each byte. 5755 vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc); 5756 5757 // Swap two least and most significant bits of each nibble. 5758 vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc); 5759 5760 // Swap adjacent pair of bits. 5761 evmovdqul(xtmp1, k0, dst, true, vec_enc); 5762 vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc); 5763 5764 evmovdqul(xtmp1, k0, dst, true, vec_enc); 5765 vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc); 5766 } else { 5767 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp); 5768 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 5769 5770 // Get the reverse bit sequence of lower nibble of each byte. 5771 vpand(dst, xtmp2, src, vec_enc); 5772 vpshufb(dst, xtmp1, dst, vec_enc); 5773 vpsllq(dst, dst, 4, vec_enc); 5774 5775 // Get the reverse bit sequence of upper nibble of each byte. 5776 vpandn(xtmp2, xtmp2, src, vec_enc); 5777 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 5778 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 5779 5780 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 5781 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 
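    // Worked example for a single byte 0xD2 (0b11010010): the lower nibble 0x2 reverses to 0x4 and
    // is shifted left into 0x40, the upper nibble 0xD reverses to 0xB and lands in the low nibble
    // as 0x0B; OR-ing the two gives 0x4B (0b01001011), the bit-reversed byte.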
5782 vpor(xtmp2, dst, xtmp2, vec_enc); 5783 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 5784 } 5785 } 5786 5787 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc, 5788 XMMRegister xtmp, Register rscratch) { 5789 assert(VM_Version::supports_gfni(), ""); 5790 assert(rscratch != noreg || always_reachable(mask), "missing"); 5791 5792 // Galois field instruction based bit reversal based on following algorithm. 5793 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 5794 vpbroadcastq(xtmp, mask, vec_enc, rscratch); 5795 vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc); 5796 vector_reverse_byte(bt, dst, xtmp, vec_enc); 5797 } 5798 5799 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src, 5800 XMMRegister xtmp1, Register rtmp, int vec_enc) { 5801 vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc); 5802 evpandq(dst, xtmp1, src, vec_enc); 5803 vpsllq(dst, dst, nbits, vec_enc); 5804 vpandn(xtmp1, xtmp1, src, vec_enc); 5805 vpsrlq(xtmp1, xtmp1, nbits, vec_enc); 5806 evporq(dst, dst, xtmp1, vec_enc); 5807 } 5808 5809 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5810 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5811 // Shift based bit reversal. 5812 assert(VM_Version::supports_evex(), ""); 5813 switch(bt) { 5814 case T_LONG: 5815 // Swap upper and lower double word of each quad word. 5816 evprorq(xtmp1, k0, src, 32, true, vec_enc); 5817 evprord(xtmp1, k0, xtmp1, 16, true, vec_enc); 5818 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 5819 break; 5820 case T_INT: 5821 // Swap upper and lower word of each double word. 5822 evprord(xtmp1, k0, src, 16, true, vec_enc); 5823 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 5824 break; 5825 case T_CHAR: 5826 case T_SHORT: 5827 // Swap upper and lower byte of each word. 5828 vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc); 5829 break; 5830 case T_BYTE: 5831 evmovdquq(dst, k0, src, true, vec_enc); 5832 break; 5833 default: 5834 fatal("Unsupported type %s", type2name(bt)); 5835 break; 5836 } 5837 } 5838 5839 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) { 5840 if (bt == T_BYTE) { 5841 if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) { 5842 evmovdquq(dst, k0, src, true, vec_enc); 5843 } else { 5844 vmovdqu(dst, src); 5845 } 5846 return; 5847 } 5848 // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using 5849 // pre-computed shuffle indices. 
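  // For T_INT, for instance, the permutation mask presumably holds byte indices
  // {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12} in each 128-bit lane, so the vpshufb below
  // reverses the bytes within every 4-byte element.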
5850 switch(bt) { 5851 case T_LONG: 5852 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg); 5853 break; 5854 case T_INT: 5855 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg); 5856 break; 5857 case T_CHAR: 5858 case T_SHORT: 5859 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg); 5860 break; 5861 default: 5862 fatal("Unsupported type %s", type2name(bt)); 5863 break; 5864 } 5865 vpshufb(dst, src, dst, vec_enc); 5866 } 5867 5868 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src, 5869 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 5870 KRegister ktmp, Register rtmp, bool merge, int vec_enc) { 5871 assert(is_integral_type(bt), ""); 5872 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 5873 assert(VM_Version::supports_avx512cd(), ""); 5874 switch(bt) { 5875 case T_LONG: 5876 evplzcntq(dst, ktmp, src, merge, vec_enc); 5877 break; 5878 case T_INT: 5879 evplzcntd(dst, ktmp, src, merge, vec_enc); 5880 break; 5881 case T_SHORT: 5882 vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc); 5883 vpunpcklwd(xtmp2, xtmp1, src, vec_enc); 5884 evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc); 5885 vpunpckhwd(dst, xtmp1, src, vec_enc); 5886 evplzcntd(dst, ktmp, dst, merge, vec_enc); 5887 vpackusdw(dst, xtmp2, dst, vec_enc); 5888 break; 5889 case T_BYTE: 5890 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 5891 // accessing the lookup table. 5892 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 5893 // accessing the lookup table. 5894 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 5895 assert(VM_Version::supports_avx512bw(), ""); 5896 evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp); 5897 vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc); 5898 vpand(xtmp2, dst, src, vec_enc); 5899 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 5900 vpsrlw(xtmp3, src, 4, vec_enc); 5901 vpand(xtmp3, dst, xtmp3, vec_enc); 5902 vpshufb(dst, xtmp1, xtmp3, vec_enc); 5903 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 5904 evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc); 5905 evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc); 5906 break; 5907 default: 5908 fatal("Unsupported type %s", type2name(bt)); 5909 break; 5910 } 5911 } 5912 5913 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5914 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 5915 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp); 5916 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 5917 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 5918 // accessing the lookup table. 5919 vpand(dst, xtmp2, src, vec_enc); 5920 vpshufb(dst, xtmp1, dst, vec_enc); 5921 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 5922 // accessing the lookup table. 5923 vpsrlw(xtmp3, src, 4, vec_enc); 5924 vpand(xtmp3, xtmp2, xtmp3, vec_enc); 5925 vpshufb(xtmp2, xtmp1, xtmp3, vec_enc); 5926 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 
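  // Worked example for the byte 0b00000101: T2 (leading zeros of the MSB nibble 0000) is 4 and
  // T1 (leading zeros of the LSB nibble 0101) is 1; because the MSB nibble is zero the two counts
  // are added, giving the expected leading zero count of 5.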
5927 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 5928 vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc); 5929 vpaddb(dst, dst, xtmp2, vec_enc); 5930 vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc); 5931 } 5932 5933 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5934 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 5935 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 5936 // Add zero counts of lower byte and upper byte of a word if 5937 // upper byte holds a zero value. 5938 vpsrlw(xtmp3, src, 8, vec_enc); 5939 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 5940 vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc); 5941 vpsllw(xtmp2, dst, 8, vec_enc); 5942 vpaddw(xtmp2, xtmp2, dst, vec_enc); 5943 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 5944 vpsrlw(dst, dst, 8, vec_enc); 5945 } 5946 5947 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5948 XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) { 5949 // Since IEEE 754 floating point format represents mantissa in 1.0 format 5950 // hence biased exponent can be used to compute leading zero count as per 5951 // following formula:- 5952 // LZCNT = 32 - (biased_exp - 127) 5953 // Special handling has been introduced for Zero, Max_Int and -ve source values. 5954 5955 // Broadcast 0xFF 5956 vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc); 5957 vpsrld(xtmp1, xtmp1, 24, vec_enc); 5958 5959 // Extract biased exponent. 5960 vcvtdq2ps(dst, src, vec_enc); 5961 vpsrld(dst, dst, 23, vec_enc); 5962 vpand(dst, dst, xtmp1, vec_enc); 5963 5964 // Broadcast 127. 5965 vpsrld(xtmp1, xtmp1, 1, vec_enc); 5966 // Exponent = biased_exp - 127 5967 vpsubd(dst, dst, xtmp1, vec_enc); 5968 5969 // Exponent = Exponent + 1 5970 vpsrld(xtmp3, xtmp1, 6, vec_enc); 5971 vpaddd(dst, dst, xtmp3, vec_enc); 5972 5973 // Replace -ve exponent with zero, exponent is -ve when src 5974 // lane contains a zero value. 5975 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5976 vblendvps(dst, dst, xtmp2, dst, vec_enc); 5977 5978 // Rematerialize broadcast 32. 5979 vpslld(xtmp1, xtmp3, 5, vec_enc); 5980 // Exponent is 32 if corresponding source lane contains max_int value. 5981 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 5982 // LZCNT = 32 - exponent 5983 vpsubd(dst, xtmp1, dst, vec_enc); 5984 5985 // Replace LZCNT with a value 1 if corresponding source lane 5986 // contains max_int value. 5987 vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc); 5988 5989 // Replace biased_exp with 0 if source lane value is less than zero. 5990 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5991 vblendvps(dst, dst, xtmp2, src, vec_enc); 5992 } 5993 5994 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5995 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 5996 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 5997 // Add zero counts of lower word and upper word of a double word if 5998 // upper word holds a zero value. 5999 vpsrld(xtmp3, src, 16, vec_enc); 6000 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 6001 vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc); 6002 vpslld(xtmp2, dst, 16, vec_enc); 6003 vpaddd(xtmp2, xtmp2, dst, vec_enc); 6004 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6005 vpsrld(dst, dst, 16, vec_enc); 6006 // Add zero counts of lower doubleword and upper doubleword of a 6007 // quadword if upper doubleword holds a zero value. 
6008 vpsrlq(xtmp3, src, 32, vec_enc); 6009 vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc); 6010 vpsllq(xtmp2, dst, 32, vec_enc); 6011 vpaddq(xtmp2, xtmp2, dst, vec_enc); 6012 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6013 vpsrlq(dst, dst, 32, vec_enc); 6014 } 6015 6016 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, 6017 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 6018 Register rtmp, int vec_enc) { 6019 assert(is_integral_type(bt), "unexpected type"); 6020 assert(vec_enc < Assembler::AVX_512bit, ""); 6021 switch(bt) { 6022 case T_LONG: 6023 vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6024 break; 6025 case T_INT: 6026 vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc); 6027 break; 6028 case T_SHORT: 6029 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6030 break; 6031 case T_BYTE: 6032 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6033 break; 6034 default: 6035 fatal("Unsupported type %s", type2name(bt)); 6036 break; 6037 } 6038 } 6039 6040 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) { 6041 switch(bt) { 6042 case T_BYTE: 6043 vpsubb(dst, src1, src2, vec_enc); 6044 break; 6045 case T_SHORT: 6046 vpsubw(dst, src1, src2, vec_enc); 6047 break; 6048 case T_INT: 6049 vpsubd(dst, src1, src2, vec_enc); 6050 break; 6051 case T_LONG: 6052 vpsubq(dst, src1, src2, vec_enc); 6053 break; 6054 default: 6055 fatal("Unsupported type %s", type2name(bt)); 6056 break; 6057 } 6058 } 6059 6060 // Trailing zero count computation is based on leading zero count operation as per 6061 // following equation. All AVX3 targets support AVX512CD feature which offers 6062 // direct vector instruction to compute leading zero count. 
6063 // CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x) 6064 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src, 6065 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 6066 XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) { 6067 assert(is_integral_type(bt), ""); 6068 // xtmp = -1 6069 vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc); 6070 // xtmp = xtmp + src 6071 vpadd(bt, xtmp4, xtmp4, src, vec_enc); 6072 // xtmp = xtmp & ~src 6073 vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc); 6074 vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc); 6075 vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc); 6076 vpsub(bt, dst, xtmp4, dst, vec_enc); 6077 } 6078 6079 // Trailing zero count computation for AVX2 targets is based on popcount operation as per the following equation 6080 // CTZ = PRIM_TYPE_WIDTH - POPC(x | -x) 6081 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6082 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6083 assert(is_integral_type(bt), ""); 6084 // xtmp = 0 6085 vpxor(xtmp3, xtmp3, xtmp3, vec_enc); 6086 // xtmp = 0 - src 6087 vpsub(bt, xtmp3, xtmp3, src, vec_enc); 6088 // xtmp = xtmp | src 6089 vpor(xtmp3, xtmp3, src, vec_enc); 6090 vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc); 6091 vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc); 6092 vpsub(bt, dst, xtmp1, dst, vec_enc); 6093 } 6094 6095 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) { 6096 Label done; 6097 Label neg_divisor_fastpath; 6098 cmpl(divisor, 0); 6099 jccb(Assembler::less, neg_divisor_fastpath); 6100 xorl(rdx, rdx); 6101 divl(divisor); 6102 jmpb(done); 6103 bind(neg_divisor_fastpath); 6104 // Fastpath for divisor < 0: 6105 // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1) 6106 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned() 6107 movl(rdx, rax); 6108 subl(rdx, divisor); 6109 if (VM_Version::supports_bmi1()) { 6110 andnl(rax, rdx, rax); 6111 } else { 6112 notl(rdx); 6113 andl(rax, rdx); 6114 } 6115 shrl(rax, 31); 6116 bind(done); 6117 } 6118 6119 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) { 6120 Label done; 6121 Label neg_divisor_fastpath; 6122 cmpl(divisor, 0); 6123 jccb(Assembler::less, neg_divisor_fastpath); 6124 xorl(rdx, rdx); 6125 divl(divisor); 6126 jmpb(done); 6127 bind(neg_divisor_fastpath); 6128 // Fastpath when divisor < 0: 6129 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor) 6130 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned() 6131 movl(rdx, rax); 6132 subl(rax, divisor); 6133 if (VM_Version::supports_bmi1()) { 6134 andnl(rax, rax, rdx); 6135 } else { 6136 notl(rax); 6137 andl(rax, rdx); 6138 } 6139 sarl(rax, 31); 6140 andl(rax, divisor); 6141 subl(rdx, rax); 6142 bind(done); 6143 } 6144 6145 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) { 6146 Label done; 6147 Label neg_divisor_fastpath; 6148 6149 cmpl(divisor, 0); 6150 jccb(Assembler::less, neg_divisor_fastpath); 6151 xorl(rdx, rdx); 6152 divl(divisor); 6153 jmpb(done); 6154 bind(neg_divisor_fastpath); 6155 // Fastpath for divisor < 0: 6156 // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1) 6157 //
remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor) 6158 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in 6159 // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned() 6160 movl(rdx, rax); 6161 subl(rax, divisor); 6162 if (VM_Version::supports_bmi1()) { 6163 andnl(rax, rax, rdx); 6164 } else { 6165 notl(rax); 6166 andl(rax, rdx); 6167 } 6168 movl(tmp, rax); 6169 shrl(rax, 31); // quotient 6170 sarl(tmp, 31); 6171 andl(tmp, divisor); 6172 subl(rdx, tmp); // remainder 6173 bind(done); 6174 } 6175 6176 #ifdef _LP64 6177 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1, 6178 XMMRegister xtmp2, Register rtmp) { 6179 if(VM_Version::supports_gfni()) { 6180 // Galois field instruction based bit reversal based on following algorithm. 6181 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 6182 mov64(rtmp, 0x8040201008040201L); 6183 movq(xtmp1, src); 6184 movq(xtmp2, rtmp); 6185 gf2p8affineqb(xtmp1, xtmp2, 0); 6186 movq(dst, xtmp1); 6187 } else { 6188 // Swap even and odd numbered bits. 6189 movl(rtmp, src); 6190 andl(rtmp, 0x55555555); 6191 shll(rtmp, 1); 6192 movl(dst, src); 6193 andl(dst, 0xAAAAAAAA); 6194 shrl(dst, 1); 6195 orl(dst, rtmp); 6196 6197 // Swap LSB and MSB 2 bits of each nibble. 6198 movl(rtmp, dst); 6199 andl(rtmp, 0x33333333); 6200 shll(rtmp, 2); 6201 andl(dst, 0xCCCCCCCC); 6202 shrl(dst, 2); 6203 orl(dst, rtmp); 6204 6205 // Swap LSB and MSB 4 bits of each byte. 6206 movl(rtmp, dst); 6207 andl(rtmp, 0x0F0F0F0F); 6208 shll(rtmp, 4); 6209 andl(dst, 0xF0F0F0F0); 6210 shrl(dst, 4); 6211 orl(dst, rtmp); 6212 } 6213 bswapl(dst); 6214 } 6215 6216 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1, 6217 XMMRegister xtmp2, Register rtmp1, Register rtmp2) { 6218 if(VM_Version::supports_gfni()) { 6219 // Galois field instruction based bit reversal based on following algorithm. 6220 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 6221 mov64(rtmp1, 0x8040201008040201L); 6222 movq(xtmp1, src); 6223 movq(xtmp2, rtmp1); 6224 gf2p8affineqb(xtmp1, xtmp2, 0); 6225 movq(dst, xtmp1); 6226 } else { 6227 // Swap even and odd numbered bits. 6228 movq(rtmp1, src); 6229 mov64(rtmp2, 0x5555555555555555L); 6230 andq(rtmp1, rtmp2); 6231 shlq(rtmp1, 1); 6232 movq(dst, src); 6233 notq(rtmp2); 6234 andq(dst, rtmp2); 6235 shrq(dst, 1); 6236 orq(dst, rtmp1); 6237 6238 // Swap LSB and MSB 2 bits of each nibble. 6239 movq(rtmp1, dst); 6240 mov64(rtmp2, 0x3333333333333333L); 6241 andq(rtmp1, rtmp2); 6242 shlq(rtmp1, 2); 6243 notq(rtmp2); 6244 andq(dst, rtmp2); 6245 shrq(dst, 2); 6246 orq(dst, rtmp1); 6247 6248 // Swap LSB and MSB 4 bits of each byte. 
void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
                                 XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
  if (VM_Version::supports_gfni()) {
    // Bit reversal using a Galois field affine transformation, based on the following algorithm:
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
    mov64(rtmp1, 0x8040201008040201L);
    movq(xtmp1, src);
    movq(xtmp2, rtmp1);
    gf2p8affineqb(xtmp1, xtmp2, 0);
    movq(dst, xtmp1);
  } else {
    // Swap even and odd numbered bits.
    movq(rtmp1, src);
    mov64(rtmp2, 0x5555555555555555L);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 1);
    movq(dst, src);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 1);
    orq(dst, rtmp1);

    // Swap LSB and MSB 2 bits of each nibble.
    movq(rtmp1, dst);
    mov64(rtmp2, 0x3333333333333333L);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 2);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 2);
    orq(dst, rtmp1);

    // Swap LSB and MSB 4 bits of each byte.
    movq(rtmp1, dst);
    mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 4);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 4);
    orq(dst, rtmp1);
  }
  bswapq(dst);
}

void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divq(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
  movq(rdx, rax);
  subq(rdx, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rdx, rax);
  } else {
    notq(rdx);
    andq(rax, rdx);
  }
  shrq(rax, 63);
  bind(done);
}

void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorq(rdx, rdx);
  divq(divisor);
  jmp(done);
  bind(neg_divisor_fastpath);
  // Fastpath when divisor < 0:
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
  movq(rdx, rax);
  subq(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rax, rdx);
  } else {
    notq(rax);
    andq(rax, rdx);
  }
  sarq(rax, 63);
  andq(rax, divisor);
  subq(rdx, rax);
  bind(done);
}

void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorq(rdx, rdx);
  divq(divisor);
  jmp(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
  // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
  movq(rdx, rax);
  subq(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rax, rdx);
  } else {
    notq(rax);
    andq(rax, rdx);
  }
  movq(tmp, rax);
  shrq(rax, 63); // quotient
  sarq(tmp, 63);
  andq(tmp, divisor);
  subq(rdx, tmp); // remainder
  bind(done);
}
#endif
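// A scalar reference sketch of the semantics implemented by rearrange_bytes below for a 64-byte
// (AVX-512) vector: each destination byte selects an arbitrary source byte through its shuffle
// index. Illustrative only; the helper name is hypothetical, and the emitted code instead builds
// the result lane by lane with masked VPSHUFB over broadcast 128-bit lanes, as described in the
// comment below.
static inline void rearrange_bytes_sketch(uint8_t dst[64], const uint8_t shuffle[64], const uint8_t src[64]) {
  for (int i = 0; i < 64; i++) {
    assert(shuffle[i] < 64, "shuffle indices are expected to be in the range 0-63");
    dst[i] = src[shuffle[i]];
  }
}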
void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
                                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
                                        int vlen_enc) {
  assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are in-lane operations: VPSHUFB determines each result byte using only the
  // lower 4 bits of its shuffle index, so all shuffle indices are effectively normalized to the
  // range 0-15. Every multiple of 16 therefore maps to the same relative position within a
  // 128-bit lane, e.g. shuffle indices 16, 32 and 48 all select the first element of their
  // respective 128-bit lanes.
  movl(rtmp, 16);
  evpbroadcastb(xtmp1, rtmp, vlen_enc);

  // Compute a mask for the shuffle vector by comparing indices with the expression INDEX < 16.
  // Broadcast the first 128-bit lane across the entire vector, shuffle it using the original
  // shuffle indices and move the shuffled bytes corresponding to a true mask into the
  // destination vector.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
  evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);

  // Perform the above steps with the lane comparison expression INDEX >= 16 && INDEX < 32,
  // broadcasting the second 128-bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
  vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform the above steps with the lane comparison expression INDEX >= 32 && INDEX < 48,
  // broadcasting the third 128-bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp2, Assembler::nlt, true, vlen_enc);
  vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform the above steps with the lane comparison expression INDEX >= 48 && INDEX < 64,
  // broadcasting the fourth 128-bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
  vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
}

void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
                                                   XMMRegister shuffle, XMMRegister src, int vlen_enc) {
  if (vlen_enc == AVX_128bit) {
    vpermilps(dst, src, shuffle, vlen_enc);
  } else if (bt == T_INT) {
    vpermd(dst, shuffle, src, vlen_enc);
  } else {
    assert(bt == T_FLOAT, "");
    vpermps(dst, shuffle, src, vlen_enc);
  }
}

#ifdef _LP64
void C2_MacroAssembler::load_nklass_compact_c2(Register dst, Register obj, Register index, Address::ScaleFactor scale, int disp) {
  // Note: Don't clobber obj anywhere in this method!

  // The incoming address is pointing into obj-start + klass_offset_in_bytes. We need to extract
  // obj-start, so that we can load from the object's mark-word instead. Usually the address
  // comes as obj-start in obj and klass_offset_in_bytes in disp. However, sometimes C2
  // emits code that pre-computes obj-start + klass_offset_in_bytes into a register, and
  // then passes that register as obj and 0 in disp. The following code extracts the base
  // and offset to load the mark-word.
  int offset = oopDesc::mark_offset_in_bytes() + disp - oopDesc::klass_offset_in_bytes();
  movq(dst, Address(obj, index, scale, offset));
  shrq(dst, markWord::klass_shift);
}
#endif
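// A scalar sketch (illustrative only, helper name hypothetical) of the rebasing performed by
// load_nklass_compact_c2 above: the incoming displacement addresses the klass field, so it is
// moved back by klass_offset_in_bytes and forward by mark_offset_in_bytes to address the mark
// word, whose upper bits hold the narrow klass id and are extracted by the final shift.
static inline int rebased_mark_offset_sketch(int disp) {
  // Same arithmetic as the 'offset' computation in load_nklass_compact_c2.
  return oopDesc::mark_offset_in_bytes() + disp - oopDesc::klass_offset_in_bytes();
}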