1 /* 2 * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 * 23 */ 24 25 #include "precompiled.hpp" 26 #include "asm/assembler.hpp" 27 #include "asm/assembler.inline.hpp" 28 #include "gc/shared/barrierSet.hpp" 29 #include "gc/shared/barrierSetAssembler.hpp" 30 #include "oops/methodData.hpp" 31 #include "opto/c2_MacroAssembler.hpp" 32 #include "opto/intrinsicnode.hpp" 33 #include "opto/output.hpp" 34 #include "opto/opcodes.hpp" 35 #include "opto/subnode.hpp" 36 #include "runtime/globals.hpp" 37 #include "runtime/objectMonitor.hpp" 38 #include "runtime/stubRoutines.hpp" 39 #include "utilities/checkedCast.hpp" 40 #include "utilities/globalDefinitions.hpp" 41 #include "utilities/powerOfTwo.hpp" 42 #include "utilities/sizes.hpp" 43 44 #ifdef PRODUCT 45 #define BLOCK_COMMENT(str) /* nothing */ 46 #define STOP(error) stop(error) 47 #else 48 #define BLOCK_COMMENT(str) block_comment(str) 49 #define STOP(error) block_comment(error); stop(error) 50 #endif 51 52 // C2 compiled method's prolog code. 53 void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) { 54 55 // WARNING: Initial instruction MUST be 5 bytes or longer so that 56 // NativeJump::patch_verified_entry will be able to patch out the entry 57 // code safely. The push to verify stack depth is ok at 5 bytes, 58 // the frame allocation can be either 3 or 6 bytes. So if we don't do 59 // stack bang then we must use the 6 byte frame allocation even if 60 // we have no frame. :-( 61 assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect"); 62 63 assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned"); 64 // Remove word for return addr 65 framesize -= wordSize; 66 stack_bang_size -= wordSize; 67 68 // Calls to C2R adapters often do not accept exceptional returns. 69 // We require that their callers must bang for them. But be careful, because 70 // some VM calls (such as call site linkage) can use several kilobytes of 71 // stack. But the stack safety zone should account for that. 72 // See bugs 4446381, 4468289, 4497237. 73 if (stack_bang_size > 0) { 74 generate_stack_overflow_check(stack_bang_size); 75 76 // We always push rbp, so that on return to interpreter rbp, will be 77 // restored correctly and we can correct the stack. 78 push(rbp); 79 // Save caller's stack pointer into RBP if the frame pointer is preserved. 
80 if (PreserveFramePointer) { 81 mov(rbp, rsp); 82 } 83 // Remove word for ebp 84 framesize -= wordSize; 85 86 // Create frame 87 if (framesize) { 88 subptr(rsp, framesize); 89 } 90 } else { 91 // Create frame (force generation of a 4 byte immediate value) 92 subptr_imm32(rsp, framesize); 93 94 // Save RBP register now. 95 framesize -= wordSize; 96 movptr(Address(rsp, framesize), rbp); 97 // Save caller's stack pointer into RBP if the frame pointer is preserved. 98 if (PreserveFramePointer) { 99 movptr(rbp, rsp); 100 if (framesize > 0) { 101 addptr(rbp, framesize); 102 } 103 } 104 } 105 106 if (VerifyStackAtCalls) { // Majik cookie to verify stack depth 107 framesize -= wordSize; 108 movptr(Address(rsp, framesize), (int32_t)0xbadb100d); 109 } 110 111 #ifndef _LP64 112 // If method sets FPU control word do it now 113 if (fp_mode_24b) { 114 fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24())); 115 } 116 if (UseSSE >= 2 && VerifyFPU) { 117 verify_FPU(0, "FPU stack must be clean on entry"); 118 } 119 #endif 120 121 #ifdef ASSERT 122 if (VerifyStackAtCalls) { 123 Label L; 124 push(rax); 125 mov(rax, rsp); 126 andptr(rax, StackAlignmentInBytes-1); 127 cmpptr(rax, StackAlignmentInBytes-wordSize); 128 pop(rax); 129 jcc(Assembler::equal, L); 130 STOP("Stack is not properly aligned!"); 131 bind(L); 132 } 133 #endif 134 135 if (!is_stub) { 136 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 137 #ifdef _LP64 138 if (BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) { 139 // We put the non-hot code of the nmethod entry barrier out-of-line in a stub. 140 Label dummy_slow_path; 141 Label dummy_continuation; 142 Label* slow_path = &dummy_slow_path; 143 Label* continuation = &dummy_continuation; 144 if (!Compile::current()->output()->in_scratch_emit_size()) { 145 // Use real labels from actual stub when not emitting code for the purpose of measuring its size 146 C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub(); 147 Compile::current()->output()->add_stub(stub); 148 slow_path = &stub->entry(); 149 continuation = &stub->continuation(); 150 } 151 bs->nmethod_entry_barrier(this, slow_path, continuation); 152 } 153 #else 154 // Don't bother with out-of-line nmethod entry barrier stub for x86_32. 155 bs->nmethod_entry_barrier(this, nullptr /* slow_path */, nullptr /* continuation */); 156 #endif 157 } 158 } 159 160 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) { 161 switch (vlen_in_bytes) { 162 case 4: // fall-through 163 case 8: // fall-through 164 case 16: return Assembler::AVX_128bit; 165 case 32: return Assembler::AVX_256bit; 166 case 64: return Assembler::AVX_512bit; 167 168 default: { 169 ShouldNotReachHere(); 170 return Assembler::AVX_NoVec; 171 } 172 } 173 } 174 175 // fast_lock and fast_unlock used by C2 176 177 // Because the transitions from emitted code to the runtime 178 // monitorenter/exit helper stubs are so slow it's critical that 179 // we inline both the stack-locking fast path and the inflated fast path. 180 // 181 // See also: cmpFastLock and cmpFastUnlock. 182 // 183 // What follows is a specialized inline transliteration of the code 184 // in enter() and exit(). If we're concerned about I$ bloat another 185 // option would be to emit TrySlowEnter and TrySlowExit methods 186 // at startup-time. These methods would accept arguments as 187 // (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure 188 // indications in the icc.ZFlag. 
// fast_lock and fast_unlock would simply
// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
// In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
//
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods.  That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads.  We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) explicit barriers or fence operations.
//
// TODO:
//
// *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
//    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
//    Given TLAB allocation, Self is usually manifested in a register, so passing it into
//    the lock operators would typically be faster than reifying Self.
//
// *  Ideally I'd define the primitives as:
//       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
//       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
//    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with the rather awkward and brittle register assignments below.
//    Furthermore the register assignments are overconstrained, possibly resulting in
//    sub-optimal code near the synchronization site.
//
// *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
//    Alternately, use a better sp-proximity test.
//
// *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
//    Either one is sufficient to uniquely identify a thread.
//    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
//
// *  Intrinsify notify() and notifyAll() for the common cases where the
//    object is locked by the calling thread but the waitlist is empty.
//    Avoid the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
//
// *  Use jccb and jmpb instead of jcc and jmp to improve code density.
//    But beware of excessive branch density on AMD Opterons.
//
// *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
//    or failure of the fast path.  If the fast path fails then we pass
//    control to the slow path, typically in C.  In fast_lock and
//    fast_unlock we often branch to DONE_LABEL, just to find that C2
//    will emit a conditional branch immediately after the node.
//    So we have branches to branches and lots of ICC.ZF games.
//    Instead, it might be better to have C2 pass a "FailureLabel"
//    into fast_lock and fast_unlock.  In the case of success, control
//    will drop through the node.  ICC.ZF is undefined at exit.
// In the case of failure, the node will branch directly to the
// FailureLabel.


// obj: object to lock
// box: on-stack box address (displaced header location) - KILLED
// rax: tmp -- KILLED
// scr: tmp -- KILLED
void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
                                  Register scrReg, Register cx1Reg, Register cx2Reg, Register thread,
                                  Metadata* method_data) {
  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
  // Ensure the register assignments are disjoint
  assert(tmpReg == rax, "");
  assert(cx1Reg == noreg, "");
  assert(cx2Reg == noreg, "");
  assert_different_registers(objReg, boxReg, tmpReg, scrReg);

  // Possible cases that we'll encounter in fast_lock
  // ------------------------------------------------
  // * Inflated
  //    -- unlocked
  //    -- Locked
  //       = by self
  //       = by other
  // * neutral
  // * stack-locked
  //    -- by self
  //       = sp-proximity test hits
  //       = sp-proximity test generates false-negative
  //    -- by other
  //

  Label IsInflated, DONE_LABEL, NO_COUNT, COUNT;

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(tmpReg, objReg, scrReg);
    movl(tmpReg, Address(tmpReg, Klass::access_flags_offset()));
    testl(tmpReg, JVM_ACC_IS_VALUE_BASED_CLASS);
    jcc(Assembler::notZero, DONE_LABEL);
  }

  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // [FETCH]
  testptr(tmpReg, markWord::monitor_value);                           // inflated vs stack-locked|neutral
  jcc(Assembler::notZero, IsInflated);

  if (LockingMode == LM_MONITOR) {
    // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
    testptr(objReg, objReg);
  } else {
    assert(LockingMode == LM_LEGACY, "must be");
    // Attempt stack-locking ...
    orptr (tmpReg, markWord::unlocked_value);
    movptr(Address(boxReg, 0), tmpReg);                               // Anticipate successful CAS
    lock();
    cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Updates tmpReg
    jcc(Assembler::equal, COUNT);                                     // Success

    // Recursive locking.
    // The object is stack-locked: markword contains stack pointer to BasicLock.
    // Locked by current thread if difference with current SP is less than one page.
    subptr(tmpReg, rsp);
    // Next instruction sets ZFlag == 1 (Success) if difference is less than one page.
    andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - (int)os::vm_page_size())) );
    movptr(Address(boxReg, 0), tmpReg);
  }
  jmp(DONE_LABEL);

  bind(IsInflated);
  // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value

#ifndef _LP64
  // The object is inflated.

  // boxReg refers to the on-stack BasicLock in the current frame.
  // We'd like to write:
  //   set box->_displaced_header = markWord::unused_mark().  Any non-0 value suffices.
  // This is convenient but results in an ST-before-CAS penalty.  The following CAS suffers
  // additional latency as we have another ST in the store buffer that must drain.

  // avoid ST-before-CAS
  // register juggle because we need tmpReg for cmpxchgptr below
  movptr(scrReg, boxReg);
  movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]

  // Optimistic form: consider XORL tmpReg,tmpReg
  movptr(tmpReg, NULL_WORD);

  // Appears unlocked - try to swing _owner from null to non-null.
332 // Ideally, I'd manifest "Self" with get_thread and then attempt 333 // to CAS the register containing Self into m->Owner. 334 // But we don't have enough registers, so instead we can either try to CAS 335 // rsp or the address of the box (in scr) into &m->owner. If the CAS succeeds 336 // we later store "Self" into m->Owner. Transiently storing a stack address 337 // (rsp or the address of the box) into m->owner is harmless. 338 // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand. 339 lock(); 340 cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner))); 341 movptr(Address(scrReg, 0), 3); // box->_displaced_header = 3 342 // If we weren't able to swing _owner from null to the BasicLock 343 // then take the slow path. 344 jccb (Assembler::notZero, NO_COUNT); 345 // update _owner from BasicLock to thread 346 get_thread (scrReg); // beware: clobbers ICCs 347 movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg); 348 xorptr(boxReg, boxReg); // set icc.ZFlag = 1 to indicate success 349 350 // If the CAS fails we can either retry or pass control to the slow path. 351 // We use the latter tactic. 352 // Pass the CAS result in the icc.ZFlag into DONE_LABEL 353 // If the CAS was successful ... 354 // Self has acquired the lock 355 // Invariant: m->_recursions should already be 0, so we don't need to explicitly set it. 356 // Intentional fall-through into DONE_LABEL ... 357 #else // _LP64 358 // It's inflated and we use scrReg for ObjectMonitor* in this section. 359 movq(scrReg, tmpReg); 360 xorq(tmpReg, tmpReg); 361 lock(); 362 cmpxchgptr(thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner))); 363 // Unconditionally set box->_displaced_header = markWord::unused_mark(). 364 // Without cast to int32_t this style of movptr will destroy r10 which is typically obj. 365 movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value())); 366 // Propagate ICC.ZF from CAS above into DONE_LABEL. 367 jccb(Assembler::equal, COUNT); // CAS above succeeded; propagate ZF = 1 (success) 368 369 cmpptr(thread, rax); // Check if we are already the owner (recursive lock) 370 jccb(Assembler::notEqual, NO_COUNT); // If not recursive, ZF = 0 at this point (fail) 371 incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions))); 372 xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success 373 #endif // _LP64 374 bind(DONE_LABEL); 375 376 // ZFlag == 1 count in fast path 377 // ZFlag == 0 count in slow path 378 jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0 379 380 bind(COUNT); 381 // Count monitors in fast path 382 increment(Address(thread, JavaThread::held_monitor_count_offset())); 383 384 xorl(tmpReg, tmpReg); // Set ZF == 1 385 386 bind(NO_COUNT); 387 388 // At NO_COUNT the icc ZFlag is set as follows ... 389 // fast_unlock uses the same protocol. 390 // ZFlag == 1 -> Success 391 // ZFlag == 0 -> Failure - force control through the slow path 392 } 393 394 // obj: object to unlock 395 // box: box address (displaced header location), killed. Must be EAX. 396 // tmp: killed, cannot be obj nor box. 397 // 398 // Some commentary on balanced locking: 399 // 400 // fast_lock and fast_unlock are emitted only for provably balanced lock sites. 401 // Methods that don't have provably balanced locking are forced to run in the 402 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock. 
// The interpreter provides two properties:
// I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired by the current activation (frame).  Recall that the
//      interpreter maintains an on-stack list of locks currently held by
//      a frame.
// I2:  If a method attempts to unlock an object that is not held by
//      the frame, the interpreter throws IMSX.
//
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
// B() doesn't have provably balanced locking so it runs in the interpreter.
// Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
// is still locked by A().
//
// The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
// should not be unlocked by "normal" java-level locking and vice-versa.  The specification
// doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably, given that the spec legislates the JNI case as undefined, our implementation
// could reasonably *avoid* checking the owner in fast_unlock().
// In the interest of performance we elide the m->Owner == Self check in unlock.
// A perfectly viable alternative is to elide the owner check except when
// Xcheck:jni is enabled.

void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg) {
  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
  assert(boxReg == rax, "");
  assert_different_registers(objReg, boxReg, tmpReg);

  Label DONE_LABEL, Stacked, COUNT, NO_COUNT;

  if (LockingMode == LM_LEGACY) {
    cmpptr(Address(boxReg, 0), NULL_WORD);                            // Examine the displaced header
    jcc   (Assembler::zero, COUNT);                                   // 0 indicates recursive stack-lock
  }
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // Examine the object's markword
  if (LockingMode != LM_MONITOR) {
    testptr(tmpReg, markWord::monitor_value);                         // Inflated?
    jcc(Assembler::zero, Stacked);
  }

  // It's inflated.

  // Despite our balanced locking property we still check that m->_owner == Self
  // as java routines or native JNI code called by this thread might
  // have released the lock.
  // Refer to the comments in synchronizer.cpp for how we might encode extra
  // state in _succ so we can avoid fetching EntryList|cxq.
  //
  // If there's no contention try a 1-0 exit.  That is, exit without
  // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
  // we detect and recover from the race that the 1-0 exit admits.
  //
  // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
  // before it STs null into _owner, releasing the lock.  Updates
  // to data protected by the critical section must be visible before
  // we drop the lock (and thus before any other thread could acquire
  // the lock and observe the fields protected by the lock).
  // IA32's memory-model is SPO, so STs are ordered with respect to
  // each other and there's no need for an explicit barrier (fence).
  // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
#ifndef _LP64
  // Note that we could employ various encoding schemes to reduce
  // the number of loads below (currently 4) to just 2 or 3.
  // Refer to the comments in synchronizer.cpp.
467 // In practice the chain of fetches doesn't seem to impact performance, however. 468 xorptr(boxReg, boxReg); 469 orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions))); 470 jccb (Assembler::notZero, DONE_LABEL); 471 movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList))); 472 orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq))); 473 jccb (Assembler::notZero, DONE_LABEL); 474 movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD); 475 jmpb (DONE_LABEL); 476 #else // _LP64 477 // It's inflated 478 Label CheckSucc, LNotRecursive, LSuccess, LGoSlowPath; 479 480 cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0); 481 jccb(Assembler::equal, LNotRecursive); 482 483 // Recursive inflated unlock 484 decq(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions))); 485 jmpb(LSuccess); 486 487 bind(LNotRecursive); 488 movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq))); 489 orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList))); 490 jccb (Assembler::notZero, CheckSucc); 491 // Without cast to int32_t this style of movptr will destroy r10 which is typically obj. 492 movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD); 493 jmpb (DONE_LABEL); 494 495 // Try to avoid passing control into the slow_path ... 496 bind (CheckSucc); 497 498 // The following optional optimization can be elided if necessary 499 // Effectively: if (succ == null) goto slow path 500 // The code reduces the window for a race, however, 501 // and thus benefits performance. 502 cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD); 503 jccb (Assembler::zero, LGoSlowPath); 504 505 xorptr(boxReg, boxReg); 506 // Without cast to int32_t this style of movptr will destroy r10 which is typically obj. 507 movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD); 508 509 // Memory barrier/fence 510 // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ 511 // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack. 512 // This is faster on Nehalem and AMD Shanghai/Barcelona. 513 // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences 514 // We might also restructure (ST Owner=0;barrier;LD _Succ) to 515 // (mov box,0; xchgq box, &m->Owner; LD _succ) . 516 lock(); addl(Address(rsp, 0), 0); 517 518 cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD); 519 jccb (Assembler::notZero, LSuccess); 520 521 // Rare inopportune interleaving - race. 522 // The successor vanished in the small window above. 523 // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor. 524 // We need to ensure progress and succession. 525 // Try to reacquire the lock. 526 // If that fails then the new owner is responsible for succession and this 527 // thread needs to take no further action and can exit via the fast path (success). 528 // If the re-acquire succeeds then pass control into the slow path. 529 // As implemented, this latter mode is horrible because we generated more 530 // coherence traffic on the lock *and* artificially extended the critical section 531 // length while by virtue of passing control into the slow path. 
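  // The paragraph above, restated as a pseudo-code sketch.  This is
  // illustrative only (it is not compiled); the authoritative logic is the
  // store, fence and CMPXCHG emitted around this comment:
  //
  //   m->_owner = nullptr;                         // 1-0 exit, done above
  //   fence();                                     // the dummy locked addl above
  //   if (m->_succ != nullptr)  return success;    // a successor exists and will self-wake
  //   // The successor vanished: try to re-acquire so the slow path can hand off.
  //   if (!CAS(&m->_owner, nullptr, Self))  return success;  // another thread owns it now and inherits succession
  //   return failure;                              // we re-acquired; take the slow path to wake a successor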
532 533 // box is really RAX -- the following CMPXCHG depends on that binding 534 // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R) 535 lock(); 536 cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner))); 537 // There's no successor so we tried to regrab the lock. 538 // If that didn't work, then another thread grabbed the 539 // lock so we're done (and exit was a success). 540 jccb (Assembler::notEqual, LSuccess); 541 // Intentional fall-through into slow path 542 543 bind (LGoSlowPath); 544 orl (boxReg, 1); // set ICC.ZF=0 to indicate failure 545 jmpb (DONE_LABEL); 546 547 bind (LSuccess); 548 testl (boxReg, 0); // set ICC.ZF=1 to indicate success 549 jmpb (DONE_LABEL); 550 551 #endif 552 if (LockingMode == LM_LEGACY) { 553 bind (Stacked); 554 movptr(tmpReg, Address (boxReg, 0)); // re-fetch 555 lock(); 556 cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box 557 // Intentional fall-thru into DONE_LABEL 558 } 559 560 bind(DONE_LABEL); 561 562 // ZFlag == 1 count in fast path 563 // ZFlag == 0 count in slow path 564 jccb(Assembler::notZero, NO_COUNT); 565 566 bind(COUNT); 567 // Count monitors in fast path 568 #ifndef _LP64 569 get_thread(tmpReg); 570 decrementl(Address(tmpReg, JavaThread::held_monitor_count_offset())); 571 #else // _LP64 572 decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset())); 573 #endif 574 575 xorl(tmpReg, tmpReg); // Set ZF == 1 576 577 bind(NO_COUNT); 578 } 579 580 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register rax_reg, 581 Register t, Register thread) { 582 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 583 assert(rax_reg == rax, "Used for CAS"); 584 assert_different_registers(obj, box, rax_reg, t, thread); 585 586 // Handle inflated monitor. 587 Label inflated; 588 // Finish fast lock successfully. ZF value is irrelevant. 589 Label locked; 590 // Finish fast lock unsuccessfully. MUST jump with ZF == 0 591 Label slow_path; 592 593 if (DiagnoseSyncOnValueBasedClasses != 0) { 594 load_klass(rax_reg, obj, t); 595 movl(rax_reg, Address(rax_reg, Klass::access_flags_offset())); 596 testl(rax_reg, JVM_ACC_IS_VALUE_BASED_CLASS); 597 jcc(Assembler::notZero, slow_path); 598 } 599 600 const Register mark = t; 601 602 { // Lightweight Lock 603 604 Label push; 605 606 const Register top = box; 607 608 // Load the mark. 609 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 610 611 // Prefetch top. 612 movl(top, Address(thread, JavaThread::lock_stack_top_offset())); 613 614 // Check for monitor (0b10). 615 testptr(mark, markWord::monitor_value); 616 jcc(Assembler::notZero, inflated); 617 618 // Check if lock-stack is full. 619 cmpl(top, LockStack::end_offset() - 1); 620 jcc(Assembler::greater, slow_path); 621 622 // Check if recursive. 623 cmpptr(obj, Address(thread, top, Address::times_1, -oopSize)); 624 jccb(Assembler::equal, push); 625 626 // Try to lock. Transition lock bits 0b01 => 0b00 627 movptr(rax_reg, mark); 628 orptr(rax_reg, markWord::unlocked_value); 629 andptr(mark, ~(int32_t)markWord::unlocked_value); 630 lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 631 jcc(Assembler::notEqual, slow_path); 632 633 bind(push); 634 // After successful lock, push object on lock-stack. 635 movptr(Address(thread, top), obj); 636 addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize); 637 jmpb(locked); 638 } 639 640 { // Handle inflated monitor. 
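    // What the code below does, as an illustrative pseudo-code sketch
    // (comment only, not compiled):
    //
    //   if (CAS(&monitor->_owner, nullptr, current_thread))  goto locked;
    //   if (monitor->_owner == current_thread) { monitor->_recursions++; goto locked; }
    //   goto slow_path;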
641 bind(inflated); 642 643 const Register tagged_monitor = mark; 644 645 // CAS owner (null => current thread). 646 xorptr(rax_reg, rax_reg); 647 lock(); cmpxchgptr(thread, Address(tagged_monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner))); 648 jccb(Assembler::equal, locked); 649 650 // Check if recursive. 651 cmpptr(thread, rax_reg); 652 jccb(Assembler::notEqual, slow_path); 653 654 // Recursive. 655 increment(Address(tagged_monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions))); 656 } 657 658 bind(locked); 659 increment(Address(thread, JavaThread::held_monitor_count_offset())); 660 // Set ZF = 1 661 xorl(rax_reg, rax_reg); 662 663 #ifdef ASSERT 664 // Check that locked label is reached with ZF set. 665 Label zf_correct; 666 Label zf_bad_zero; 667 jcc(Assembler::zero, zf_correct); 668 jmp(zf_bad_zero); 669 #endif 670 671 bind(slow_path); 672 #ifdef ASSERT 673 // Check that slow_path label is reached with ZF not set. 674 jcc(Assembler::notZero, zf_correct); 675 stop("Fast Lock ZF != 0"); 676 bind(zf_bad_zero); 677 stop("Fast Lock ZF != 1"); 678 bind(zf_correct); 679 #endif 680 // C2 uses the value of ZF to determine the continuation. 681 } 682 683 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register reg_rax, Register t, Register thread) { 684 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 685 assert(reg_rax == rax, "Used for CAS"); 686 assert_different_registers(obj, reg_rax, t); 687 688 // Handle inflated monitor. 689 Label inflated, inflated_check_lock_stack; 690 // Finish fast unlock successfully. MUST jump with ZF == 1 691 Label unlocked; 692 693 // Assume success. 694 decrement(Address(thread, JavaThread::held_monitor_count_offset())); 695 696 const Register mark = t; 697 const Register top = reg_rax; 698 699 Label dummy; 700 C2FastUnlockLightweightStub* stub = nullptr; 701 702 if (!Compile::current()->output()->in_scratch_emit_size()) { 703 stub = new (Compile::current()->comp_arena()) C2FastUnlockLightweightStub(obj, mark, reg_rax, thread); 704 Compile::current()->output()->add_stub(stub); 705 } 706 707 Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path(); 708 Label& check_successor = stub == nullptr ? dummy : stub->check_successor(); 709 710 { // Lightweight Unlock 711 712 // Load top. 713 movl(top, Address(thread, JavaThread::lock_stack_top_offset())); 714 715 // Prefetch mark. 716 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 717 718 // Check if obj is top of lock-stack. 719 cmpptr(obj, Address(thread, top, Address::times_1, -oopSize)); 720 // Top of lock stack was not obj. Must be monitor. 721 jcc(Assembler::notEqual, inflated_check_lock_stack); 722 723 // Pop lock-stack. 724 DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);) 725 subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize); 726 727 // Check if recursive. 728 cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize)); 729 jcc(Assembler::equal, unlocked); 730 731 // We elide the monitor check, let the CAS fail instead. 732 733 // Try to unlock. Transition lock bits 0b00 => 0b01 734 movptr(reg_rax, mark); 735 andptr(reg_rax, ~(int32_t)markWord::lock_mask); 736 orptr(mark, markWord::unlocked_value); 737 lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 738 jcc(Assembler::notEqual, push_and_slow_path); 739 jmp(unlocked); 740 } 741 742 743 { // Handle inflated monitor. 
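    // What the 64-bit unlock path below does, as an illustrative pseudo-code
    // sketch (comment only, not compiled):
    //
    //   if (monitor->_recursions != 0) { monitor->_recursions--; return success; }
    //   if (monitor->_cxq != nullptr || monitor->_EntryList != nullptr)  goto check_successor;
    //   monitor->_owner = nullptr;   // 1-0 exit
    //   return success;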
744 bind(inflated_check_lock_stack); 745 #ifdef ASSERT 746 Label check_done; 747 subl(top, oopSize); 748 cmpl(top, in_bytes(JavaThread::lock_stack_base_offset())); 749 jcc(Assembler::below, check_done); 750 cmpptr(obj, Address(thread, top)); 751 jccb(Assembler::notEqual, inflated_check_lock_stack); 752 stop("Fast Unlock lock on stack"); 753 bind(check_done); 754 testptr(mark, markWord::monitor_value); 755 jccb(Assembler::notZero, inflated); 756 stop("Fast Unlock not monitor"); 757 #endif 758 759 bind(inflated); 760 761 // mark contains the tagged ObjectMonitor*. 762 const Register monitor = mark; 763 764 #ifndef _LP64 765 // Check if recursive. 766 xorptr(reg_rax, reg_rax); 767 orptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions))); 768 jcc(Assembler::notZero, check_successor); 769 770 // Check if the entry lists are empty. 771 movptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList))); 772 orptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq))); 773 jcc(Assembler::notZero, check_successor); 774 775 // Release lock. 776 movptr(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD); 777 #else // _LP64 778 Label recursive; 779 780 // Check if recursive. 781 cmpptr(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0); 782 jccb(Assembler::notEqual, recursive); 783 784 // Check if the entry lists are empty. 785 movptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq))); 786 orptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList))); 787 jcc(Assembler::notZero, check_successor); 788 789 // Release lock. 790 movptr(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD); 791 jmpb(unlocked); 792 793 // Recursive unlock. 794 bind(recursive); 795 decrement(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions))); 796 xorl(t, t); 797 #endif 798 } 799 800 bind(unlocked); 801 if (stub != nullptr) { 802 bind(stub->unlocked_continuation()); 803 } 804 805 #ifdef ASSERT 806 // Check that unlocked label is reached with ZF set. 807 Label zf_correct; 808 jcc(Assembler::zero, zf_correct); 809 stop("Fast Unlock ZF != 1"); 810 #endif 811 812 if (stub != nullptr) { 813 bind(stub->slow_path_continuation()); 814 } 815 #ifdef ASSERT 816 // Check that stub->continuation() label is reached with ZF not set. 817 jccb(Assembler::notZero, zf_correct); 818 stop("Fast Unlock ZF != 0"); 819 bind(zf_correct); 820 #endif 821 // C2 uses the value of ZF to determine the continuation. 
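  // Flag protocol, shared with fast_lock_lightweight:
  //   ZF == 1  ->  fast path succeeded; C2 continues with the inline (fast) continuation.
  //   ZF == 0  ->  fast path failed; C2 branches to the slow-path continuation.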
822 } 823 824 //------------------------------------------------------------------------------------------- 825 // Generic instructions support for use in .ad files C2 code generation 826 827 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) { 828 if (dst != src) { 829 movdqu(dst, src); 830 } 831 if (opcode == Op_AbsVD) { 832 andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg); 833 } else { 834 assert((opcode == Op_NegVD),"opcode should be Op_NegD"); 835 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg); 836 } 837 } 838 839 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) { 840 if (opcode == Op_AbsVD) { 841 vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg); 842 } else { 843 assert((opcode == Op_NegVD),"opcode should be Op_NegD"); 844 vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg); 845 } 846 } 847 848 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) { 849 if (dst != src) { 850 movdqu(dst, src); 851 } 852 if (opcode == Op_AbsVF) { 853 andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg); 854 } else { 855 assert((opcode == Op_NegVF),"opcode should be Op_NegF"); 856 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg); 857 } 858 } 859 860 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) { 861 if (opcode == Op_AbsVF) { 862 vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg); 863 } else { 864 assert((opcode == Op_NegVF),"opcode should be Op_NegF"); 865 vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg); 866 } 867 } 868 869 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) { 870 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity"); 871 assert(tmp == xnoreg || elem_bt == T_LONG, "unused"); 872 873 if (opcode == Op_MinV) { 874 if (elem_bt == T_BYTE) { 875 pminsb(dst, src); 876 } else if (elem_bt == T_SHORT) { 877 pminsw(dst, src); 878 } else if (elem_bt == T_INT) { 879 pminsd(dst, src); 880 } else { 881 assert(elem_bt == T_LONG, "required"); 882 assert(tmp == xmm0, "required"); 883 assert_different_registers(dst, src, tmp); 884 movdqu(xmm0, dst); 885 pcmpgtq(xmm0, src); 886 blendvpd(dst, src); // xmm0 as mask 887 } 888 } else { // opcode == Op_MaxV 889 if (elem_bt == T_BYTE) { 890 pmaxsb(dst, src); 891 } else if (elem_bt == T_SHORT) { 892 pmaxsw(dst, src); 893 } else if (elem_bt == T_INT) { 894 pmaxsd(dst, src); 895 } else { 896 assert(elem_bt == T_LONG, "required"); 897 assert(tmp == xmm0, "required"); 898 assert_different_registers(dst, src, tmp); 899 movdqu(xmm0, src); 900 pcmpgtq(xmm0, dst); 901 blendvpd(dst, src); // xmm0 as mask 902 } 903 } 904 } 905 906 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt, 907 XMMRegister dst, XMMRegister src1, XMMRegister src2, 908 int vlen_enc) { 909 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity"); 910 911 if (opcode == Op_MinV) { 912 if (elem_bt == T_BYTE) { 913 vpminsb(dst, src1, src2, vlen_enc); 914 } else if (elem_bt == T_SHORT) { 915 vpminsw(dst, src1, src2, vlen_enc); 916 } else if (elem_bt == T_INT) { 917 vpminsd(dst, src1, src2, vlen_enc); 918 } else { 919 assert(elem_bt == T_LONG, "required"); 920 if (UseAVX > 2 
&& (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) { 921 vpminsq(dst, src1, src2, vlen_enc); 922 } else { 923 assert_different_registers(dst, src1, src2); 924 vpcmpgtq(dst, src1, src2, vlen_enc); 925 vblendvpd(dst, src1, src2, dst, vlen_enc); 926 } 927 } 928 } else { // opcode == Op_MaxV 929 if (elem_bt == T_BYTE) { 930 vpmaxsb(dst, src1, src2, vlen_enc); 931 } else if (elem_bt == T_SHORT) { 932 vpmaxsw(dst, src1, src2, vlen_enc); 933 } else if (elem_bt == T_INT) { 934 vpmaxsd(dst, src1, src2, vlen_enc); 935 } else { 936 assert(elem_bt == T_LONG, "required"); 937 if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) { 938 vpmaxsq(dst, src1, src2, vlen_enc); 939 } else { 940 assert_different_registers(dst, src1, src2); 941 vpcmpgtq(dst, src1, src2, vlen_enc); 942 vblendvpd(dst, src2, src1, dst, vlen_enc); 943 } 944 } 945 } 946 } 947 948 // Float/Double min max 949 950 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt, 951 XMMRegister dst, XMMRegister a, XMMRegister b, 952 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 953 int vlen_enc) { 954 assert(UseAVX > 0, "required"); 955 assert(opcode == Op_MinV || opcode == Op_MinReductionV || 956 opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity"); 957 assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity"); 958 assert_different_registers(a, tmp, atmp, btmp); 959 assert_different_registers(b, tmp, atmp, btmp); 960 961 bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV); 962 bool is_double_word = is_double_word_type(elem_bt); 963 964 /* Note on 'non-obvious' assembly sequence: 965 * 966 * While there are vminps/vmaxps instructions, there are two important differences between hardware 967 * and Java on how they handle floats: 968 * a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal) 969 * b. NaN is not necesarily propagated (vminps/vmaxps will return second parameter when either input is NaN) 970 * 971 * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing: 972 * a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps) 973 * (only useful when signs differ, noop otherwise) 974 * b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q]) 975 976 * Following pseudo code describes the algorithm for max[FD] (Min algorithm is on similar lines): 977 * btmp = (b < +0.0) ? a : b 978 * atmp = (b < +0.0) ? b : a 979 * Tmp = Max_Float(atmp , btmp) 980 * Res = (atmp == NaN) ? 
atmp : Tmp 981 */ 982 983 void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister); 984 void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int); 985 void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int); 986 XMMRegister mask; 987 988 if (!is_double_word && is_min) { 989 mask = a; 990 vblend = &MacroAssembler::vblendvps; 991 vmaxmin = &MacroAssembler::vminps; 992 vcmp = &MacroAssembler::vcmpps; 993 } else if (!is_double_word && !is_min) { 994 mask = b; 995 vblend = &MacroAssembler::vblendvps; 996 vmaxmin = &MacroAssembler::vmaxps; 997 vcmp = &MacroAssembler::vcmpps; 998 } else if (is_double_word && is_min) { 999 mask = a; 1000 vblend = &MacroAssembler::vblendvpd; 1001 vmaxmin = &MacroAssembler::vminpd; 1002 vcmp = &MacroAssembler::vcmppd; 1003 } else { 1004 assert(is_double_word && !is_min, "sanity"); 1005 mask = b; 1006 vblend = &MacroAssembler::vblendvpd; 1007 vmaxmin = &MacroAssembler::vmaxpd; 1008 vcmp = &MacroAssembler::vcmppd; 1009 } 1010 1011 // Make sure EnableX86ECoreOpts isn't disabled on register overlaps 1012 XMMRegister maxmin, scratch; 1013 if (dst == btmp) { 1014 maxmin = btmp; 1015 scratch = tmp; 1016 } else { 1017 maxmin = tmp; 1018 scratch = btmp; 1019 } 1020 1021 bool precompute_mask = EnableX86ECoreOpts && UseAVX>1; 1022 if (precompute_mask && !is_double_word) { 1023 vpsrad(tmp, mask, 32, vlen_enc); 1024 mask = tmp; 1025 } else if (precompute_mask && is_double_word) { 1026 vpxor(tmp, tmp, tmp, vlen_enc); 1027 vpcmpgtq(tmp, tmp, mask, vlen_enc); 1028 mask = tmp; 1029 } 1030 1031 (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp); 1032 (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp); 1033 (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc); 1034 (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1035 (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch); 1036 } 1037 1038 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt, 1039 XMMRegister dst, XMMRegister a, XMMRegister b, 1040 KRegister ktmp, XMMRegister atmp, XMMRegister btmp, 1041 int vlen_enc) { 1042 assert(UseAVX > 2, "required"); 1043 assert(opcode == Op_MinV || opcode == Op_MinReductionV || 1044 opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity"); 1045 assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity"); 1046 assert_different_registers(dst, a, atmp, btmp); 1047 assert_different_registers(dst, b, atmp, btmp); 1048 1049 bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV); 1050 bool is_double_word = is_double_word_type(elem_bt); 1051 bool merge = true; 1052 1053 if (!is_double_word && is_min) { 1054 evpmovd2m(ktmp, a, vlen_enc); 1055 evblendmps(atmp, ktmp, a, b, merge, vlen_enc); 1056 evblendmps(btmp, ktmp, b, a, merge, vlen_enc); 1057 vminps(dst, atmp, btmp, vlen_enc); 1058 evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1059 evmovdqul(dst, ktmp, atmp, merge, vlen_enc); 1060 } else if (!is_double_word && !is_min) { 1061 evpmovd2m(ktmp, b, vlen_enc); 1062 evblendmps(atmp, ktmp, a, b, merge, vlen_enc); 1063 evblendmps(btmp, ktmp, b, a, merge, vlen_enc); 1064 vmaxps(dst, atmp, btmp, vlen_enc); 1065 evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1066 evmovdqul(dst, ktmp, atmp, merge, vlen_enc); 1067 } else if (is_double_word && is_min) { 1068 evpmovq2m(ktmp, a, vlen_enc); 1069 evblendmpd(atmp, ktmp, a, b, merge, vlen_enc); 1070 evblendmpd(btmp, ktmp, b, a, merge, vlen_enc); 1071 vminpd(dst, atmp, 
btmp, vlen_enc); 1072 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1073 evmovdquq(dst, ktmp, atmp, merge, vlen_enc); 1074 } else { 1075 assert(is_double_word && !is_min, "sanity"); 1076 evpmovq2m(ktmp, b, vlen_enc); 1077 evblendmpd(atmp, ktmp, a, b, merge, vlen_enc); 1078 evblendmpd(btmp, ktmp, b, a, merge, vlen_enc); 1079 vmaxpd(dst, atmp, btmp, vlen_enc); 1080 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1081 evmovdquq(dst, ktmp, atmp, merge, vlen_enc); 1082 } 1083 } 1084 1085 // Float/Double signum 1086 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) { 1087 assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity"); 1088 1089 Label DONE_LABEL; 1090 1091 if (opcode == Op_SignumF) { 1092 assert(UseSSE > 0, "required"); 1093 ucomiss(dst, zero); 1094 jcc(Assembler::equal, DONE_LABEL); // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument 1095 jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN 1096 movflt(dst, one); 1097 jcc(Assembler::above, DONE_LABEL); 1098 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg); 1099 } else if (opcode == Op_SignumD) { 1100 assert(UseSSE > 1, "required"); 1101 ucomisd(dst, zero); 1102 jcc(Assembler::equal, DONE_LABEL); // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument 1103 jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN 1104 movdbl(dst, one); 1105 jcc(Assembler::above, DONE_LABEL); 1106 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg); 1107 } 1108 1109 bind(DONE_LABEL); 1110 } 1111 1112 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) { 1113 if (sign) { 1114 pmovsxbw(dst, src); 1115 } else { 1116 pmovzxbw(dst, src); 1117 } 1118 } 1119 1120 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1121 if (sign) { 1122 vpmovsxbw(dst, src, vector_len); 1123 } else { 1124 vpmovzxbw(dst, src, vector_len); 1125 } 1126 } 1127 1128 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1129 if (sign) { 1130 vpmovsxbd(dst, src, vector_len); 1131 } else { 1132 vpmovzxbd(dst, src, vector_len); 1133 } 1134 } 1135 1136 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1137 if (sign) { 1138 vpmovsxwd(dst, src, vector_len); 1139 } else { 1140 vpmovzxwd(dst, src, vector_len); 1141 } 1142 } 1143 1144 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src, 1145 int shift, int vector_len) { 1146 if (opcode == Op_RotateLeftV) { 1147 if (etype == T_INT) { 1148 evprold(dst, src, shift, vector_len); 1149 } else { 1150 assert(etype == T_LONG, "expected type T_LONG"); 1151 evprolq(dst, src, shift, vector_len); 1152 } 1153 } else { 1154 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV"); 1155 if (etype == T_INT) { 1156 evprord(dst, src, shift, vector_len); 1157 } else { 1158 assert(etype == T_LONG, "expected type T_LONG"); 1159 evprorq(dst, src, shift, vector_len); 1160 } 1161 } 1162 } 1163 1164 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src, 1165 XMMRegister shift, int vector_len) { 1166 if (opcode == Op_RotateLeftV) { 1167 if (etype == T_INT) { 1168 evprolvd(dst, src, shift, vector_len); 1169 } else { 1170 assert(etype == T_LONG, "expected 
type T_LONG"); 1171 evprolvq(dst, src, shift, vector_len); 1172 } 1173 } else { 1174 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV"); 1175 if (etype == T_INT) { 1176 evprorvd(dst, src, shift, vector_len); 1177 } else { 1178 assert(etype == T_LONG, "expected type T_LONG"); 1179 evprorvq(dst, src, shift, vector_len); 1180 } 1181 } 1182 } 1183 1184 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) { 1185 if (opcode == Op_RShiftVI) { 1186 psrad(dst, shift); 1187 } else if (opcode == Op_LShiftVI) { 1188 pslld(dst, shift); 1189 } else { 1190 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI"); 1191 psrld(dst, shift); 1192 } 1193 } 1194 1195 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) { 1196 switch (opcode) { 1197 case Op_RShiftVI: psrad(dst, shift); break; 1198 case Op_LShiftVI: pslld(dst, shift); break; 1199 case Op_URShiftVI: psrld(dst, shift); break; 1200 1201 default: assert(false, "%s", NodeClassNames[opcode]); 1202 } 1203 } 1204 1205 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 1206 if (opcode == Op_RShiftVI) { 1207 vpsrad(dst, nds, shift, vector_len); 1208 } else if (opcode == Op_LShiftVI) { 1209 vpslld(dst, nds, shift, vector_len); 1210 } else { 1211 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI"); 1212 vpsrld(dst, nds, shift, vector_len); 1213 } 1214 } 1215 1216 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1217 switch (opcode) { 1218 case Op_RShiftVI: vpsrad(dst, src, shift, vlen_enc); break; 1219 case Op_LShiftVI: vpslld(dst, src, shift, vlen_enc); break; 1220 case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break; 1221 1222 default: assert(false, "%s", NodeClassNames[opcode]); 1223 } 1224 } 1225 1226 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) { 1227 switch (opcode) { 1228 case Op_RShiftVB: // fall-through 1229 case Op_RShiftVS: psraw(dst, shift); break; 1230 1231 case Op_LShiftVB: // fall-through 1232 case Op_LShiftVS: psllw(dst, shift); break; 1233 1234 case Op_URShiftVS: // fall-through 1235 case Op_URShiftVB: psrlw(dst, shift); break; 1236 1237 default: assert(false, "%s", NodeClassNames[opcode]); 1238 } 1239 } 1240 1241 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1242 switch (opcode) { 1243 case Op_RShiftVB: // fall-through 1244 case Op_RShiftVS: vpsraw(dst, src, shift, vlen_enc); break; 1245 1246 case Op_LShiftVB: // fall-through 1247 case Op_LShiftVS: vpsllw(dst, src, shift, vlen_enc); break; 1248 1249 case Op_URShiftVS: // fall-through 1250 case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break; 1251 1252 default: assert(false, "%s", NodeClassNames[opcode]); 1253 } 1254 } 1255 1256 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) { 1257 switch (opcode) { 1258 case Op_RShiftVL: psrlq(dst, shift); break; // using srl to implement sra on pre-avs512 systems 1259 case Op_LShiftVL: psllq(dst, shift); break; 1260 case Op_URShiftVL: psrlq(dst, shift); break; 1261 1262 default: assert(false, "%s", NodeClassNames[opcode]); 1263 } 1264 } 1265 1266 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) { 1267 if (opcode == Op_RShiftVL) { 1268 psrlq(dst, shift); // using srl to implement sra on pre-avs512 systems 1269 } else if (opcode == Op_LShiftVL) { 1270 psllq(dst, shift); 1271 
} else { 1272 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL"); 1273 psrlq(dst, shift); 1274 } 1275 } 1276 1277 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1278 switch (opcode) { 1279 case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break; 1280 case Op_LShiftVL: vpsllq(dst, src, shift, vlen_enc); break; 1281 case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break; 1282 1283 default: assert(false, "%s", NodeClassNames[opcode]); 1284 } 1285 } 1286 1287 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 1288 if (opcode == Op_RShiftVL) { 1289 evpsraq(dst, nds, shift, vector_len); 1290 } else if (opcode == Op_LShiftVL) { 1291 vpsllq(dst, nds, shift, vector_len); 1292 } else { 1293 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL"); 1294 vpsrlq(dst, nds, shift, vector_len); 1295 } 1296 } 1297 1298 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1299 switch (opcode) { 1300 case Op_RShiftVB: // fall-through 1301 case Op_RShiftVS: // fall-through 1302 case Op_RShiftVI: vpsravd(dst, src, shift, vlen_enc); break; 1303 1304 case Op_LShiftVB: // fall-through 1305 case Op_LShiftVS: // fall-through 1306 case Op_LShiftVI: vpsllvd(dst, src, shift, vlen_enc); break; 1307 1308 case Op_URShiftVB: // fall-through 1309 case Op_URShiftVS: // fall-through 1310 case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break; 1311 1312 default: assert(false, "%s", NodeClassNames[opcode]); 1313 } 1314 } 1315 1316 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1317 switch (opcode) { 1318 case Op_RShiftVB: // fall-through 1319 case Op_RShiftVS: evpsravw(dst, src, shift, vlen_enc); break; 1320 1321 case Op_LShiftVB: // fall-through 1322 case Op_LShiftVS: evpsllvw(dst, src, shift, vlen_enc); break; 1323 1324 case Op_URShiftVB: // fall-through 1325 case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break; 1326 1327 default: assert(false, "%s", NodeClassNames[opcode]); 1328 } 1329 } 1330 1331 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) { 1332 assert(UseAVX >= 2, "required"); 1333 switch (opcode) { 1334 case Op_RShiftVL: { 1335 if (UseAVX > 2) { 1336 assert(tmp == xnoreg, "not used"); 1337 if (!VM_Version::supports_avx512vl()) { 1338 vlen_enc = Assembler::AVX_512bit; 1339 } 1340 evpsravq(dst, src, shift, vlen_enc); 1341 } else { 1342 vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask())); 1343 vpsrlvq(dst, src, shift, vlen_enc); 1344 vpsrlvq(tmp, tmp, shift, vlen_enc); 1345 vpxor(dst, dst, tmp, vlen_enc); 1346 vpsubq(dst, dst, tmp, vlen_enc); 1347 } 1348 break; 1349 } 1350 case Op_LShiftVL: { 1351 assert(tmp == xnoreg, "not used"); 1352 vpsllvq(dst, src, shift, vlen_enc); 1353 break; 1354 } 1355 case Op_URShiftVL: { 1356 assert(tmp == xnoreg, "not used"); 1357 vpsrlvq(dst, src, shift, vlen_enc); 1358 break; 1359 } 1360 default: assert(false, "%s", NodeClassNames[opcode]); 1361 } 1362 } 1363 1364 // Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst 1365 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) { 1366 assert(opcode == Op_LShiftVB || 1367 opcode == Op_RShiftVB || 1368 opcode == Op_URShiftVB, "%s", 
NodeClassNames[opcode]); 1369 bool sign = (opcode != Op_URShiftVB); 1370 assert(vector_len == 0, "required"); 1371 vextendbd(sign, dst, src, 1); 1372 vpmovzxbd(vtmp, shift, 1); 1373 varshiftd(opcode, dst, dst, vtmp, 1); 1374 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg); 1375 vextracti128_high(vtmp, dst); 1376 vpackusdw(dst, dst, vtmp, 0); 1377 } 1378 1379 // Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst 1380 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) { 1381 assert(opcode == Op_LShiftVB || 1382 opcode == Op_RShiftVB || 1383 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]); 1384 bool sign = (opcode != Op_URShiftVB); 1385 int ext_vector_len = vector_len + 1; 1386 vextendbw(sign, dst, src, ext_vector_len); 1387 vpmovzxbw(vtmp, shift, ext_vector_len); 1388 varshiftw(opcode, dst, dst, vtmp, ext_vector_len); 1389 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg); 1390 if (vector_len == 0) { 1391 vextracti128_high(vtmp, dst); 1392 vpackuswb(dst, dst, vtmp, vector_len); 1393 } else { 1394 vextracti64x4_high(vtmp, dst); 1395 vpackuswb(dst, dst, vtmp, vector_len); 1396 vpermq(dst, dst, 0xD8, vector_len); 1397 } 1398 } 1399 1400 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) { 1401 switch(typ) { 1402 case T_BYTE: 1403 pinsrb(dst, val, idx); 1404 break; 1405 case T_SHORT: 1406 pinsrw(dst, val, idx); 1407 break; 1408 case T_INT: 1409 pinsrd(dst, val, idx); 1410 break; 1411 case T_LONG: 1412 pinsrq(dst, val, idx); 1413 break; 1414 default: 1415 assert(false,"Should not reach here."); 1416 break; 1417 } 1418 } 1419 1420 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) { 1421 switch(typ) { 1422 case T_BYTE: 1423 vpinsrb(dst, src, val, idx); 1424 break; 1425 case T_SHORT: 1426 vpinsrw(dst, src, val, idx); 1427 break; 1428 case T_INT: 1429 vpinsrd(dst, src, val, idx); 1430 break; 1431 case T_LONG: 1432 vpinsrq(dst, src, val, idx); 1433 break; 1434 default: 1435 assert(false,"Should not reach here."); 1436 break; 1437 } 1438 } 1439 1440 #ifdef _LP64 1441 void C2_MacroAssembler::vgather8b_masked_offset(BasicType elem_bt, 1442 XMMRegister dst, Register base, 1443 Register idx_base, 1444 Register offset, Register mask, 1445 Register mask_idx, Register rtmp, 1446 int vlen_enc) { 1447 vpxor(dst, dst, dst, vlen_enc); 1448 if (elem_bt == T_SHORT) { 1449 for (int i = 0; i < 4; i++) { 1450 // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0 1451 Label skip_load; 1452 btq(mask, mask_idx); 1453 jccb(Assembler::carryClear, skip_load); 1454 movl(rtmp, Address(idx_base, i * 4)); 1455 if (offset != noreg) { 1456 addl(rtmp, offset); 1457 } 1458 pinsrw(dst, Address(base, rtmp, Address::times_2), i); 1459 bind(skip_load); 1460 incq(mask_idx); 1461 } 1462 } else { 1463 assert(elem_bt == T_BYTE, ""); 1464 for (int i = 0; i < 8; i++) { 1465 // dst[i] = mask[i] ? 
src[offset + idx_base[i]] : 0 1466 Label skip_load; 1467 btq(mask, mask_idx); 1468 jccb(Assembler::carryClear, skip_load); 1469 movl(rtmp, Address(idx_base, i * 4)); 1470 if (offset != noreg) { 1471 addl(rtmp, offset); 1472 } 1473 pinsrb(dst, Address(base, rtmp), i); 1474 bind(skip_load); 1475 incq(mask_idx); 1476 } 1477 } 1478 } 1479 #endif // _LP64 1480 1481 void C2_MacroAssembler::vgather8b_offset(BasicType elem_bt, XMMRegister dst, 1482 Register base, Register idx_base, 1483 Register offset, Register rtmp, 1484 int vlen_enc) { 1485 vpxor(dst, dst, dst, vlen_enc); 1486 if (elem_bt == T_SHORT) { 1487 for (int i = 0; i < 4; i++) { 1488 // dst[i] = src[offset + idx_base[i]] 1489 movl(rtmp, Address(idx_base, i * 4)); 1490 if (offset != noreg) { 1491 addl(rtmp, offset); 1492 } 1493 pinsrw(dst, Address(base, rtmp, Address::times_2), i); 1494 } 1495 } else { 1496 assert(elem_bt == T_BYTE, ""); 1497 for (int i = 0; i < 8; i++) { 1498 // dst[i] = src[offset + idx_base[i]] 1499 movl(rtmp, Address(idx_base, i * 4)); 1500 if (offset != noreg) { 1501 addl(rtmp, offset); 1502 } 1503 pinsrb(dst, Address(base, rtmp), i); 1504 } 1505 } 1506 } 1507 1508 /* 1509 * Gather using hybrid algorithm, first partially unroll scalar loop 1510 * to accumulate values from gather indices into a quad-word(64bit) slice. 1511 * A slice may hold 8 bytes or 4 short values. This is followed by a vector 1512 * permutation to place the slice into appropriate vector lane 1513 * locations in destination vector. Following pseudo code describes the 1514 * algorithm in detail: 1515 * 1516 * DST_VEC = ZERO_VEC 1517 * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..} 1518 * TWO_VEC = {2, 2, 2, 2, 2, 2, 2, 2, 2..} 1519 * FOREACH_ITER: 1520 * TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES 1521 * TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX 1522 * DST_VEC = DST_VEC OR TEMP_PERM_VEC 1523 * PERM_INDEX = PERM_INDEX - TWO_VEC 1524 * 1525 * With each iteration, doubleword permute indices (0,1) corresponding 1526 * to gathered quadword gets right shifted by two lane positions. 1527 * 1528 */ 1529 void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst, 1530 Register base, Register idx_base, 1531 Register offset, Register mask, 1532 XMMRegister xtmp1, XMMRegister xtmp2, 1533 XMMRegister temp_dst, Register rtmp, 1534 Register mask_idx, Register length, 1535 int vector_len, int vlen_enc) { 1536 Label GATHER8_LOOP; 1537 assert(is_subword_type(elem_ty), ""); 1538 movl(length, vector_len); 1539 vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...} 1540 vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...} 1541 vallones(xtmp2, vlen_enc); 1542 vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc); 1543 vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...} 1544 load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...} 1545 1546 bind(GATHER8_LOOP); 1547 // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES 1548 if (mask == noreg) { 1549 vgather8b_offset(elem_ty, temp_dst, base, idx_base, offset, rtmp, vlen_enc); 1550 } else { 1551 LP64_ONLY(vgather8b_masked_offset(elem_ty, temp_dst, base, idx_base, offset, mask, mask_idx, rtmp, vlen_enc)); 1552 } 1553 // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1) 1554 vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? 
vlen_enc : Assembler::AVX_256bit); 1555 // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2) 1556 vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc); 1557 // DST_VEC = DST_VEC OR TEMP_PERM_VEC 1558 vpor(dst, dst, temp_dst, vlen_enc); 1559 addptr(idx_base, 32 >> (type2aelembytes(elem_ty) - 1)); 1560 subl(length, 8 >> (type2aelembytes(elem_ty) - 1)); 1561 jcc(Assembler::notEqual, GATHER8_LOOP); 1562 } 1563 1564 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) { 1565 switch(typ) { 1566 case T_INT: 1567 vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len); 1568 break; 1569 case T_FLOAT: 1570 vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len); 1571 break; 1572 case T_LONG: 1573 vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len); 1574 break; 1575 case T_DOUBLE: 1576 vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len); 1577 break; 1578 default: 1579 assert(false,"Should not reach here."); 1580 break; 1581 } 1582 } 1583 1584 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) { 1585 switch(typ) { 1586 case T_INT: 1587 evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len); 1588 break; 1589 case T_FLOAT: 1590 evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len); 1591 break; 1592 case T_LONG: 1593 evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len); 1594 break; 1595 case T_DOUBLE: 1596 evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len); 1597 break; 1598 default: 1599 assert(false,"Should not reach here."); 1600 break; 1601 } 1602 } 1603 1604 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) { 1605 switch(typ) { 1606 case T_INT: 1607 evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len); 1608 break; 1609 case T_FLOAT: 1610 evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len); 1611 break; 1612 case T_LONG: 1613 evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len); 1614 break; 1615 case T_DOUBLE: 1616 evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len); 1617 break; 1618 default: 1619 assert(false,"Should not reach here."); 1620 break; 1621 } 1622 } 1623 1624 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) { 1625 if (vlen_in_bytes <= 16) { 1626 pxor (dst, dst); 1627 psubb(dst, src); 1628 switch (elem_bt) { 1629 case T_BYTE: /* nothing to do */ break; 1630 case T_SHORT: pmovsxbw(dst, dst); break; 1631 case T_INT: pmovsxbd(dst, dst); break; 1632 case T_FLOAT: pmovsxbd(dst, dst); break; 1633 case T_LONG: pmovsxbq(dst, dst); break; 1634 case T_DOUBLE: pmovsxbq(dst, dst); break; 1635 1636 default: assert(false, "%s", type2name(elem_bt)); 1637 } 1638 } else { 1639 assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, ""); 1640 int vlen_enc = vector_length_encoding(vlen_in_bytes); 1641 1642 vpxor (dst, dst, dst, vlen_enc); 1643 vpsubb(dst, dst, src, is_legacy ? 
AVX_256bit : vlen_enc); 1644 1645 switch (elem_bt) { 1646 case T_BYTE: /* nothing to do */ break; 1647 case T_SHORT: vpmovsxbw(dst, dst, vlen_enc); break; 1648 case T_INT: vpmovsxbd(dst, dst, vlen_enc); break; 1649 case T_FLOAT: vpmovsxbd(dst, dst, vlen_enc); break; 1650 case T_LONG: vpmovsxbq(dst, dst, vlen_enc); break; 1651 case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break; 1652 1653 default: assert(false, "%s", type2name(elem_bt)); 1654 } 1655 } 1656 } 1657 1658 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) { 1659 if (novlbwdq) { 1660 vpmovsxbd(xtmp, src, vlen_enc); 1661 evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()), 1662 Assembler::eq, true, vlen_enc, noreg); 1663 } else { 1664 vpxor(xtmp, xtmp, xtmp, vlen_enc); 1665 vpsubb(xtmp, xtmp, src, vlen_enc); 1666 evpmovb2m(dst, xtmp, vlen_enc); 1667 } 1668 } 1669 1670 void C2_MacroAssembler::load_vector(XMMRegister dst, Address src, int vlen_in_bytes) { 1671 switch (vlen_in_bytes) { 1672 case 4: movdl(dst, src); break; 1673 case 8: movq(dst, src); break; 1674 case 16: movdqu(dst, src); break; 1675 case 32: vmovdqu(dst, src); break; 1676 case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break; 1677 default: ShouldNotReachHere(); 1678 } 1679 } 1680 1681 void C2_MacroAssembler::load_vector(XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) { 1682 assert(rscratch != noreg || always_reachable(src), "missing"); 1683 1684 if (reachable(src)) { 1685 load_vector(dst, as_Address(src), vlen_in_bytes); 1686 } else { 1687 lea(rscratch, src); 1688 load_vector(dst, Address(rscratch, 0), vlen_in_bytes); 1689 } 1690 } 1691 1692 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) { 1693 int vlen_enc = vector_length_encoding(vlen); 1694 if (VM_Version::supports_avx()) { 1695 if (bt == T_LONG) { 1696 if (VM_Version::supports_avx2()) { 1697 vpbroadcastq(dst, src, vlen_enc); 1698 } else { 1699 vmovddup(dst, src, vlen_enc); 1700 } 1701 } else if (bt == T_DOUBLE) { 1702 if (vlen_enc != Assembler::AVX_128bit) { 1703 vbroadcastsd(dst, src, vlen_enc, noreg); 1704 } else { 1705 vmovddup(dst, src, vlen_enc); 1706 } 1707 } else { 1708 if (VM_Version::supports_avx2() && is_integral_type(bt)) { 1709 vpbroadcastd(dst, src, vlen_enc); 1710 } else { 1711 vbroadcastss(dst, src, vlen_enc); 1712 } 1713 } 1714 } else if (VM_Version::supports_sse3()) { 1715 movddup(dst, src); 1716 } else { 1717 movq(dst, src); 1718 if (vlen == 16) { 1719 punpcklqdq(dst, dst); 1720 } 1721 } 1722 } 1723 1724 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) { 1725 // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64. 1726 int offset = exact_log2(type2aelembytes(bt)) << 6; 1727 if (is_floating_point_type(bt)) { 1728 offset += 128; 1729 } 1730 ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset); 1731 load_vector(dst, addr, vlen_in_bytes); 1732 } 1733 1734 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles. 
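//
// In scalar terms, the integer reduce* helpers below (reduceB/S/I/L) compute
// roughly the following (illustrative sketch only; 'scalar_op' is a stand-in
// for the per-opcode operation -- add, mul, min, max, and, or, xor -- and is
// not a routine in this file):
//
//   int reduce_lanes(int opcode, int src1, const int* lane, int n) {
//     int acc = lane[0];
//     for (int i = 1; i < n; i++) {
//       acc = scalar_op(opcode, acc, lane[i]);   // fold all vector lanes
//     }
//     return scalar_op(opcode, src1, acc);       // then fold in the scalar input
//   }
//
// The emitted code reaches the same result in O(log n) vector steps by
// repeatedly folding the upper half of the vector onto the lower half
// (vextracti*_high / pshufd) using reduce_operation_128/256 below.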
1735 1736 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 1737 int vector_len = Assembler::AVX_128bit; 1738 1739 switch (opcode) { 1740 case Op_AndReductionV: pand(dst, src); break; 1741 case Op_OrReductionV: por (dst, src); break; 1742 case Op_XorReductionV: pxor(dst, src); break; 1743 case Op_MinReductionV: 1744 switch (typ) { 1745 case T_BYTE: pminsb(dst, src); break; 1746 case T_SHORT: pminsw(dst, src); break; 1747 case T_INT: pminsd(dst, src); break; 1748 case T_LONG: assert(UseAVX > 2, "required"); 1749 vpminsq(dst, dst, src, Assembler::AVX_128bit); break; 1750 default: assert(false, "wrong type"); 1751 } 1752 break; 1753 case Op_MaxReductionV: 1754 switch (typ) { 1755 case T_BYTE: pmaxsb(dst, src); break; 1756 case T_SHORT: pmaxsw(dst, src); break; 1757 case T_INT: pmaxsd(dst, src); break; 1758 case T_LONG: assert(UseAVX > 2, "required"); 1759 vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break; 1760 default: assert(false, "wrong type"); 1761 } 1762 break; 1763 case Op_AddReductionVF: addss(dst, src); break; 1764 case Op_AddReductionVD: addsd(dst, src); break; 1765 case Op_AddReductionVI: 1766 switch (typ) { 1767 case T_BYTE: paddb(dst, src); break; 1768 case T_SHORT: paddw(dst, src); break; 1769 case T_INT: paddd(dst, src); break; 1770 default: assert(false, "wrong type"); 1771 } 1772 break; 1773 case Op_AddReductionVL: paddq(dst, src); break; 1774 case Op_MulReductionVF: mulss(dst, src); break; 1775 case Op_MulReductionVD: mulsd(dst, src); break; 1776 case Op_MulReductionVI: 1777 switch (typ) { 1778 case T_SHORT: pmullw(dst, src); break; 1779 case T_INT: pmulld(dst, src); break; 1780 default: assert(false, "wrong type"); 1781 } 1782 break; 1783 case Op_MulReductionVL: assert(UseAVX > 2, "required"); 1784 evpmullq(dst, dst, src, vector_len); break; 1785 default: assert(false, "wrong opcode"); 1786 } 1787 } 1788 1789 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 1790 int vector_len = Assembler::AVX_256bit; 1791 1792 switch (opcode) { 1793 case Op_AndReductionV: vpand(dst, src1, src2, vector_len); break; 1794 case Op_OrReductionV: vpor (dst, src1, src2, vector_len); break; 1795 case Op_XorReductionV: vpxor(dst, src1, src2, vector_len); break; 1796 case Op_MinReductionV: 1797 switch (typ) { 1798 case T_BYTE: vpminsb(dst, src1, src2, vector_len); break; 1799 case T_SHORT: vpminsw(dst, src1, src2, vector_len); break; 1800 case T_INT: vpminsd(dst, src1, src2, vector_len); break; 1801 case T_LONG: assert(UseAVX > 2, "required"); 1802 vpminsq(dst, src1, src2, vector_len); break; 1803 default: assert(false, "wrong type"); 1804 } 1805 break; 1806 case Op_MaxReductionV: 1807 switch (typ) { 1808 case T_BYTE: vpmaxsb(dst, src1, src2, vector_len); break; 1809 case T_SHORT: vpmaxsw(dst, src1, src2, vector_len); break; 1810 case T_INT: vpmaxsd(dst, src1, src2, vector_len); break; 1811 case T_LONG: assert(UseAVX > 2, "required"); 1812 vpmaxsq(dst, src1, src2, vector_len); break; 1813 default: assert(false, "wrong type"); 1814 } 1815 break; 1816 case Op_AddReductionVI: 1817 switch (typ) { 1818 case T_BYTE: vpaddb(dst, src1, src2, vector_len); break; 1819 case T_SHORT: vpaddw(dst, src1, src2, vector_len); break; 1820 case T_INT: vpaddd(dst, src1, src2, vector_len); break; 1821 default: assert(false, "wrong type"); 1822 } 1823 break; 1824 case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break; 1825 case Op_MulReductionVI: 1826 switch (typ) { 1827 
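// Note: no T_BYTE case here -- x86 has no packed 8-bit multiply, so byte
// multiply-reductions are widened to shorts first (see mulreduce8B and
// friends below).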
case T_SHORT: vpmullw(dst, src1, src2, vector_len); break; 1828 case T_INT: vpmulld(dst, src1, src2, vector_len); break; 1829 default: assert(false, "wrong type"); 1830 } 1831 break; 1832 case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break; 1833 default: assert(false, "wrong opcode"); 1834 } 1835 } 1836 1837 void C2_MacroAssembler::reduce_fp(int opcode, int vlen, 1838 XMMRegister dst, XMMRegister src, 1839 XMMRegister vtmp1, XMMRegister vtmp2) { 1840 switch (opcode) { 1841 case Op_AddReductionVF: 1842 case Op_MulReductionVF: 1843 reduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 1844 break; 1845 1846 case Op_AddReductionVD: 1847 case Op_MulReductionVD: 1848 reduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 1849 break; 1850 1851 default: assert(false, "wrong opcode"); 1852 } 1853 } 1854 1855 void C2_MacroAssembler::reduceB(int opcode, int vlen, 1856 Register dst, Register src1, XMMRegister src2, 1857 XMMRegister vtmp1, XMMRegister vtmp2) { 1858 switch (vlen) { 1859 case 8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1860 case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1861 case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1862 case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1863 1864 default: assert(false, "wrong vector length"); 1865 } 1866 } 1867 1868 void C2_MacroAssembler::mulreduceB(int opcode, int vlen, 1869 Register dst, Register src1, XMMRegister src2, 1870 XMMRegister vtmp1, XMMRegister vtmp2) { 1871 switch (vlen) { 1872 case 8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1873 case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1874 case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1875 case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1876 1877 default: assert(false, "wrong vector length"); 1878 } 1879 } 1880 1881 void C2_MacroAssembler::reduceS(int opcode, int vlen, 1882 Register dst, Register src1, XMMRegister src2, 1883 XMMRegister vtmp1, XMMRegister vtmp2) { 1884 switch (vlen) { 1885 case 4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1886 case 8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1887 case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1888 case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1889 1890 default: assert(false, "wrong vector length"); 1891 } 1892 } 1893 1894 void C2_MacroAssembler::reduceI(int opcode, int vlen, 1895 Register dst, Register src1, XMMRegister src2, 1896 XMMRegister vtmp1, XMMRegister vtmp2) { 1897 switch (vlen) { 1898 case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1899 case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1900 case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1901 case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1902 1903 default: assert(false, "wrong vector length"); 1904 } 1905 } 1906 1907 #ifdef _LP64 1908 void C2_MacroAssembler::reduceL(int opcode, int vlen, 1909 Register dst, Register src1, XMMRegister src2, 1910 XMMRegister vtmp1, XMMRegister vtmp2) { 1911 switch (vlen) { 1912 case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1913 case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1914 case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1915 1916 default: assert(false, "wrong vector length"); 1917 } 1918 } 1919 #endif // _LP64 1920 1921 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister 
src, XMMRegister vtmp1, XMMRegister vtmp2) { 1922 switch (vlen) { 1923 case 2: 1924 assert(vtmp2 == xnoreg, ""); 1925 reduce2F(opcode, dst, src, vtmp1); 1926 break; 1927 case 4: 1928 assert(vtmp2 == xnoreg, ""); 1929 reduce4F(opcode, dst, src, vtmp1); 1930 break; 1931 case 8: 1932 reduce8F(opcode, dst, src, vtmp1, vtmp2); 1933 break; 1934 case 16: 1935 reduce16F(opcode, dst, src, vtmp1, vtmp2); 1936 break; 1937 default: assert(false, "wrong vector length"); 1938 } 1939 } 1940 1941 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 1942 switch (vlen) { 1943 case 2: 1944 assert(vtmp2 == xnoreg, ""); 1945 reduce2D(opcode, dst, src, vtmp1); 1946 break; 1947 case 4: 1948 reduce4D(opcode, dst, src, vtmp1, vtmp2); 1949 break; 1950 case 8: 1951 reduce8D(opcode, dst, src, vtmp1, vtmp2); 1952 break; 1953 default: assert(false, "wrong vector length"); 1954 } 1955 } 1956 1957 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1958 if (opcode == Op_AddReductionVI) { 1959 if (vtmp1 != src2) { 1960 movdqu(vtmp1, src2); 1961 } 1962 phaddd(vtmp1, vtmp1); 1963 } else { 1964 pshufd(vtmp1, src2, 0x1); 1965 reduce_operation_128(T_INT, opcode, vtmp1, src2); 1966 } 1967 movdl(vtmp2, src1); 1968 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 1969 movdl(dst, vtmp1); 1970 } 1971 1972 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1973 if (opcode == Op_AddReductionVI) { 1974 if (vtmp1 != src2) { 1975 movdqu(vtmp1, src2); 1976 } 1977 phaddd(vtmp1, src2); 1978 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1979 } else { 1980 pshufd(vtmp2, src2, 0xE); 1981 reduce_operation_128(T_INT, opcode, vtmp2, src2); 1982 reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1983 } 1984 } 1985 1986 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1987 if (opcode == Op_AddReductionVI) { 1988 vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit); 1989 vextracti128_high(vtmp2, vtmp1); 1990 vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit); 1991 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1992 } else { 1993 vextracti128_high(vtmp1, src2); 1994 reduce_operation_128(T_INT, opcode, vtmp1, src2); 1995 reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1996 } 1997 } 1998 1999 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2000 vextracti64x4_high(vtmp2, src2); 2001 reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2); 2002 reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2003 } 2004 2005 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2006 pshufd(vtmp2, src2, 0x1); 2007 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2008 movdqu(vtmp1, vtmp2); 2009 psrldq(vtmp1, 2); 2010 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2011 movdqu(vtmp2, vtmp1); 2012 psrldq(vtmp2, 1); 2013 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2014 movdl(vtmp2, src1); 2015 pmovsxbd(vtmp1, vtmp1); 2016 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2017 pextrb(dst, vtmp1, 0x0); 2018 movsbl(dst, dst); 2019 } 2020 2021 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2022 
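// The 16-byte reduction folds the upper 8 bytes onto the lower 8 and then
// defers to the 8-byte reduction. pshufd with control 0xE moves dwords {2,3}
// into the low half, i.e. roughly: vtmp1.lo64 = src2.hi64;
// vtmp1 = vtmp1 OP src2 (element-wise); reduce8B(vtmp1).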
pshufd(vtmp1, src2, 0xE); 2023 reduce_operation_128(T_BYTE, opcode, vtmp1, src2); 2024 reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2025 } 2026 2027 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2028 vextracti128_high(vtmp2, src2); 2029 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2030 reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2031 } 2032 2033 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2034 vextracti64x4_high(vtmp1, src2); 2035 reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2); 2036 reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2037 } 2038 2039 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2040 pmovsxbw(vtmp2, src2); 2041 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2042 } 2043 2044 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2045 if (UseAVX > 1) { 2046 int vector_len = Assembler::AVX_256bit; 2047 vpmovsxbw(vtmp1, src2, vector_len); 2048 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2049 } else { 2050 pmovsxbw(vtmp2, src2); 2051 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2052 pshufd(vtmp2, src2, 0x1); 2053 pmovsxbw(vtmp2, src2); 2054 reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2055 } 2056 } 2057 2058 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2059 if (UseAVX > 2 && VM_Version::supports_avx512bw()) { 2060 int vector_len = Assembler::AVX_512bit; 2061 vpmovsxbw(vtmp1, src2, vector_len); 2062 reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2063 } else { 2064 assert(UseAVX >= 2,"Should not reach here."); 2065 mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); 2066 vextracti128_high(vtmp2, src2); 2067 mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2068 } 2069 } 2070 2071 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2072 mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); 2073 vextracti64x4_high(vtmp2, src2); 2074 mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2075 } 2076 2077 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2078 if (opcode == Op_AddReductionVI) { 2079 if (vtmp1 != src2) { 2080 movdqu(vtmp1, src2); 2081 } 2082 phaddw(vtmp1, vtmp1); 2083 phaddw(vtmp1, vtmp1); 2084 } else { 2085 pshufd(vtmp2, src2, 0x1); 2086 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2087 movdqu(vtmp1, vtmp2); 2088 psrldq(vtmp1, 2); 2089 reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2); 2090 } 2091 movdl(vtmp2, src1); 2092 pmovsxwd(vtmp1, vtmp1); 2093 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2094 pextrw(dst, vtmp1, 0x0); 2095 movswl(dst, dst); 2096 } 2097 2098 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2099 if (opcode == Op_AddReductionVI) { 2100 if (vtmp1 != src2) { 2101 movdqu(vtmp1, src2); 2102 } 2103 phaddw(vtmp1, src2); 2104 } else { 2105 pshufd(vtmp1, src2, 0xE); 2106 reduce_operation_128(T_SHORT, opcode, vtmp1, src2); 2107 } 2108 reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2109 } 2110 2111 void 
C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2112 if (opcode == Op_AddReductionVI) { 2113 int vector_len = Assembler::AVX_256bit; 2114 vphaddw(vtmp2, src2, src2, vector_len); 2115 vpermq(vtmp2, vtmp2, 0xD8, vector_len); 2116 } else { 2117 vextracti128_high(vtmp2, src2); 2118 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2119 } 2120 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2121 } 2122 2123 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2124 int vector_len = Assembler::AVX_256bit; 2125 vextracti64x4_high(vtmp1, src2); 2126 reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2); 2127 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2128 } 2129 2130 #ifdef _LP64 2131 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2132 pshufd(vtmp2, src2, 0xE); 2133 reduce_operation_128(T_LONG, opcode, vtmp2, src2); 2134 movdq(vtmp1, src1); 2135 reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2); 2136 movdq(dst, vtmp1); 2137 } 2138 2139 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2140 vextracti128_high(vtmp1, src2); 2141 reduce_operation_128(T_LONG, opcode, vtmp1, src2); 2142 reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2143 } 2144 2145 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2146 vextracti64x4_high(vtmp2, src2); 2147 reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2); 2148 reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2149 } 2150 2151 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) { 2152 mov64(temp, -1L); 2153 bzhiq(temp, temp, len); 2154 kmovql(dst, temp); 2155 } 2156 #endif // _LP64 2157 2158 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2159 reduce_operation_128(T_FLOAT, opcode, dst, src); 2160 pshufd(vtmp, src, 0x1); 2161 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2162 } 2163 2164 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2165 reduce2F(opcode, dst, src, vtmp); 2166 pshufd(vtmp, src, 0x2); 2167 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2168 pshufd(vtmp, src, 0x3); 2169 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2170 } 2171 2172 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2173 reduce4F(opcode, dst, src, vtmp2); 2174 vextractf128_high(vtmp2, src); 2175 reduce4F(opcode, dst, vtmp2, vtmp1); 2176 } 2177 2178 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2179 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2180 vextracti64x4_high(vtmp1, src); 2181 reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2); 2182 } 2183 2184 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2185 reduce_operation_128(T_DOUBLE, opcode, dst, src); 2186 pshufd(vtmp, src, 0xE); 2187 reduce_operation_128(T_DOUBLE, opcode, dst, vtmp); 2188 } 2189 2190 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2191 reduce2D(opcode, dst, src, vtmp2); 2192 vextractf128_high(vtmp2, src); 2193 
reduce2D(opcode, dst, vtmp2, vtmp1); 2194 } 2195 2196 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2197 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2198 vextracti64x4_high(vtmp1, src); 2199 reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2); 2200 } 2201 2202 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) { 2203 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2204 } 2205 2206 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) { 2207 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2208 } 2209 2210 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask, 2211 int vec_enc) { 2212 switch(elem_bt) { 2213 case T_INT: 2214 case T_FLOAT: 2215 vmaskmovps(dst, src, mask, vec_enc); 2216 break; 2217 case T_LONG: 2218 case T_DOUBLE: 2219 vmaskmovpd(dst, src, mask, vec_enc); 2220 break; 2221 default: 2222 fatal("Unsupported type %s", type2name(elem_bt)); 2223 break; 2224 } 2225 } 2226 2227 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask, 2228 int vec_enc) { 2229 switch(elem_bt) { 2230 case T_INT: 2231 case T_FLOAT: 2232 vmaskmovps(dst, src, mask, vec_enc); 2233 break; 2234 case T_LONG: 2235 case T_DOUBLE: 2236 vmaskmovpd(dst, src, mask, vec_enc); 2237 break; 2238 default: 2239 fatal("Unsupported type %s", type2name(elem_bt)); 2240 break; 2241 } 2242 } 2243 2244 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid, 2245 XMMRegister dst, XMMRegister src, 2246 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2247 XMMRegister xmm_0, XMMRegister xmm_1) { 2248 const int permconst[] = {1, 14}; 2249 XMMRegister wsrc = src; 2250 XMMRegister wdst = xmm_0; 2251 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1; 2252 2253 int vlen_enc = Assembler::AVX_128bit; 2254 if (vlen == 16) { 2255 vlen_enc = Assembler::AVX_256bit; 2256 } 2257 2258 for (int i = log2(vlen) - 1; i >=0; i--) { 2259 if (i == 0 && !is_dst_valid) { 2260 wdst = dst; 2261 } 2262 if (i == 3) { 2263 vextracti64x4_high(wtmp, wsrc); 2264 } else if (i == 2) { 2265 vextracti128_high(wtmp, wsrc); 2266 } else { // i = [0,1] 2267 vpermilps(wtmp, wsrc, permconst[i], vlen_enc); 2268 } 2269 vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2270 wsrc = wdst; 2271 vlen_enc = Assembler::AVX_128bit; 2272 } 2273 if (is_dst_valid) { 2274 vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2275 } 2276 } 2277 2278 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src, 2279 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2280 XMMRegister xmm_0, XMMRegister xmm_1) { 2281 XMMRegister wsrc = src; 2282 XMMRegister wdst = xmm_0; 2283 XMMRegister wtmp = (xmm_1 == xnoreg) ? 
xmm_0: xmm_1; 2284 int vlen_enc = Assembler::AVX_128bit; 2285 if (vlen == 8) { 2286 vlen_enc = Assembler::AVX_256bit; 2287 } 2288 for (int i = log2(vlen) - 1; i >=0; i--) { 2289 if (i == 0 && !is_dst_valid) { 2290 wdst = dst; 2291 } 2292 if (i == 1) { 2293 vextracti128_high(wtmp, wsrc); 2294 } else if (i == 2) { 2295 vextracti64x4_high(wtmp, wsrc); 2296 } else { 2297 assert(i == 0, "%d", i); 2298 vpermilpd(wtmp, wsrc, 1, vlen_enc); 2299 } 2300 vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2301 wsrc = wdst; 2302 vlen_enc = Assembler::AVX_128bit; 2303 } 2304 if (is_dst_valid) { 2305 vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2306 } 2307 } 2308 2309 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) { 2310 switch (bt) { 2311 case T_BYTE: pextrb(dst, src, idx); break; 2312 case T_SHORT: pextrw(dst, src, idx); break; 2313 case T_INT: pextrd(dst, src, idx); break; 2314 case T_LONG: pextrq(dst, src, idx); break; 2315 2316 default: 2317 assert(false,"Should not reach here."); 2318 break; 2319 } 2320 } 2321 2322 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) { 2323 int esize = type2aelembytes(typ); 2324 int elem_per_lane = 16/esize; 2325 int lane = elemindex / elem_per_lane; 2326 int eindex = elemindex % elem_per_lane; 2327 2328 if (lane >= 2) { 2329 assert(UseAVX > 2, "required"); 2330 vextractf32x4(dst, src, lane & 3); 2331 return dst; 2332 } else if (lane > 0) { 2333 assert(UseAVX > 0, "required"); 2334 vextractf128(dst, src, lane); 2335 return dst; 2336 } else { 2337 return src; 2338 } 2339 } 2340 2341 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) { 2342 if (typ == T_BYTE) { 2343 movsbl(dst, dst); 2344 } else if (typ == T_SHORT) { 2345 movswl(dst, dst); 2346 } 2347 } 2348 2349 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) { 2350 int esize = type2aelembytes(typ); 2351 int elem_per_lane = 16/esize; 2352 int eindex = elemindex % elem_per_lane; 2353 assert(is_integral_type(typ),"required"); 2354 2355 if (eindex == 0) { 2356 if (typ == T_LONG) { 2357 movq(dst, src); 2358 } else { 2359 movdl(dst, src); 2360 movsxl(typ, dst); 2361 } 2362 } else { 2363 extract(typ, dst, src, eindex); 2364 movsxl(typ, dst); 2365 } 2366 } 2367 2368 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) { 2369 int esize = type2aelembytes(typ); 2370 int elem_per_lane = 16/esize; 2371 int eindex = elemindex % elem_per_lane; 2372 assert((typ == T_FLOAT || typ == T_DOUBLE),"required"); 2373 2374 if (eindex == 0) { 2375 movq(dst, src); 2376 } else { 2377 if (typ == T_FLOAT) { 2378 if (UseAVX == 0) { 2379 movdqu(dst, src); 2380 shufps(dst, dst, eindex); 2381 } else { 2382 vshufps(dst, src, src, eindex, Assembler::AVX_128bit); 2383 } 2384 } else { 2385 if (UseAVX == 0) { 2386 movdqu(dst, src); 2387 psrldq(dst, eindex*esize); 2388 } else { 2389 vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit); 2390 } 2391 movq(dst, dst); 2392 } 2393 } 2394 // Zero upper bits 2395 if (typ == T_FLOAT) { 2396 if (UseAVX == 0) { 2397 assert(vtmp != xnoreg, "required."); 2398 movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg); 2399 pand(dst, vtmp); 2400 } else { 2401 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg); 2402 } 2403 } 2404 } 2405 2406 void 
C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) { 2407 switch(typ) { 2408 case T_BYTE: 2409 case T_BOOLEAN: 2410 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2411 break; 2412 case T_SHORT: 2413 case T_CHAR: 2414 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2415 break; 2416 case T_INT: 2417 case T_FLOAT: 2418 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2419 break; 2420 case T_LONG: 2421 case T_DOUBLE: 2422 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2423 break; 2424 default: 2425 assert(false,"Should not reach here."); 2426 break; 2427 } 2428 } 2429 2430 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) { 2431 assert(rscratch != noreg || always_reachable(src2), "missing"); 2432 2433 switch(typ) { 2434 case T_BOOLEAN: 2435 case T_BYTE: 2436 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2437 break; 2438 case T_CHAR: 2439 case T_SHORT: 2440 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2441 break; 2442 case T_INT: 2443 case T_FLOAT: 2444 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2445 break; 2446 case T_LONG: 2447 case T_DOUBLE: 2448 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2449 break; 2450 default: 2451 assert(false,"Should not reach here."); 2452 break; 2453 } 2454 } 2455 2456 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) { 2457 switch(typ) { 2458 case T_BYTE: 2459 evpblendmb(dst, kmask, src1, src2, merge, vector_len); 2460 break; 2461 case T_SHORT: 2462 evpblendmw(dst, kmask, src1, src2, merge, vector_len); 2463 break; 2464 case T_INT: 2465 case T_FLOAT: 2466 evpblendmd(dst, kmask, src1, src2, merge, vector_len); 2467 break; 2468 case T_LONG: 2469 case T_DOUBLE: 2470 evpblendmq(dst, kmask, src1, src2, merge, vector_len); 2471 break; 2472 default: 2473 assert(false,"Should not reach here."); 2474 break; 2475 } 2476 } 2477 2478 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) { 2479 assert(vlen_in_bytes <= 32, ""); 2480 int esize = type2aelembytes(bt); 2481 if (vlen_in_bytes == 32) { 2482 assert(vtmp == xnoreg, "required."); 2483 if (esize >= 4) { 2484 vtestps(src1, src2, AVX_256bit); 2485 } else { 2486 vptest(src1, src2, AVX_256bit); 2487 } 2488 return; 2489 } 2490 if (vlen_in_bytes < 16) { 2491 // Duplicate the lower part to fill the whole register, 2492 // Don't need to do so for src2 2493 assert(vtmp != xnoreg, "required"); 2494 int shuffle_imm = (vlen_in_bytes == 4) ? 
0x00 : 0x04; 2495 pshufd(vtmp, src1, shuffle_imm); 2496 } else { 2497 assert(vtmp == xnoreg, "required"); 2498 vtmp = src1; 2499 } 2500 if (esize >= 4 && VM_Version::supports_avx()) { 2501 vtestps(vtmp, src2, AVX_128bit); 2502 } else { 2503 ptest(vtmp, src2); 2504 } 2505 } 2506 2507 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { 2508 assert(UseAVX >= 2, "required"); 2509 #ifdef ASSERT 2510 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2511 bool is_bw_supported = VM_Version::supports_avx512bw(); 2512 if (is_bw && !is_bw_supported) { 2513 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2514 assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16), 2515 "XMM register should be 0-15"); 2516 } 2517 #endif // ASSERT 2518 switch (elem_bt) { 2519 case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return; 2520 case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return; 2521 case T_INT: vpaddd(dst, src1, src2, vlen_enc); return; 2522 case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return; 2523 case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return; 2524 case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return; 2525 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2526 } 2527 } 2528 2529 #ifdef _LP64 2530 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) { 2531 assert(UseAVX >= 2, "required"); 2532 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2533 bool is_vl = vlen_enc != Assembler::AVX_512bit; 2534 if ((UseAVX > 2) && 2535 (!is_bw || VM_Version::supports_avx512bw()) && 2536 (!is_vl || VM_Version::supports_avx512vl())) { 2537 switch (elem_bt) { 2538 case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return; 2539 case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return; 2540 case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return; 2541 case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return; 2542 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2543 } 2544 } else { 2545 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2546 assert((dst->encoding() < 16),"XMM register should be 0-15"); 2547 switch (elem_bt) { 2548 case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return; 2549 case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return; 2550 case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return; 2551 case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return; 2552 case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return; 2553 case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return; 2554 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2555 } 2556 } 2557 } 2558 #endif 2559 2560 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) { 2561 switch (to_elem_bt) { 2562 case T_SHORT: 2563 vpmovsxbw(dst, src, vlen_enc); 2564 break; 2565 case T_INT: 2566 vpmovsxbd(dst, src, vlen_enc); 2567 break; 2568 case T_FLOAT: 2569 vpmovsxbd(dst, src, vlen_enc); 2570 vcvtdq2ps(dst, dst, vlen_enc); 2571 break; 2572 case T_LONG: 2573 vpmovsxbq(dst, src, vlen_enc); 2574 break; 2575 case T_DOUBLE: { 2576 int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? 
Assembler::AVX_256bit : Assembler::AVX_128bit; 2577 vpmovsxbd(dst, src, mid_vlen_enc); 2578 vcvtdq2pd(dst, dst, vlen_enc); 2579 break; 2580 } 2581 default: 2582 fatal("Unsupported type %s", type2name(to_elem_bt)); 2583 break; 2584 } 2585 } 2586 2587 //------------------------------------------------------------------------------------------- 2588 2589 // IndexOf for constant substrings with size >= 8 chars 2590 // which don't need to be loaded through stack. 2591 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2, 2592 Register cnt1, Register cnt2, 2593 int int_cnt2, Register result, 2594 XMMRegister vec, Register tmp, 2595 int ae) { 2596 ShortBranchVerifier sbv(this); 2597 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2598 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2599 2600 // This method uses the pcmpestri instruction with bound registers 2601 // inputs: 2602 // xmm - substring 2603 // rax - substring length (elements count) 2604 // mem - scanned string 2605 // rdx - string length (elements count) 2606 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2607 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2608 // outputs: 2609 // rcx - matched index in string 2610 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2611 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2612 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2613 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2614 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1; 2615 2616 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, 2617 RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR, 2618 MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE; 2619 2620 // Note, inline_string_indexOf() generates checks: 2621 // if (substr.count > string.count) return -1; 2622 // if (substr.count == 0) return 0; 2623 assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars"); 2624 2625 // Load substring. 2626 if (ae == StrIntrinsicNode::UL) { 2627 pmovzxbw(vec, Address(str2, 0)); 2628 } else { 2629 movdqu(vec, Address(str2, 0)); 2630 } 2631 movl(cnt2, int_cnt2); 2632 movptr(result, str1); // string addr 2633 2634 if (int_cnt2 > stride) { 2635 jmpb(SCAN_TO_SUBSTR); 2636 2637 // Reload substr for rescan, this code 2638 // is executed only for large substrings (> 8 chars) 2639 bind(RELOAD_SUBSTR); 2640 if (ae == StrIntrinsicNode::UL) { 2641 pmovzxbw(vec, Address(str2, 0)); 2642 } else { 2643 movdqu(vec, Address(str2, 0)); 2644 } 2645 negptr(cnt2); // Jumped here with negative cnt2, convert to positive 2646 2647 bind(RELOAD_STR); 2648 // We came here after the beginning of the substring was 2649 // matched but the rest of it was not so we need to search 2650 // again. Start from the next element after the previous match. 2651 2652 // cnt2 is number of substring reminding elements and 2653 // cnt1 is number of string reminding elements when cmp failed. 
2654 // Restored cnt1 = cnt1 - cnt2 + int_cnt2 2655 subl(cnt1, cnt2); 2656 addl(cnt1, int_cnt2); 2657 movl(cnt2, int_cnt2); // Now restore cnt2 2658 2659 decrementl(cnt1); // Shift to next element 2660 cmpl(cnt1, cnt2); 2661 jcc(Assembler::negative, RET_NOT_FOUND); // Left less then substring 2662 2663 addptr(result, (1<<scale1)); 2664 2665 } // (int_cnt2 > 8) 2666 2667 // Scan string for start of substr in 16-byte vectors 2668 bind(SCAN_TO_SUBSTR); 2669 pcmpestri(vec, Address(result, 0), mode); 2670 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 2671 subl(cnt1, stride); 2672 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 2673 cmpl(cnt1, cnt2); 2674 jccb(Assembler::negative, RET_NOT_FOUND); // Left less then substring 2675 addptr(result, 16); 2676 jmpb(SCAN_TO_SUBSTR); 2677 2678 // Found a potential substr 2679 bind(FOUND_CANDIDATE); 2680 // Matched whole vector if first element matched (tmp(rcx) == 0). 2681 if (int_cnt2 == stride) { 2682 jccb(Assembler::overflow, RET_FOUND); // OF == 1 2683 } else { // int_cnt2 > 8 2684 jccb(Assembler::overflow, FOUND_SUBSTR); 2685 } 2686 // After pcmpestri tmp(rcx) contains matched element index 2687 // Compute start addr of substr 2688 lea(result, Address(result, tmp, scale1)); 2689 2690 // Make sure string is still long enough 2691 subl(cnt1, tmp); 2692 cmpl(cnt1, cnt2); 2693 if (int_cnt2 == stride) { 2694 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 2695 } else { // int_cnt2 > 8 2696 jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD); 2697 } 2698 // Left less then substring. 2699 2700 bind(RET_NOT_FOUND); 2701 movl(result, -1); 2702 jmp(EXIT); 2703 2704 if (int_cnt2 > stride) { 2705 // This code is optimized for the case when whole substring 2706 // is matched if its head is matched. 2707 bind(MATCH_SUBSTR_HEAD); 2708 pcmpestri(vec, Address(result, 0), mode); 2709 // Reload only string if does not match 2710 jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0 2711 2712 Label CONT_SCAN_SUBSTR; 2713 // Compare the rest of substring (> 8 chars). 2714 bind(FOUND_SUBSTR); 2715 // First 8 chars are already matched. 
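// The remaining tail of the substring is compared using a negated element
// count that is stepped toward zero, addressing relative to the substring
// end. Ignoring the end-of-string back-up below, the loop is roughly:
//   cnt2 = -int_cnt2 + stride;                       // elements left, negated
//   do { compare 'stride' elements at substring_end + cnt2;
//        cnt2 += stride; } while (cnt2 < 0);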
2716 negptr(cnt2); 2717 addptr(cnt2, stride); 2718 2719 bind(SCAN_SUBSTR); 2720 subl(cnt1, stride); 2721 cmpl(cnt2, -stride); // Do not read beyond substring 2722 jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR); 2723 // Back-up strings to avoid reading beyond substring: 2724 // cnt1 = cnt1 - cnt2 + 8 2725 addl(cnt1, cnt2); // cnt2 is negative 2726 addl(cnt1, stride); 2727 movl(cnt2, stride); negptr(cnt2); 2728 bind(CONT_SCAN_SUBSTR); 2729 if (int_cnt2 < (int)G) { 2730 int tail_off1 = int_cnt2<<scale1; 2731 int tail_off2 = int_cnt2<<scale2; 2732 if (ae == StrIntrinsicNode::UL) { 2733 pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2)); 2734 } else { 2735 movdqu(vec, Address(str2, cnt2, scale2, tail_off2)); 2736 } 2737 pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode); 2738 } else { 2739 // calculate index in register to avoid integer overflow (int_cnt2*2) 2740 movl(tmp, int_cnt2); 2741 addptr(tmp, cnt2); 2742 if (ae == StrIntrinsicNode::UL) { 2743 pmovzxbw(vec, Address(str2, tmp, scale2, 0)); 2744 } else { 2745 movdqu(vec, Address(str2, tmp, scale2, 0)); 2746 } 2747 pcmpestri(vec, Address(result, tmp, scale1, 0), mode); 2748 } 2749 // Need to reload strings pointers if not matched whole vector 2750 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 2751 addptr(cnt2, stride); 2752 jcc(Assembler::negative, SCAN_SUBSTR); 2753 // Fall through if found full substring 2754 2755 } // (int_cnt2 > 8) 2756 2757 bind(RET_FOUND); 2758 // Found result if we matched full small substring. 2759 // Compute substr offset 2760 subptr(result, str1); 2761 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 2762 shrl(result, 1); // index 2763 } 2764 bind(EXIT); 2765 2766 } // string_indexofC8 2767 2768 // Small strings are loaded through stack if they cross page boundary. 2769 void C2_MacroAssembler::string_indexof(Register str1, Register str2, 2770 Register cnt1, Register cnt2, 2771 int int_cnt2, Register result, 2772 XMMRegister vec, Register tmp, 2773 int ae) { 2774 ShortBranchVerifier sbv(this); 2775 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2776 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2777 2778 // 2779 // int_cnt2 is length of small (< 8 chars) constant substring 2780 // or (-1) for non constant substring in which case its length 2781 // is in cnt2 register. 2782 // 2783 // Note, inline_string_indexOf() generates checks: 2784 // if (substr.count > string.count) return -1; 2785 // if (substr.count == 0) return 0; 2786 // 2787 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2788 assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0"); 2789 // This method uses the pcmpestri instruction with bound registers 2790 // inputs: 2791 // xmm - substring 2792 // rax - substring length (elements count) 2793 // mem - scanned string 2794 // rdx - string length (elements count) 2795 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2796 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2797 // outputs: 2798 // rcx - matched index in string 2799 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2800 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2801 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2802 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? 
Address::times_1 : scale1; 2803 2804 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR, 2805 RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR, 2806 FOUND_CANDIDATE; 2807 2808 { //======================================================== 2809 // We don't know where these strings are located 2810 // and we can't read beyond them. Load them through stack. 2811 Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR; 2812 2813 movptr(tmp, rsp); // save old SP 2814 2815 if (int_cnt2 > 0) { // small (< 8 chars) constant substring 2816 if (int_cnt2 == (1>>scale2)) { // One byte 2817 assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding"); 2818 load_unsigned_byte(result, Address(str2, 0)); 2819 movdl(vec, result); // move 32 bits 2820 } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes 2821 // Not enough header space in 32-bit VM: 12+3 = 15. 2822 movl(result, Address(str2, -1)); 2823 shrl(result, 8); 2824 movdl(vec, result); // move 32 bits 2825 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char 2826 load_unsigned_short(result, Address(str2, 0)); 2827 movdl(vec, result); // move 32 bits 2828 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars 2829 movdl(vec, Address(str2, 0)); // move 32 bits 2830 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars 2831 movq(vec, Address(str2, 0)); // move 64 bits 2832 } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7}) 2833 // Array header size is 12 bytes in 32-bit VM 2834 // + 6 bytes for 3 chars == 18 bytes, 2835 // enough space to load vec and shift. 2836 assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity"); 2837 if (ae == StrIntrinsicNode::UL) { 2838 int tail_off = int_cnt2-8; 2839 pmovzxbw(vec, Address(str2, tail_off)); 2840 psrldq(vec, -2*tail_off); 2841 } 2842 else { 2843 int tail_off = int_cnt2*(1<<scale2); 2844 movdqu(vec, Address(str2, tail_off-16)); 2845 psrldq(vec, 16-tail_off); 2846 } 2847 } 2848 } else { // not constant substring 2849 cmpl(cnt2, stride); 2850 jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough 2851 2852 // We can read beyond string if srt+16 does not cross page boundary 2853 // since heaps are aligned and mapped by pages. 2854 assert(os::vm_page_size() < (int)G, "default page should be small"); 2855 movl(result, str2); // We need only low 32 bits 2856 andl(result, ((int)os::vm_page_size()-1)); 2857 cmpl(result, ((int)os::vm_page_size()-16)); 2858 jccb(Assembler::belowEqual, CHECK_STR); 2859 2860 // Move small strings to stack to allow load 16 bytes into vec. 2861 subptr(rsp, 16); 2862 int stk_offset = wordSize-(1<<scale2); 2863 push(cnt2); 2864 2865 bind(COPY_SUBSTR); 2866 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) { 2867 load_unsigned_byte(result, Address(str2, cnt2, scale2, -1)); 2868 movb(Address(rsp, cnt2, scale2, stk_offset), result); 2869 } else if (ae == StrIntrinsicNode::UU) { 2870 load_unsigned_short(result, Address(str2, cnt2, scale2, -2)); 2871 movw(Address(rsp, cnt2, scale2, stk_offset), result); 2872 } 2873 decrement(cnt2); 2874 jccb(Assembler::notZero, COPY_SUBSTR); 2875 2876 pop(cnt2); 2877 movptr(str2, rsp); // New substring address 2878 } // non constant 2879 2880 bind(CHECK_STR); 2881 cmpl(cnt1, stride); 2882 jccb(Assembler::aboveEqual, BIG_STRINGS); 2883 2884 // Check cross page boundary. 
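// A 16-byte load starting at str1 stays inside str1's page iff
//   (str1 & (page_size - 1)) <= page_size - 16
// e.g. with 4K pages, page offsets 0..4080 are safe, while 4081..4095 would
// cross into the next (possibly unmapped) page, so such strings are copied
// to the stack below instead.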
2885 movl(result, str1); // We need only low 32 bits 2886 andl(result, ((int)os::vm_page_size()-1)); 2887 cmpl(result, ((int)os::vm_page_size()-16)); 2888 jccb(Assembler::belowEqual, BIG_STRINGS); 2889 2890 subptr(rsp, 16); 2891 int stk_offset = -(1<<scale1); 2892 if (int_cnt2 < 0) { // not constant 2893 push(cnt2); 2894 stk_offset += wordSize; 2895 } 2896 movl(cnt2, cnt1); 2897 2898 bind(COPY_STR); 2899 if (ae == StrIntrinsicNode::LL) { 2900 load_unsigned_byte(result, Address(str1, cnt2, scale1, -1)); 2901 movb(Address(rsp, cnt2, scale1, stk_offset), result); 2902 } else { 2903 load_unsigned_short(result, Address(str1, cnt2, scale1, -2)); 2904 movw(Address(rsp, cnt2, scale1, stk_offset), result); 2905 } 2906 decrement(cnt2); 2907 jccb(Assembler::notZero, COPY_STR); 2908 2909 if (int_cnt2 < 0) { // not constant 2910 pop(cnt2); 2911 } 2912 movptr(str1, rsp); // New string address 2913 2914 bind(BIG_STRINGS); 2915 // Load substring. 2916 if (int_cnt2 < 0) { // -1 2917 if (ae == StrIntrinsicNode::UL) { 2918 pmovzxbw(vec, Address(str2, 0)); 2919 } else { 2920 movdqu(vec, Address(str2, 0)); 2921 } 2922 push(cnt2); // substr count 2923 push(str2); // substr addr 2924 push(str1); // string addr 2925 } else { 2926 // Small (< 8 chars) constant substrings are loaded already. 2927 movl(cnt2, int_cnt2); 2928 } 2929 push(tmp); // original SP 2930 2931 } // Finished loading 2932 2933 //======================================================== 2934 // Start search 2935 // 2936 2937 movptr(result, str1); // string addr 2938 2939 if (int_cnt2 < 0) { // Only for non constant substring 2940 jmpb(SCAN_TO_SUBSTR); 2941 2942 // SP saved at sp+0 2943 // String saved at sp+1*wordSize 2944 // Substr saved at sp+2*wordSize 2945 // Substr count saved at sp+3*wordSize 2946 2947 // Reload substr for rescan, this code 2948 // is executed only for large substrings (> 8 chars) 2949 bind(RELOAD_SUBSTR); 2950 movptr(str2, Address(rsp, 2*wordSize)); 2951 movl(cnt2, Address(rsp, 3*wordSize)); 2952 if (ae == StrIntrinsicNode::UL) { 2953 pmovzxbw(vec, Address(str2, 0)); 2954 } else { 2955 movdqu(vec, Address(str2, 0)); 2956 } 2957 // We came here after the beginning of the substring was 2958 // matched but the rest of it was not so we need to search 2959 // again. Start from the next element after the previous match. 2960 subptr(str1, result); // Restore counter 2961 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 2962 shrl(str1, 1); 2963 } 2964 addl(cnt1, str1); 2965 decrementl(cnt1); // Shift to next element 2966 cmpl(cnt1, cnt2); 2967 jcc(Assembler::negative, RET_NOT_FOUND); // Left less then substring 2968 2969 addptr(result, (1<<scale1)); 2970 } // non constant 2971 2972 // Scan string for start of substr in 16-byte vectors 2973 bind(SCAN_TO_SUBSTR); 2974 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2975 pcmpestri(vec, Address(result, 0), mode); 2976 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 2977 subl(cnt1, stride); 2978 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 2979 cmpl(cnt1, cnt2); 2980 jccb(Assembler::negative, RET_NOT_FOUND); // Left less then substring 2981 addptr(result, 16); 2982 2983 bind(ADJUST_STR); 2984 cmpl(cnt1, stride); // Do not read beyond string 2985 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 2986 // Back-up string to avoid reading beyond string. 
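// Fewer than 'stride' elements are left in the string, so re-point 'result'
// at the last full 16-byte window and rescan it with a full count, roughly:
//   result += cnt1 * elem_size - 16;  cnt1 = stride;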
2987 lea(result, Address(result, cnt1, scale1, -16)); 2988 movl(cnt1, stride); 2989 jmpb(SCAN_TO_SUBSTR); 2990 2991 // Found a potential substr 2992 bind(FOUND_CANDIDATE); 2993 // After pcmpestri tmp(rcx) contains matched element index 2994 2995 // Make sure string is still long enough 2996 subl(cnt1, tmp); 2997 cmpl(cnt1, cnt2); 2998 jccb(Assembler::greaterEqual, FOUND_SUBSTR); 2999 // Left less then substring. 3000 3001 bind(RET_NOT_FOUND); 3002 movl(result, -1); 3003 jmp(CLEANUP); 3004 3005 bind(FOUND_SUBSTR); 3006 // Compute start addr of substr 3007 lea(result, Address(result, tmp, scale1)); 3008 if (int_cnt2 > 0) { // Constant substring 3009 // Repeat search for small substring (< 8 chars) 3010 // from new point without reloading substring. 3011 // Have to check that we don't read beyond string. 3012 cmpl(tmp, stride-int_cnt2); 3013 jccb(Assembler::greater, ADJUST_STR); 3014 // Fall through if matched whole substring. 3015 } else { // non constant 3016 assert(int_cnt2 == -1, "should be != 0"); 3017 3018 addl(tmp, cnt2); 3019 // Found result if we matched whole substring. 3020 cmpl(tmp, stride); 3021 jcc(Assembler::lessEqual, RET_FOUND); 3022 3023 // Repeat search for small substring (<= 8 chars) 3024 // from new point 'str1' without reloading substring. 3025 cmpl(cnt2, stride); 3026 // Have to check that we don't read beyond string. 3027 jccb(Assembler::lessEqual, ADJUST_STR); 3028 3029 Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG; 3030 // Compare the rest of substring (> 8 chars). 3031 movptr(str1, result); 3032 3033 cmpl(tmp, cnt2); 3034 // First 8 chars are already matched. 3035 jccb(Assembler::equal, CHECK_NEXT); 3036 3037 bind(SCAN_SUBSTR); 3038 pcmpestri(vec, Address(str1, 0), mode); 3039 // Need to reload strings pointers if not matched whole vector 3040 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 3041 3042 bind(CHECK_NEXT); 3043 subl(cnt2, stride); 3044 jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring 3045 addptr(str1, 16); 3046 if (ae == StrIntrinsicNode::UL) { 3047 addptr(str2, 8); 3048 } else { 3049 addptr(str2, 16); 3050 } 3051 subl(cnt1, stride); 3052 cmpl(cnt2, stride); // Do not read beyond substring 3053 jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR); 3054 // Back-up strings to avoid reading beyond substring. 
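// Fewer than 'stride' substring elements remain, so both pointers are moved
// back so that the final full-width compare ends exactly at the substring
// end. In element units this is roughly:
//   str1 += cnt2 - stride;  str2 += cnt2 - stride;
//   cnt1  = cnt1 - cnt2 + stride;  cnt2 = stride;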
3055 3056 if (ae == StrIntrinsicNode::UL) { 3057 lea(str2, Address(str2, cnt2, scale2, -8)); 3058 lea(str1, Address(str1, cnt2, scale1, -16)); 3059 } else { 3060 lea(str2, Address(str2, cnt2, scale2, -16)); 3061 lea(str1, Address(str1, cnt2, scale1, -16)); 3062 } 3063 subl(cnt1, cnt2); 3064 movl(cnt2, stride); 3065 addl(cnt1, stride); 3066 bind(CONT_SCAN_SUBSTR); 3067 if (ae == StrIntrinsicNode::UL) { 3068 pmovzxbw(vec, Address(str2, 0)); 3069 } else { 3070 movdqu(vec, Address(str2, 0)); 3071 } 3072 jmp(SCAN_SUBSTR); 3073 3074 bind(RET_FOUND_LONG); 3075 movptr(str1, Address(rsp, wordSize)); 3076 } // non constant 3077 3078 bind(RET_FOUND); 3079 // Compute substr offset 3080 subptr(result, str1); 3081 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3082 shrl(result, 1); // index 3083 } 3084 bind(CLEANUP); 3085 pop(rsp); // restore SP 3086 3087 } // string_indexof 3088 3089 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3090 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3091 ShortBranchVerifier sbv(this); 3092 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3093 3094 int stride = 8; 3095 3096 Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP, 3097 SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP, 3098 RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT, 3099 FOUND_SEQ_CHAR, DONE_LABEL; 3100 3101 movptr(result, str1); 3102 if (UseAVX >= 2) { 3103 cmpl(cnt1, stride); 3104 jcc(Assembler::less, SCAN_TO_CHAR); 3105 cmpl(cnt1, 2*stride); 3106 jcc(Assembler::less, SCAN_TO_8_CHAR_INIT); 3107 movdl(vec1, ch); 3108 vpbroadcastw(vec1, vec1, Assembler::AVX_256bit); 3109 vpxor(vec2, vec2); 3110 movl(tmp, cnt1); 3111 andl(tmp, 0xFFFFFFF0); //vector count (in chars) 3112 andl(cnt1,0x0000000F); //tail count (in chars) 3113 3114 bind(SCAN_TO_16_CHAR_LOOP); 3115 vmovdqu(vec3, Address(result, 0)); 3116 vpcmpeqw(vec3, vec3, vec1, 1); 3117 vptest(vec2, vec3); 3118 jcc(Assembler::carryClear, FOUND_CHAR); 3119 addptr(result, 32); 3120 subl(tmp, 2*stride); 3121 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); 3122 jmp(SCAN_TO_8_CHAR); 3123 bind(SCAN_TO_8_CHAR_INIT); 3124 movdl(vec1, ch); 3125 pshuflw(vec1, vec1, 0x00); 3126 pshufd(vec1, vec1, 0); 3127 pxor(vec2, vec2); 3128 } 3129 bind(SCAN_TO_8_CHAR); 3130 cmpl(cnt1, stride); 3131 jcc(Assembler::less, SCAN_TO_CHAR); 3132 if (UseAVX < 2) { 3133 movdl(vec1, ch); 3134 pshuflw(vec1, vec1, 0x00); 3135 pshufd(vec1, vec1, 0); 3136 pxor(vec2, vec2); 3137 } 3138 movl(tmp, cnt1); 3139 andl(tmp, 0xFFFFFFF8); //vector count (in chars) 3140 andl(cnt1,0x00000007); //tail count (in chars) 3141 3142 bind(SCAN_TO_8_CHAR_LOOP); 3143 movdqu(vec3, Address(result, 0)); 3144 pcmpeqw(vec3, vec1); 3145 ptest(vec2, vec3); 3146 jcc(Assembler::carryClear, FOUND_CHAR); 3147 addptr(result, 16); 3148 subl(tmp, stride); 3149 jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP); 3150 bind(SCAN_TO_CHAR); 3151 testl(cnt1, cnt1); 3152 jcc(Assembler::zero, RET_NOT_FOUND); 3153 bind(SCAN_TO_CHAR_LOOP); 3154 load_unsigned_short(tmp, Address(result, 0)); 3155 cmpl(ch, tmp); 3156 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3157 addptr(result, 2); 3158 subl(cnt1, 1); 3159 jccb(Assembler::zero, RET_NOT_FOUND); 3160 jmp(SCAN_TO_CHAR_LOOP); 3161 3162 bind(RET_NOT_FOUND); 3163 movl(result, -1); 3164 jmpb(DONE_LABEL); 3165 3166 bind(FOUND_CHAR); 3167 if (UseAVX >= 2) { 3168 vpmovmskb(tmp, vec3); 3169 } else { 3170 pmovmskb(tmp, vec3); 3171 } 3172 bsfl(ch, tmp); 3173 addptr(result, ch); 3174 3175 bind(FOUND_SEQ_CHAR); 3176 
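// 'result' points at the matching char; the subtraction below converts it into a
// byte offset from the start of the string, and the shift turns bytes into a
// UTF-16 char index (e.g. a match 6 bytes in becomes index 3).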
subptr(result, str1); 3177 shrl(result, 1); 3178 3179 bind(DONE_LABEL); 3180 } // string_indexof_char 3181 3182 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3183 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3184 ShortBranchVerifier sbv(this); 3185 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3186 3187 int stride = 16; 3188 3189 Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP, 3190 SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP, 3191 RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT, 3192 FOUND_SEQ_CHAR, DONE_LABEL; 3193 3194 movptr(result, str1); 3195 if (UseAVX >= 2) { 3196 cmpl(cnt1, stride); 3197 jcc(Assembler::less, SCAN_TO_CHAR_INIT); 3198 cmpl(cnt1, stride*2); 3199 jcc(Assembler::less, SCAN_TO_16_CHAR_INIT); 3200 movdl(vec1, ch); 3201 vpbroadcastb(vec1, vec1, Assembler::AVX_256bit); 3202 vpxor(vec2, vec2); 3203 movl(tmp, cnt1); 3204 andl(tmp, 0xFFFFFFE0); //vector count (in chars) 3205 andl(cnt1,0x0000001F); //tail count (in chars) 3206 3207 bind(SCAN_TO_32_CHAR_LOOP); 3208 vmovdqu(vec3, Address(result, 0)); 3209 vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit); 3210 vptest(vec2, vec3); 3211 jcc(Assembler::carryClear, FOUND_CHAR); 3212 addptr(result, 32); 3213 subl(tmp, stride*2); 3214 jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP); 3215 jmp(SCAN_TO_16_CHAR); 3216 3217 bind(SCAN_TO_16_CHAR_INIT); 3218 movdl(vec1, ch); 3219 pxor(vec2, vec2); 3220 pshufb(vec1, vec2); 3221 } 3222 3223 bind(SCAN_TO_16_CHAR); 3224 cmpl(cnt1, stride); 3225 jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left 3226 if (UseAVX < 2) { 3227 movdl(vec1, ch); 3228 pxor(vec2, vec2); 3229 pshufb(vec1, vec2); 3230 } 3231 movl(tmp, cnt1); 3232 andl(tmp, 0xFFFFFFF0); //vector count (in bytes) 3233 andl(cnt1,0x0000000F); //tail count (in bytes) 3234 3235 bind(SCAN_TO_16_CHAR_LOOP); 3236 movdqu(vec3, Address(result, 0)); 3237 pcmpeqb(vec3, vec1); 3238 ptest(vec2, vec3); 3239 jcc(Assembler::carryClear, FOUND_CHAR); 3240 addptr(result, 16); 3241 subl(tmp, stride); 3242 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items... 
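// The remaining (< 16) bytes are handled by the scalar loop below; a rough sketch
// of its semantics:
//   for (int i = 0; i < cnt1; i++) {
//     if (bytes[i] == ch) return index_of_that_byte;
//   }
//   return -1;
// ('bytes' and 'index_of_that_byte' are only names for this sketch; the emitted
//  code keeps the current address in 'result' and the remaining count in 'cnt1',
//  and converts back to an index at FOUND_SEQ_CHAR.)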
3243 3244 bind(SCAN_TO_CHAR_INIT); 3245 testl(cnt1, cnt1); 3246 jcc(Assembler::zero, RET_NOT_FOUND); 3247 bind(SCAN_TO_CHAR_LOOP); 3248 load_unsigned_byte(tmp, Address(result, 0)); 3249 cmpl(ch, tmp); 3250 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3251 addptr(result, 1); 3252 subl(cnt1, 1); 3253 jccb(Assembler::zero, RET_NOT_FOUND); 3254 jmp(SCAN_TO_CHAR_LOOP); 3255 3256 bind(RET_NOT_FOUND); 3257 movl(result, -1); 3258 jmpb(DONE_LABEL); 3259 3260 bind(FOUND_CHAR); 3261 if (UseAVX >= 2) { 3262 vpmovmskb(tmp, vec3); 3263 } else { 3264 pmovmskb(tmp, vec3); 3265 } 3266 bsfl(ch, tmp); 3267 addptr(result, ch); 3268 3269 bind(FOUND_SEQ_CHAR); 3270 subptr(result, str1); 3271 3272 bind(DONE_LABEL); 3273 } // stringL_indexof_char 3274 3275 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) { 3276 switch (eltype) { 3277 case T_BOOLEAN: return sizeof(jboolean); 3278 case T_BYTE: return sizeof(jbyte); 3279 case T_SHORT: return sizeof(jshort); 3280 case T_CHAR: return sizeof(jchar); 3281 case T_INT: return sizeof(jint); 3282 default: 3283 ShouldNotReachHere(); 3284 return -1; 3285 } 3286 } 3287 3288 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) { 3289 switch (eltype) { 3290 // T_BOOLEAN used as surrogate for unsigned byte 3291 case T_BOOLEAN: movzbl(dst, src); break; 3292 case T_BYTE: movsbl(dst, src); break; 3293 case T_SHORT: movswl(dst, src); break; 3294 case T_CHAR: movzwl(dst, src); break; 3295 case T_INT: movl(dst, src); break; 3296 default: 3297 ShouldNotReachHere(); 3298 } 3299 } 3300 3301 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) { 3302 load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8); 3303 } 3304 3305 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) { 3306 load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8); 3307 } 3308 3309 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) { 3310 const int vlen = Assembler::AVX_256bit; 3311 switch (eltype) { 3312 case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3313 case T_BYTE: vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3314 case T_SHORT: vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3315 case T_CHAR: vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3316 case T_INT: 3317 // do nothing 3318 break; 3319 default: 3320 ShouldNotReachHere(); 3321 } 3322 } 3323 3324 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result, 3325 Register index, Register tmp2, Register tmp3, XMMRegister vnext, 3326 XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3, 3327 XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3, 3328 XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3, 3329 BasicType eltype) { 3330 ShortBranchVerifier sbv(this); 3331 assert(UseAVX >= 2, "AVX2 intrinsics are required"); 3332 assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3); 3333 assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3); 3334 3335 Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN, 3336 SHORT_UNROLLED_LOOP_EXIT, 3337 UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME, 3338 UNROLLED_VECTOR_LOOP_BEGIN, 3339 END; 3340 switch (eltype) { 3341 case T_BOOLEAN: 
BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break; 3342 case T_CHAR: BLOCK_COMMENT("arrays_hashcode(char) {"); break; 3343 case T_BYTE: BLOCK_COMMENT("arrays_hashcode(byte) {"); break; 3344 case T_SHORT: BLOCK_COMMENT("arrays_hashcode(short) {"); break; 3345 case T_INT: BLOCK_COMMENT("arrays_hashcode(int) {"); break; 3346 default: BLOCK_COMMENT("arrays_hashcode {"); break; 3347 } 3348 3349 // For "renaming" for readibility of the code 3350 const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 }, 3351 vresult[] = { vresult0, vresult1, vresult2, vresult3 }, 3352 vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 }; 3353 3354 const int elsize = arrays_hashcode_elsize(eltype); 3355 3356 /* 3357 if (cnt1 >= 2) { 3358 if (cnt1 >= 32) { 3359 UNROLLED VECTOR LOOP 3360 } 3361 UNROLLED SCALAR LOOP 3362 } 3363 SINGLE SCALAR 3364 */ 3365 3366 cmpl(cnt1, 32); 3367 jcc(Assembler::less, SHORT_UNROLLED_BEGIN); 3368 3369 // cnt1 >= 32 && generate_vectorized_loop 3370 xorl(index, index); 3371 3372 // vresult = IntVector.zero(I256); 3373 for (int idx = 0; idx < 4; idx++) { 3374 vpxor(vresult[idx], vresult[idx]); 3375 } 3376 // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]); 3377 Register bound = tmp2; 3378 Register next = tmp3; 3379 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint)))); 3380 movl(next, Address(tmp2, 0)); 3381 movdl(vnext, next); 3382 vpbroadcastd(vnext, vnext, Assembler::AVX_256bit); 3383 3384 // index = 0; 3385 // bound = cnt1 & ~(32 - 1); 3386 movl(bound, cnt1); 3387 andl(bound, ~(32 - 1)); 3388 // for (; index < bound; index += 32) { 3389 bind(UNROLLED_VECTOR_LOOP_BEGIN); 3390 // result *= next; 3391 imull(result, next); 3392 // loop fission to upfront the cost of fetching from memory, OOO execution 3393 // can then hopefully do a better job of prefetching 3394 for (int idx = 0; idx < 4; idx++) { 3395 arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype); 3396 } 3397 // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7]; 3398 for (int idx = 0; idx < 4; idx++) { 3399 vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit); 3400 arrays_hashcode_elvcast(vtmp[idx], eltype); 3401 vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit); 3402 } 3403 // index += 32; 3404 addl(index, 32); 3405 // index < bound; 3406 cmpl(index, bound); 3407 jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN); 3408 // } 3409 3410 lea(ary1, Address(ary1, bound, Address::times(elsize))); 3411 subl(cnt1, bound); 3412 // release bound 3413 3414 // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1); 3415 for (int idx = 0; idx < 4; idx++) { 3416 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint)))); 3417 arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT); 3418 vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit); 3419 } 3420 // result += vresult.reduceLanes(ADD); 3421 for (int idx = 0; idx < 4; idx++) { 3422 reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]); 3423 } 3424 3425 // } else if (cnt1 < 32) { 3426 3427 bind(SHORT_UNROLLED_BEGIN); 3428 // int i = 1; 3429 movl(index, 1); 3430 cmpl(index, cnt1); 3431 jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT); 3432 3433 // for (; i < cnt1 ; i += 2) { 3434 bind(SHORT_UNROLLED_LOOP_BEGIN); 3435 movl(tmp3, 961); 3436 imull(result, tmp3); 3437 
arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3438 movl(tmp3, tmp2); 3439 shll(tmp3, 5); 3440 subl(tmp3, tmp2); 3441 addl(result, tmp3); 3442 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype); 3443 addl(result, tmp3); 3444 addl(index, 2); 3445 cmpl(index, cnt1); 3446 jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN); 3447 3448 // } 3449 // if (i >= cnt1) { 3450 bind(SHORT_UNROLLED_LOOP_EXIT); 3451 jccb(Assembler::greater, END); 3452 movl(tmp2, result); 3453 shll(result, 5); 3454 subl(result, tmp2); 3455 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3456 addl(result, tmp3); 3457 // } 3458 bind(END); 3459 3460 BLOCK_COMMENT("} // arrays_hashcode"); 3461 3462 } // arrays_hashcode 3463 3464 // helper function for string_compare 3465 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2, 3466 Address::ScaleFactor scale, Address::ScaleFactor scale1, 3467 Address::ScaleFactor scale2, Register index, int ae) { 3468 if (ae == StrIntrinsicNode::LL) { 3469 load_unsigned_byte(elem1, Address(str1, index, scale, 0)); 3470 load_unsigned_byte(elem2, Address(str2, index, scale, 0)); 3471 } else if (ae == StrIntrinsicNode::UU) { 3472 load_unsigned_short(elem1, Address(str1, index, scale, 0)); 3473 load_unsigned_short(elem2, Address(str2, index, scale, 0)); 3474 } else { 3475 load_unsigned_byte(elem1, Address(str1, index, scale1, 0)); 3476 load_unsigned_short(elem2, Address(str2, index, scale2, 0)); 3477 } 3478 } 3479 3480 // Compare strings, used for char[] and byte[]. 3481 void C2_MacroAssembler::string_compare(Register str1, Register str2, 3482 Register cnt1, Register cnt2, Register result, 3483 XMMRegister vec1, int ae, KRegister mask) { 3484 ShortBranchVerifier sbv(this); 3485 Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL; 3486 Label COMPARE_WIDE_VECTORS_LOOP_FAILED; // used only _LP64 && AVX3 3487 int stride, stride2, adr_stride, adr_stride1, adr_stride2; 3488 int stride2x2 = 0x40; 3489 Address::ScaleFactor scale = Address::no_scale; 3490 Address::ScaleFactor scale1 = Address::no_scale; 3491 Address::ScaleFactor scale2 = Address::no_scale; 3492 3493 if (ae != StrIntrinsicNode::LL) { 3494 stride2x2 = 0x20; 3495 } 3496 3497 if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) { 3498 shrl(cnt2, 1); 3499 } 3500 // Compute the minimum of the string lengths and the 3501 // difference of the string lengths (stack). 3502 // Do the conditional move stuff 3503 movl(result, cnt1); 3504 subl(cnt1, cnt2); 3505 push(cnt1); 3506 cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2) 3507 3508 // Is the minimum length zero? 
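// (If it is, the routine simply returns the length difference pushed above,
//  i.e. roughly "return cnt1 - cnt2" at the Java level.)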
3509 testl(cnt2, cnt2); 3510 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3511 if (ae == StrIntrinsicNode::LL) { 3512 // Load first bytes 3513 load_unsigned_byte(result, Address(str1, 0)); // result = str1[0] 3514 load_unsigned_byte(cnt1, Address(str2, 0)); // cnt1 = str2[0] 3515 } else if (ae == StrIntrinsicNode::UU) { 3516 // Load first characters 3517 load_unsigned_short(result, Address(str1, 0)); 3518 load_unsigned_short(cnt1, Address(str2, 0)); 3519 } else { 3520 load_unsigned_byte(result, Address(str1, 0)); 3521 load_unsigned_short(cnt1, Address(str2, 0)); 3522 } 3523 subl(result, cnt1); 3524 jcc(Assembler::notZero, POP_LABEL); 3525 3526 if (ae == StrIntrinsicNode::UU) { 3527 // Divide length by 2 to get number of chars 3528 shrl(cnt2, 1); 3529 } 3530 cmpl(cnt2, 1); 3531 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3532 3533 // Check if the strings start at the same location and setup scale and stride 3534 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3535 cmpptr(str1, str2); 3536 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3537 if (ae == StrIntrinsicNode::LL) { 3538 scale = Address::times_1; 3539 stride = 16; 3540 } else { 3541 scale = Address::times_2; 3542 stride = 8; 3543 } 3544 } else { 3545 scale1 = Address::times_1; 3546 scale2 = Address::times_2; 3547 // scale not used 3548 stride = 8; 3549 } 3550 3551 if (UseAVX >= 2 && UseSSE42Intrinsics) { 3552 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR; 3553 Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR; 3554 Label COMPARE_WIDE_VECTORS_LOOP_AVX2; 3555 Label COMPARE_TAIL_LONG; 3556 Label COMPARE_WIDE_VECTORS_LOOP_AVX3; // used only _LP64 && AVX3 3557 3558 int pcmpmask = 0x19; 3559 if (ae == StrIntrinsicNode::LL) { 3560 pcmpmask &= ~0x01; 3561 } 3562 3563 // Setup to compare 16-chars (32-bytes) vectors, 3564 // start from first character again because it has aligned address. 3565 if (ae == StrIntrinsicNode::LL) { 3566 stride2 = 32; 3567 } else { 3568 stride2 = 16; 3569 } 3570 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3571 adr_stride = stride << scale; 3572 } else { 3573 adr_stride1 = 8; //stride << scale1; 3574 adr_stride2 = 16; //stride << scale2; 3575 } 3576 3577 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3578 // rax and rdx are used by pcmpestri as elements counters 3579 movl(result, cnt2); 3580 andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count 3581 jcc(Assembler::zero, COMPARE_TAIL_LONG); 3582 3583 // fast path : compare first 2 8-char vectors. 
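// (A mismatch in either probe lands in COMPARE_INDEX_CHAR with the index of the
//  mismatching element already in cnt1, courtesy of pcmpestri.)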
3584 bind(COMPARE_16_CHARS); 3585 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3586 movdqu(vec1, Address(str1, 0)); 3587 } else { 3588 pmovzxbw(vec1, Address(str1, 0)); 3589 } 3590 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3591 jccb(Assembler::below, COMPARE_INDEX_CHAR); 3592 3593 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3594 movdqu(vec1, Address(str1, adr_stride)); 3595 pcmpestri(vec1, Address(str2, adr_stride), pcmpmask); 3596 } else { 3597 pmovzxbw(vec1, Address(str1, adr_stride1)); 3598 pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask); 3599 } 3600 jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS); 3601 addl(cnt1, stride); 3602 3603 // Compare the characters at index in cnt1 3604 bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character 3605 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3606 subl(result, cnt2); 3607 jmp(POP_LABEL); 3608 3609 // Setup the registers to start vector comparison loop 3610 bind(COMPARE_WIDE_VECTORS); 3611 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3612 lea(str1, Address(str1, result, scale)); 3613 lea(str2, Address(str2, result, scale)); 3614 } else { 3615 lea(str1, Address(str1, result, scale1)); 3616 lea(str2, Address(str2, result, scale2)); 3617 } 3618 subl(result, stride2); 3619 subl(cnt2, stride2); 3620 jcc(Assembler::zero, COMPARE_WIDE_TAIL); 3621 negptr(result); 3622 3623 // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest) 3624 bind(COMPARE_WIDE_VECTORS_LOOP); 3625 3626 #ifdef _LP64 3627 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 3628 cmpl(cnt2, stride2x2); 3629 jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2); 3630 testl(cnt2, stride2x2-1); // cnt2 holds the vector count 3631 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // means we cannot subtract by 0x40 3632 3633 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 3634 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3635 evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit); 3636 evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3637 } else { 3638 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit); 3639 evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3640 } 3641 kortestql(mask, mask); 3642 jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare 3643 addptr(result, stride2x2); // update since we already compared at this addr 3644 subl(cnt2, stride2x2); // and sub the size too 3645 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3); 3646 3647 vpxor(vec1, vec1); 3648 jmpb(COMPARE_WIDE_TAIL); 3649 }//if (VM_Version::supports_avx512vlbw()) 3650 #endif // _LP64 3651 3652 3653 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3654 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3655 vmovdqu(vec1, Address(str1, result, scale)); 3656 vpxor(vec1, Address(str2, result, scale)); 3657 } else { 3658 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit); 3659 vpxor(vec1, Address(str2, result, scale2)); 3660 } 3661 vptest(vec1, vec1); 3662 jcc(Assembler::notZero, VECTOR_NOT_EQUAL); 3663 addptr(result, stride2); 3664 subl(cnt2, stride2); 3665 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP); 3666 // clean upper bits of YMM registers 
3667 vpxor(vec1, vec1);
3668
3669 // compare wide vectors tail
3670 bind(COMPARE_WIDE_TAIL);
3671 testptr(result, result);
3672 jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3673
3674 movl(result, stride2);
3675 movl(cnt2, result);
3676 negptr(result);
3677 jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3678
3679 // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
3680 bind(VECTOR_NOT_EQUAL);
3681 // clean upper bits of YMM registers
3682 vpxor(vec1, vec1);
3683 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3684 lea(str1, Address(str1, result, scale));
3685 lea(str2, Address(str2, result, scale));
3686 } else {
3687 lea(str1, Address(str1, result, scale1));
3688 lea(str2, Address(str2, result, scale2));
3689 }
3690 jmp(COMPARE_16_CHARS);
3691
3692 // Compare tail chars, length between 1 and 15 chars
3693 bind(COMPARE_TAIL_LONG);
3694 movl(cnt2, result);
3695 cmpl(cnt2, stride);
3696 jcc(Assembler::less, COMPARE_SMALL_STR);
3697
3698 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3699 movdqu(vec1, Address(str1, 0));
3700 } else {
3701 pmovzxbw(vec1, Address(str1, 0));
3702 }
3703 pcmpestri(vec1, Address(str2, 0), pcmpmask);
3704 jcc(Assembler::below, COMPARE_INDEX_CHAR);
3705 subptr(cnt2, stride);
3706 jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3707 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3708 lea(str1, Address(str1, result, scale));
3709 lea(str2, Address(str2, result, scale));
3710 } else {
3711 lea(str1, Address(str1, result, scale1));
3712 lea(str2, Address(str2, result, scale2));
3713 }
3714 negptr(cnt2);
3715 jmpb(WHILE_HEAD_LABEL);
3716
3717 bind(COMPARE_SMALL_STR);
3718 } else if (UseSSE42Intrinsics) {
3719 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3720 int pcmpmask = 0x19;
3721 // Setup to compare 8-char (16-byte) vectors,
3722 // start from first character again because it has aligned address.
3723 movl(result, cnt2); 3724 andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count 3725 if (ae == StrIntrinsicNode::LL) { 3726 pcmpmask &= ~0x01; 3727 } 3728 jcc(Assembler::zero, COMPARE_TAIL); 3729 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3730 lea(str1, Address(str1, result, scale)); 3731 lea(str2, Address(str2, result, scale)); 3732 } else { 3733 lea(str1, Address(str1, result, scale1)); 3734 lea(str2, Address(str2, result, scale2)); 3735 } 3736 negptr(result); 3737 3738 // pcmpestri 3739 // inputs: 3740 // vec1- substring 3741 // rax - negative string length (elements count) 3742 // mem - scanned string 3743 // rdx - string length (elements count) 3744 // pcmpmask - cmp mode: 11000 (string compare with negated result) 3745 // + 00 (unsigned bytes) or + 01 (unsigned shorts) 3746 // outputs: 3747 // rcx - first mismatched element index 3748 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3749 3750 bind(COMPARE_WIDE_VECTORS); 3751 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3752 movdqu(vec1, Address(str1, result, scale)); 3753 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 3754 } else { 3755 pmovzxbw(vec1, Address(str1, result, scale1)); 3756 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 3757 } 3758 // After pcmpestri cnt1(rcx) contains mismatched element index 3759 3760 jccb(Assembler::below, VECTOR_NOT_EQUAL); // CF==1 3761 addptr(result, stride); 3762 subptr(cnt2, stride); 3763 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 3764 3765 // compare wide vectors tail 3766 testptr(result, result); 3767 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3768 3769 movl(cnt2, stride); 3770 movl(result, stride); 3771 negptr(result); 3772 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3773 movdqu(vec1, Address(str1, result, scale)); 3774 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 3775 } else { 3776 pmovzxbw(vec1, Address(str1, result, scale1)); 3777 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 3778 } 3779 jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL); 3780 3781 // Mismatched characters in the vectors 3782 bind(VECTOR_NOT_EQUAL); 3783 addptr(cnt1, result); 3784 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3785 subl(result, cnt2); 3786 jmpb(POP_LABEL); 3787 3788 bind(COMPARE_TAIL); // limit is zero 3789 movl(cnt2, result); 3790 // Fallthru to tail compare 3791 } 3792 // Shift str2 and str1 to the end of the arrays, negate min 3793 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3794 lea(str1, Address(str1, cnt2, scale)); 3795 lea(str2, Address(str2, cnt2, scale)); 3796 } else { 3797 lea(str1, Address(str1, cnt2, scale1)); 3798 lea(str2, Address(str2, cnt2, scale2)); 3799 } 3800 decrementl(cnt2); // first character was compared already 3801 negptr(cnt2); 3802 3803 // Compare the rest of the elements 3804 bind(WHILE_HEAD_LABEL); 3805 load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae); 3806 subl(result, cnt1); 3807 jccb(Assembler::notZero, POP_LABEL); 3808 increment(cnt2); 3809 jccb(Assembler::notZero, WHILE_HEAD_LABEL); 3810 3811 // Strings are equal up to min length. Return the length difference. 
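// Java-level sketch of what this routine computes (illustration only):
//   int min = (len1 < len2) ? len1 : len2;
//   for (int i = 0; i < min; i++) {
//     int d = s1[i] - s2[i];
//     if (d != 0) return d;
//   }
//   return len1 - len2;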
3812 bind(LENGTH_DIFF_LABEL); 3813 pop(result); 3814 if (ae == StrIntrinsicNode::UU) { 3815 // Divide diff by 2 to get number of chars 3816 sarl(result, 1); 3817 } 3818 jmpb(DONE_LABEL); 3819 3820 #ifdef _LP64 3821 if (VM_Version::supports_avx512vlbw()) { 3822 3823 bind(COMPARE_WIDE_VECTORS_LOOP_FAILED); 3824 3825 kmovql(cnt1, mask); 3826 notq(cnt1); 3827 bsfq(cnt2, cnt1); 3828 if (ae != StrIntrinsicNode::LL) { 3829 // Divide diff by 2 to get number of chars 3830 sarl(cnt2, 1); 3831 } 3832 addq(result, cnt2); 3833 if (ae == StrIntrinsicNode::LL) { 3834 load_unsigned_byte(cnt1, Address(str2, result)); 3835 load_unsigned_byte(result, Address(str1, result)); 3836 } else if (ae == StrIntrinsicNode::UU) { 3837 load_unsigned_short(cnt1, Address(str2, result, scale)); 3838 load_unsigned_short(result, Address(str1, result, scale)); 3839 } else { 3840 load_unsigned_short(cnt1, Address(str2, result, scale2)); 3841 load_unsigned_byte(result, Address(str1, result, scale1)); 3842 } 3843 subl(result, cnt1); 3844 jmpb(POP_LABEL); 3845 }//if (VM_Version::supports_avx512vlbw()) 3846 #endif // _LP64 3847 3848 // Discard the stored length difference 3849 bind(POP_LABEL); 3850 pop(cnt1); 3851 3852 // That's it 3853 bind(DONE_LABEL); 3854 if(ae == StrIntrinsicNode::UL) { 3855 negl(result); 3856 } 3857 3858 } 3859 3860 // Search for Non-ASCII character (Negative byte value) in a byte array, 3861 // return the index of the first such character, otherwise the length 3862 // of the array segment searched. 3863 // ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java 3864 // @IntrinsicCandidate 3865 // public static int countPositives(byte[] ba, int off, int len) { 3866 // for (int i = off; i < off + len; i++) { 3867 // if (ba[i] < 0) { 3868 // return i - off; 3869 // } 3870 // } 3871 // return len; 3872 // } 3873 void C2_MacroAssembler::count_positives(Register ary1, Register len, 3874 Register result, Register tmp1, 3875 XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) { 3876 // rsi: byte array 3877 // rcx: len 3878 // rax: result 3879 ShortBranchVerifier sbv(this); 3880 assert_different_registers(ary1, len, result, tmp1); 3881 assert_different_registers(vec1, vec2); 3882 Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE; 3883 3884 movl(result, len); // copy 3885 // len == 0 3886 testl(len, len); 3887 jcc(Assembler::zero, DONE); 3888 3889 if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512 3890 VM_Version::supports_avx512vlbw() && 3891 VM_Version::supports_bmi2()) { 3892 3893 Label test_64_loop, test_tail, BREAK_LOOP; 3894 movl(tmp1, len); 3895 vpxor(vec2, vec2, vec2, Assembler::AVX_512bit); 3896 3897 andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F 3898 andl(len, 0xffffffc0); // vector count (in chars) 3899 jccb(Assembler::zero, test_tail); 3900 3901 lea(ary1, Address(ary1, len, Address::times_1)); 3902 negptr(len); 3903 3904 bind(test_64_loop); 3905 // Check whether our 64 elements of size byte contain negatives 3906 evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit); 3907 kortestql(mask1, mask1); 3908 jcc(Assembler::notZero, BREAK_LOOP); 3909 3910 addptr(len, 64); 3911 jccb(Assembler::notZero, test_64_loop); 3912 3913 bind(test_tail); 3914 // bail out when there is nothing to be done 3915 testl(tmp1, -1); 3916 jcc(Assembler::zero, DONE); 3917 3918 3919 // check the tail for absense of negatives 3920 // ~(~0 << len) applied up to two times (for 32-bit scenario) 3921 #ifdef _LP64 3922 { 3923 
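// Builds a k-mask with the low 'tmp1' bits set via ~(~0 << tmp1); e.g. a tail
// count of 5 yields 0b11111, so the masked compare below only looks at the 5
// remaining bytes.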
Register tmp3_aliased = len;
3924 mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
3925 shlxq(tmp3_aliased, tmp3_aliased, tmp1);
3926 notq(tmp3_aliased);
3927 kmovql(mask2, tmp3_aliased);
3928 }
3929 #else
3930 Label k_init;
3931 jmp(k_init);
3932
3933 // We cannot load a 64-bit immediate into a general purpose register here, so the
3934 // data required to compose the 64-bit mask is moved into the instruction stream:
3935 // a 64-byte wide series of the elements 0..63, which is later used as the
3936 // compare target against the tail count contained in the tmp1 register.
3937 // The result is a k register holding tmp1 consecutive 1 bits,
3938 // counting from the least significant bit.
3939 address tmp = pc();
3940 emit_int64(0x0706050403020100);
3941 emit_int64(0x0F0E0D0C0B0A0908);
3942 emit_int64(0x1716151413121110);
3943 emit_int64(0x1F1E1D1C1B1A1918);
3944 emit_int64(0x2726252423222120);
3945 emit_int64(0x2F2E2D2C2B2A2928);
3946 emit_int64(0x3736353433323130);
3947 emit_int64(0x3F3E3D3C3B3A3938);
3948
3949 bind(k_init);
3950 lea(len, InternalAddress(tmp));
3951 // create mask to test for negative byte inside a vector
3952 evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
3953 evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit);
3954
3955 #endif
3956 evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
3957 ktestq(mask1, mask2);
3958 jcc(Assembler::zero, DONE);
3959
3960 // do a full check for negative bytes in the tail
3961 movl(len, tmp1); // tmp1 holds the low 6 bits of the original len
3962 // ary1 already pointing to the right place
3963 jmpb(TAIL_START);
3964
3965 bind(BREAK_LOOP);
3966 // At least one byte in the last 64 byte block was negative.
3967 // Set up to look at the last 64 bytes as if they were a tail
3968 lea(ary1, Address(ary1, len, Address::times_1));
3969 addptr(result, len);
3970 // Ignore the very last byte: if all others are positive,
3971 // it must be negative, so we can skip right to the 2+1 byte
3972 // end comparison at this point
3973 orl(result, 63);
3974 movl(len, 63);
3975 // Fallthru to tail compare
3976 } else {
3977
3978 if (UseAVX >= 2 && UseSSE >= 2) {
3979 // With AVX2, use 32-byte vector compare
3980 Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
3981
3982 // Compare 32-byte vectors
3983 testl(len, 0xffffffe0); // vector count (in bytes)
3984 jccb(Assembler::zero, TAIL_START);
3985
3986 andl(len, 0xffffffe0);
3987 lea(ary1, Address(ary1, len, Address::times_1));
3988 negptr(len);
3989
3990 movl(tmp1, 0x80808080); // create mask to test for negative bytes in vector
3991 movdl(vec2, tmp1);
3992 vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
3993
3994 bind(COMPARE_WIDE_VECTORS);
3995 vmovdqu(vec1, Address(ary1, len, Address::times_1));
3996 vptest(vec1, vec2);
3997 jccb(Assembler::notZero, BREAK_LOOP);
3998 addptr(len, 32);
3999 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4000
4001 testl(result, 0x0000001f); // any bytes remaining?
4002 jcc(Assembler::zero, DONE);
4003
4004 // Quick test using the already prepared vector mask
4005 movl(len, result);
4006 andl(len, 0x0000001f);
4007 vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
4008 vptest(vec1, vec2);
4009 jcc(Assembler::zero, DONE);
4010 // There is at least one negative byte, jump to the tail to determine exactly where
4011 jmpb(TAIL_START);
4012
4013 bind(BREAK_LOOP);
4014 // At least one byte in the last 32-byte vector is negative.
4015 // Set up to look at the last 32 bytes as if they were a tail 4016 lea(ary1, Address(ary1, len, Address::times_1)); 4017 addptr(result, len); 4018 // Ignore the very last byte: if all others are positive, 4019 // it must be negative, so we can skip right to the 2+1 byte 4020 // end comparison at this point 4021 orl(result, 31); 4022 movl(len, 31); 4023 // Fallthru to tail compare 4024 } else if (UseSSE42Intrinsics) { 4025 // With SSE4.2, use double quad vector compare 4026 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 4027 4028 // Compare 16-byte vectors 4029 testl(len, 0xfffffff0); // vector count (in bytes) 4030 jcc(Assembler::zero, TAIL_START); 4031 4032 andl(len, 0xfffffff0); 4033 lea(ary1, Address(ary1, len, Address::times_1)); 4034 negptr(len); 4035 4036 movl(tmp1, 0x80808080); 4037 movdl(vec2, tmp1); 4038 pshufd(vec2, vec2, 0); 4039 4040 bind(COMPARE_WIDE_VECTORS); 4041 movdqu(vec1, Address(ary1, len, Address::times_1)); 4042 ptest(vec1, vec2); 4043 jccb(Assembler::notZero, BREAK_LOOP); 4044 addptr(len, 16); 4045 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4046 4047 testl(result, 0x0000000f); // len is zero, any bytes remaining? 4048 jcc(Assembler::zero, DONE); 4049 4050 // Quick test using the already prepared vector mask 4051 movl(len, result); 4052 andl(len, 0x0000000f); // tail count (in bytes) 4053 movdqu(vec1, Address(ary1, len, Address::times_1, -16)); 4054 ptest(vec1, vec2); 4055 jcc(Assembler::zero, DONE); 4056 jmpb(TAIL_START); 4057 4058 bind(BREAK_LOOP); 4059 // At least one byte in the last 16-byte vector is negative. 4060 // Set up and look at the last 16 bytes as if they were a tail 4061 lea(ary1, Address(ary1, len, Address::times_1)); 4062 addptr(result, len); 4063 // Ignore the very last byte: if all others are positive, 4064 // it must be negative, so we can skip right to the 2+1 byte 4065 // end comparison at this point 4066 orl(result, 15); 4067 movl(len, 15); 4068 // Fallthru to tail compare 4069 } 4070 } 4071 4072 bind(TAIL_START); 4073 // Compare 4-byte vectors 4074 andl(len, 0xfffffffc); // vector count (in bytes) 4075 jccb(Assembler::zero, COMPARE_CHAR); 4076 4077 lea(ary1, Address(ary1, len, Address::times_1)); 4078 negptr(len); 4079 4080 bind(COMPARE_VECTORS); 4081 movl(tmp1, Address(ary1, len, Address::times_1)); 4082 andl(tmp1, 0x80808080); 4083 jccb(Assembler::notZero, TAIL_ADJUST); 4084 addptr(len, 4); 4085 jccb(Assembler::notZero, COMPARE_VECTORS); 4086 4087 // Compare trailing char (final 2-3 bytes), if any 4088 bind(COMPARE_CHAR); 4089 4090 testl(result, 0x2); // tail char 4091 jccb(Assembler::zero, COMPARE_BYTE); 4092 load_unsigned_short(tmp1, Address(ary1, 0)); 4093 andl(tmp1, 0x00008080); 4094 jccb(Assembler::notZero, CHAR_ADJUST); 4095 lea(ary1, Address(ary1, 2)); 4096 4097 bind(COMPARE_BYTE); 4098 testl(result, 0x1); // tail byte 4099 jccb(Assembler::zero, DONE); 4100 load_unsigned_byte(tmp1, Address(ary1, 0)); 4101 testl(tmp1, 0x00000080); 4102 jccb(Assembler::zero, DONE); 4103 subptr(result, 1); 4104 jmpb(DONE); 4105 4106 bind(TAIL_ADJUST); 4107 // there are negative bits in the last 4 byte block. 4108 // Adjust result and check the next three bytes 4109 addptr(result, len); 4110 orl(result, 3); 4111 lea(ary1, Address(ary1, len, Address::times_1)); 4112 jmpb(COMPARE_CHAR); 4113 4114 bind(CHAR_ADJUST); 4115 // We are looking at a char + optional byte tail, and found that one 4116 // of the bytes in the char is negative. Adjust the result, check the 4117 // first byte and readjust if needed. 
4118 andl(result, 0xfffffffc); 4119 testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first 4120 jccb(Assembler::notZero, DONE); 4121 addptr(result, 1); 4122 4123 // That's it 4124 bind(DONE); 4125 if (UseAVX >= 2 && UseSSE >= 2) { 4126 // clean upper bits of YMM registers 4127 vpxor(vec1, vec1); 4128 vpxor(vec2, vec2); 4129 } 4130 } 4131 4132 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings. 4133 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2, 4134 Register limit, Register result, Register chr, 4135 XMMRegister vec1, XMMRegister vec2, bool is_char, 4136 KRegister mask, bool expand_ary2) { 4137 // for expand_ary2, limit is the (smaller) size of the second array. 4138 ShortBranchVerifier sbv(this); 4139 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE; 4140 4141 assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)), 4142 "Expansion only implemented for AVX2"); 4143 4144 int length_offset = arrayOopDesc::length_offset_in_bytes(); 4145 int base_offset = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE); 4146 4147 Address::ScaleFactor scaleFactor = expand_ary2 ? Address::times_2 : Address::times_1; 4148 int scaleIncr = expand_ary2 ? 8 : 16; 4149 4150 if (is_array_equ) { 4151 // Check the input args 4152 cmpoop(ary1, ary2); 4153 jcc(Assembler::equal, TRUE_LABEL); 4154 4155 // Need additional checks for arrays_equals. 4156 testptr(ary1, ary1); 4157 jcc(Assembler::zero, FALSE_LABEL); 4158 testptr(ary2, ary2); 4159 jcc(Assembler::zero, FALSE_LABEL); 4160 4161 // Check the lengths 4162 movl(limit, Address(ary1, length_offset)); 4163 cmpl(limit, Address(ary2, length_offset)); 4164 jcc(Assembler::notEqual, FALSE_LABEL); 4165 } 4166 4167 // count == 0 4168 testl(limit, limit); 4169 jcc(Assembler::zero, TRUE_LABEL); 4170 4171 if (is_array_equ) { 4172 // Load array address 4173 lea(ary1, Address(ary1, base_offset)); 4174 lea(ary2, Address(ary2, base_offset)); 4175 } 4176 4177 if (is_array_equ && is_char) { 4178 // arrays_equals when used for char[]. 
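// (limit currently holds the element count; each char is 2 bytes, so the shift
//  below converts it into the byte count used by the vector loops.)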
4179 shll(limit, 1); // byte count != 0 4180 } 4181 movl(result, limit); // copy 4182 4183 if (UseAVX >= 2) { 4184 // With AVX2, use 32-byte vector compare 4185 Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16; 4186 4187 // Compare 32-byte vectors 4188 if (expand_ary2) { 4189 andl(result, 0x0000000f); // tail count (in bytes) 4190 andl(limit, 0xfffffff0); // vector count (in bytes) 4191 jcc(Assembler::zero, COMPARE_TAIL); 4192 } else { 4193 andl(result, 0x0000001f); // tail count (in bytes) 4194 andl(limit, 0xffffffe0); // vector count (in bytes) 4195 jcc(Assembler::zero, COMPARE_TAIL_16); 4196 } 4197 4198 lea(ary1, Address(ary1, limit, scaleFactor)); 4199 lea(ary2, Address(ary2, limit, Address::times_1)); 4200 negptr(limit); 4201 4202 #ifdef _LP64 4203 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 4204 Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3; 4205 4206 cmpl(limit, -64); 4207 jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2); 4208 4209 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 4210 4211 evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit); 4212 evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit); 4213 kortestql(mask, mask); 4214 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4215 addptr(limit, 64); // update since we already compared at this addr 4216 cmpl(limit, -64); 4217 jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3); 4218 4219 // At this point we may still need to compare -limit+result bytes. 4220 // We could execute the next two instruction and just continue via non-wide path: 4221 // cmpl(limit, 0); 4222 // jcc(Assembler::equal, COMPARE_TAIL); // true 4223 // But since we stopped at the points ary{1,2}+limit which are 4224 // not farther than 64 bytes from the ends of arrays ary{1,2}+result 4225 // (|limit| <= 32 and result < 32), 4226 // we may just compare the last 64 bytes. 
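// Worked example (illustrative sizes): for 100-byte arrays the AVX-512 loop above
// compares bytes 0..63; 'result' (here the 4-byte tail count) minus 64 addresses
// the block that ends at the last byte, so the single compare below covers
// bytes 36..99, and the overlap with the loop is harmless.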
4227 // 4228 addptr(result, -64); // it is safe, bc we just came from this area 4229 evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit); 4230 evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit); 4231 kortestql(mask, mask); 4232 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4233 4234 jmp(TRUE_LABEL); 4235 4236 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 4237 4238 }//if (VM_Version::supports_avx512vlbw()) 4239 #endif //_LP64 4240 bind(COMPARE_WIDE_VECTORS); 4241 vmovdqu(vec1, Address(ary1, limit, scaleFactor)); 4242 if (expand_ary2) { 4243 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit); 4244 } else { 4245 vmovdqu(vec2, Address(ary2, limit, Address::times_1)); 4246 } 4247 vpxor(vec1, vec2); 4248 4249 vptest(vec1, vec1); 4250 jcc(Assembler::notZero, FALSE_LABEL); 4251 addptr(limit, scaleIncr * 2); 4252 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4253 4254 testl(result, result); 4255 jcc(Assembler::zero, TRUE_LABEL); 4256 4257 vmovdqu(vec1, Address(ary1, result, scaleFactor, -32)); 4258 if (expand_ary2) { 4259 vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit); 4260 } else { 4261 vmovdqu(vec2, Address(ary2, result, Address::times_1, -32)); 4262 } 4263 vpxor(vec1, vec2); 4264 4265 vptest(vec1, vec1); 4266 jcc(Assembler::notZero, FALSE_LABEL); 4267 jmp(TRUE_LABEL); 4268 4269 bind(COMPARE_TAIL_16); // limit is zero 4270 movl(limit, result); 4271 4272 // Compare 16-byte chunks 4273 andl(result, 0x0000000f); // tail count (in bytes) 4274 andl(limit, 0xfffffff0); // vector count (in bytes) 4275 jcc(Assembler::zero, COMPARE_TAIL); 4276 4277 lea(ary1, Address(ary1, limit, scaleFactor)); 4278 lea(ary2, Address(ary2, limit, Address::times_1)); 4279 negptr(limit); 4280 4281 bind(COMPARE_WIDE_VECTORS_16); 4282 movdqu(vec1, Address(ary1, limit, scaleFactor)); 4283 if (expand_ary2) { 4284 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit); 4285 } else { 4286 movdqu(vec2, Address(ary2, limit, Address::times_1)); 4287 } 4288 pxor(vec1, vec2); 4289 4290 ptest(vec1, vec1); 4291 jcc(Assembler::notZero, FALSE_LABEL); 4292 addptr(limit, scaleIncr); 4293 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16); 4294 4295 bind(COMPARE_TAIL); // limit is zero 4296 movl(limit, result); 4297 // Fallthru to tail compare 4298 } else if (UseSSE42Intrinsics) { 4299 // With SSE4.2, use double quad vector compare 4300 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 4301 4302 // Compare 16-byte vectors 4303 andl(result, 0x0000000f); // tail count (in bytes) 4304 andl(limit, 0xfffffff0); // vector count (in bytes) 4305 jcc(Assembler::zero, COMPARE_TAIL); 4306 4307 lea(ary1, Address(ary1, limit, Address::times_1)); 4308 lea(ary2, Address(ary2, limit, Address::times_1)); 4309 negptr(limit); 4310 4311 bind(COMPARE_WIDE_VECTORS); 4312 movdqu(vec1, Address(ary1, limit, Address::times_1)); 4313 movdqu(vec2, Address(ary2, limit, Address::times_1)); 4314 pxor(vec1, vec2); 4315 4316 ptest(vec1, vec1); 4317 jcc(Assembler::notZero, FALSE_LABEL); 4318 addptr(limit, 16); 4319 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4320 4321 testl(result, result); 4322 jcc(Assembler::zero, TRUE_LABEL); 4323 4324 movdqu(vec1, Address(ary1, result, Address::times_1, -16)); 4325 movdqu(vec2, Address(ary2, result, Address::times_1, -16)); 4326 pxor(vec1, vec2); 4327 4328 ptest(vec1, vec1); 4329 jccb(Assembler::notZero, FALSE_LABEL); 4330 jmpb(TRUE_LABEL); 4331 4332 bind(COMPARE_TAIL); // limit is zero 4333 
movl(limit, result); 4334 // Fallthru to tail compare 4335 } 4336 4337 // Compare 4-byte vectors 4338 if (expand_ary2) { 4339 testl(result, result); 4340 jccb(Assembler::zero, TRUE_LABEL); 4341 } else { 4342 andl(limit, 0xfffffffc); // vector count (in bytes) 4343 jccb(Assembler::zero, COMPARE_CHAR); 4344 } 4345 4346 lea(ary1, Address(ary1, limit, scaleFactor)); 4347 lea(ary2, Address(ary2, limit, Address::times_1)); 4348 negptr(limit); 4349 4350 bind(COMPARE_VECTORS); 4351 if (expand_ary2) { 4352 // There are no "vector" operations for bytes to shorts 4353 movzbl(chr, Address(ary2, limit, Address::times_1)); 4354 cmpw(Address(ary1, limit, Address::times_2), chr); 4355 jccb(Assembler::notEqual, FALSE_LABEL); 4356 addptr(limit, 1); 4357 jcc(Assembler::notZero, COMPARE_VECTORS); 4358 jmp(TRUE_LABEL); 4359 } else { 4360 movl(chr, Address(ary1, limit, Address::times_1)); 4361 cmpl(chr, Address(ary2, limit, Address::times_1)); 4362 jccb(Assembler::notEqual, FALSE_LABEL); 4363 addptr(limit, 4); 4364 jcc(Assembler::notZero, COMPARE_VECTORS); 4365 } 4366 4367 // Compare trailing char (final 2 bytes), if any 4368 bind(COMPARE_CHAR); 4369 testl(result, 0x2); // tail char 4370 jccb(Assembler::zero, COMPARE_BYTE); 4371 load_unsigned_short(chr, Address(ary1, 0)); 4372 load_unsigned_short(limit, Address(ary2, 0)); 4373 cmpl(chr, limit); 4374 jccb(Assembler::notEqual, FALSE_LABEL); 4375 4376 if (is_array_equ && is_char) { 4377 bind(COMPARE_BYTE); 4378 } else { 4379 lea(ary1, Address(ary1, 2)); 4380 lea(ary2, Address(ary2, 2)); 4381 4382 bind(COMPARE_BYTE); 4383 testl(result, 0x1); // tail byte 4384 jccb(Assembler::zero, TRUE_LABEL); 4385 load_unsigned_byte(chr, Address(ary1, 0)); 4386 load_unsigned_byte(limit, Address(ary2, 0)); 4387 cmpl(chr, limit); 4388 jccb(Assembler::notEqual, FALSE_LABEL); 4389 } 4390 bind(TRUE_LABEL); 4391 movl(result, 1); // return true 4392 jmpb(DONE); 4393 4394 bind(FALSE_LABEL); 4395 xorl(result, result); // return false 4396 4397 // That's it 4398 bind(DONE); 4399 if (UseAVX >= 2) { 4400 // clean upper bits of YMM registers 4401 vpxor(vec1, vec1); 4402 vpxor(vec2, vec2); 4403 } 4404 } 4405 4406 #ifdef _LP64 4407 4408 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) { 4409 #define __ masm. 
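// Out-of-line fixup path: the fast cvttss2si/cvttsd2si conversion in convertF2I()
// below produces the "integer indefinite" value (0x80000000 or 0x8000000000000000L)
// for NaN and out-of-range inputs, so on that result we re-dispatch here to the
// StubRoutines fixup, which applies the Java semantics (NaN -> 0, otherwise
// saturate to MIN_VALUE/MAX_VALUE).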
4410 Register dst = stub.data<0>(); 4411 XMMRegister src = stub.data<1>(); 4412 address target = stub.data<2>(); 4413 __ bind(stub.entry()); 4414 __ subptr(rsp, 8); 4415 __ movdbl(Address(rsp), src); 4416 __ call(RuntimeAddress(target)); 4417 __ pop(dst); 4418 __ jmp(stub.continuation()); 4419 #undef __ 4420 } 4421 4422 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) { 4423 assert(dst_bt == T_INT || dst_bt == T_LONG, ""); 4424 assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, ""); 4425 4426 address slowpath_target; 4427 if (dst_bt == T_INT) { 4428 if (src_bt == T_FLOAT) { 4429 cvttss2sil(dst, src); 4430 cmpl(dst, 0x80000000); 4431 slowpath_target = StubRoutines::x86::f2i_fixup(); 4432 } else { 4433 cvttsd2sil(dst, src); 4434 cmpl(dst, 0x80000000); 4435 slowpath_target = StubRoutines::x86::d2i_fixup(); 4436 } 4437 } else { 4438 if (src_bt == T_FLOAT) { 4439 cvttss2siq(dst, src); 4440 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4441 slowpath_target = StubRoutines::x86::f2l_fixup(); 4442 } else { 4443 cvttsd2siq(dst, src); 4444 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4445 slowpath_target = StubRoutines::x86::d2l_fixup(); 4446 } 4447 } 4448 4449 auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, 23, convertF2I_slowpath); 4450 jcc(Assembler::equal, stub->entry()); 4451 bind(stub->continuation()); 4452 } 4453 4454 #endif // _LP64 4455 4456 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4457 XMMRegister src1, int imm8, bool merge, int vlen_enc) { 4458 switch(ideal_opc) { 4459 case Op_LShiftVS: 4460 Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break; 4461 case Op_LShiftVI: 4462 Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break; 4463 case Op_LShiftVL: 4464 Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break; 4465 case Op_RShiftVS: 4466 Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break; 4467 case Op_RShiftVI: 4468 Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break; 4469 case Op_RShiftVL: 4470 Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break; 4471 case Op_URShiftVS: 4472 Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break; 4473 case Op_URShiftVI: 4474 Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break; 4475 case Op_URShiftVL: 4476 Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break; 4477 case Op_RotateRightV: 4478 evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4479 case Op_RotateLeftV: 4480 evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4481 default: 4482 fatal("Unsupported masked operation"); break; 4483 } 4484 } 4485 4486 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4487 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc, 4488 bool is_varshift) { 4489 switch (ideal_opc) { 4490 case Op_AddVB: 4491 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4492 case Op_AddVS: 4493 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4494 case Op_AddVI: 4495 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4496 case Op_AddVL: 4497 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4498 case Op_AddVF: 4499 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4500 case Op_AddVD: 4501 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4502 case Op_SubVB: 4503 
evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4504 case Op_SubVS: 4505 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4506 case Op_SubVI: 4507 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4508 case Op_SubVL: 4509 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4510 case Op_SubVF: 4511 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4512 case Op_SubVD: 4513 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4514 case Op_MulVS: 4515 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4516 case Op_MulVI: 4517 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4518 case Op_MulVL: 4519 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4520 case Op_MulVF: 4521 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4522 case Op_MulVD: 4523 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4524 case Op_DivVF: 4525 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4526 case Op_DivVD: 4527 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4528 case Op_SqrtVF: 4529 evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break; 4530 case Op_SqrtVD: 4531 evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break; 4532 case Op_AbsVB: 4533 evpabsb(dst, mask, src2, merge, vlen_enc); break; 4534 case Op_AbsVS: 4535 evpabsw(dst, mask, src2, merge, vlen_enc); break; 4536 case Op_AbsVI: 4537 evpabsd(dst, mask, src2, merge, vlen_enc); break; 4538 case Op_AbsVL: 4539 evpabsq(dst, mask, src2, merge, vlen_enc); break; 4540 case Op_FmaVF: 4541 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4542 case Op_FmaVD: 4543 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4544 case Op_VectorRearrange: 4545 evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break; 4546 case Op_LShiftVS: 4547 evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4548 case Op_LShiftVI: 4549 evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4550 case Op_LShiftVL: 4551 evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4552 case Op_RShiftVS: 4553 evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4554 case Op_RShiftVI: 4555 evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4556 case Op_RShiftVL: 4557 evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4558 case Op_URShiftVS: 4559 evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4560 case Op_URShiftVI: 4561 evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4562 case Op_URShiftVL: 4563 evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4564 case Op_RotateLeftV: 4565 evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4566 case Op_RotateRightV: 4567 evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4568 case Op_MaxV: 4569 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4570 case Op_MinV: 4571 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4572 case Op_XorV: 4573 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4574 case Op_OrV: 4575 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4576 case Op_AndV: 4577 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4578 default: 4579 fatal("Unsupported masked operation"); break; 4580 } 4581 } 4582 4583 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4584 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4585 switch (ideal_opc) { 4586 case Op_AddVB: 
4587 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4588 case Op_AddVS: 4589 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4590 case Op_AddVI: 4591 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4592 case Op_AddVL: 4593 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4594 case Op_AddVF: 4595 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4596 case Op_AddVD: 4597 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4598 case Op_SubVB: 4599 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4600 case Op_SubVS: 4601 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4602 case Op_SubVI: 4603 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4604 case Op_SubVL: 4605 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4606 case Op_SubVF: 4607 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4608 case Op_SubVD: 4609 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4610 case Op_MulVS: 4611 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4612 case Op_MulVI: 4613 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4614 case Op_MulVL: 4615 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4616 case Op_MulVF: 4617 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4618 case Op_MulVD: 4619 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4620 case Op_DivVF: 4621 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4622 case Op_DivVD: 4623 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4624 case Op_FmaVF: 4625 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4626 case Op_FmaVD: 4627 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4628 case Op_MaxV: 4629 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4630 case Op_MinV: 4631 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4632 case Op_XorV: 4633 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4634 case Op_OrV: 4635 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4636 case Op_AndV: 4637 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4638 default: 4639 fatal("Unsupported masked operation"); break; 4640 } 4641 } 4642 4643 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst, 4644 KRegister src1, KRegister src2) { 4645 BasicType etype = T_ILLEGAL; 4646 switch(mask_len) { 4647 case 2: 4648 case 4: 4649 case 8: etype = T_BYTE; break; 4650 case 16: etype = T_SHORT; break; 4651 case 32: etype = T_INT; break; 4652 case 64: etype = T_LONG; break; 4653 default: fatal("Unsupported type"); break; 4654 } 4655 assert(etype != T_ILLEGAL, ""); 4656 switch(ideal_opc) { 4657 case Op_AndVMask: 4658 kand(etype, dst, src1, src2); break; 4659 case Op_OrVMask: 4660 kor(etype, dst, src1, src2); break; 4661 case Op_XorVMask: 4662 kxor(etype, dst, src1, src2); break; 4663 default: 4664 fatal("Unsupported masked operation"); break; 4665 } 4666 } 4667 4668 /* 4669 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 4670 * If src is NaN, the result is 0. 4671 * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE, 4672 * the result is equal to the value of Integer.MIN_VALUE. 4673 * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE, 4674 * the result is equal to the value of Integer.MAX_VALUE. 
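 * Example (Java cast semantics): (int) of NaN is 0, (int) of -1.0e10f is
 * Integer.MIN_VALUE, and (int) of 3.0e9f is Integer.MAX_VALUE.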
4675 */ 4676 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4677 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 4678 Register rscratch, AddressLiteral float_sign_flip, 4679 int vec_enc) { 4680 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4681 Label done; 4682 vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch); 4683 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 4684 vptest(xtmp2, xtmp2, vec_enc); 4685 jccb(Assembler::equal, done); 4686 4687 vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc); 4688 vpxor(xtmp1, xtmp1, xtmp4, vec_enc); 4689 4690 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 4691 vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc); 4692 vblendvps(dst, dst, xtmp4, xtmp3, vec_enc); 4693 4694 // Recompute the mask for remaining special value. 4695 vpxor(xtmp2, xtmp2, xtmp3, vec_enc); 4696 // Extract SRC values corresponding to TRUE mask lanes. 4697 vpand(xtmp4, xtmp2, src, vec_enc); 4698 // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special 4699 // values are set. 4700 vpxor(xtmp3, xtmp2, xtmp4, vec_enc); 4701 4702 vblendvps(dst, dst, xtmp1, xtmp3, vec_enc); 4703 bind(done); 4704 } 4705 4706 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4707 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4708 Register rscratch, AddressLiteral float_sign_flip, 4709 int vec_enc) { 4710 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4711 Label done; 4712 evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 4713 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 4714 kortestwl(ktmp1, ktmp1); 4715 jccb(Assembler::equal, done); 4716 4717 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4718 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4719 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 4720 4721 kxorwl(ktmp1, ktmp1, ktmp2); 4722 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4723 vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4724 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 4725 bind(done); 4726 } 4727 4728 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4729 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4730 Register rscratch, AddressLiteral double_sign_flip, 4731 int vec_enc) { 4732 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 4733 4734 Label done; 4735 evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 4736 Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc); 4737 kortestwl(ktmp1, ktmp1); 4738 jccb(Assembler::equal, done); 4739 4740 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4741 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4742 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 4743 4744 kxorwl(ktmp1, ktmp1, ktmp2); 4745 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4746 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4747 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); 4748 bind(done); 4749 } 4750 4751 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4752 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4753 Register rscratch, AddressLiteral float_sign_flip, 4754 int vec_enc) { 4755 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4756 Label done; 4757 evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, 
rscratch); 4758 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 4759 kortestwl(ktmp1, ktmp1); 4760 jccb(Assembler::equal, done); 4761 4762 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4763 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4764 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 4765 4766 kxorwl(ktmp1, ktmp1, ktmp2); 4767 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4768 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4769 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 4770 bind(done); 4771 } 4772 4773 /* 4774 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 4775 * If src is NaN, the result is 0. 4776 * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE, 4777 * the result is equal to the value of Long.MIN_VALUE. 4778 * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE, 4779 * the result is equal to the value of Long.MAX_VALUE. 4780 */ 4781 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4782 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4783 Register rscratch, AddressLiteral double_sign_flip, 4784 int vec_enc) { 4785 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 4786 4787 Label done; 4788 evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 4789 evpcmpeqq(ktmp1, xtmp1, dst, vec_enc); 4790 kortestwl(ktmp1, ktmp1); 4791 jccb(Assembler::equal, done); 4792 4793 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4794 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4795 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 4796 4797 kxorwl(ktmp1, ktmp1, ktmp2); 4798 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4799 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4800 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); 4801 bind(done); 4802 } 4803 4804 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero, 4805 XMMRegister xtmp, int index, int vec_enc) { 4806 assert(vec_enc < Assembler::AVX_512bit, ""); 4807 if (vec_enc == Assembler::AVX_256bit) { 4808 vextractf128_high(xtmp, src); 4809 vshufps(dst, src, xtmp, index, vec_enc); 4810 } else { 4811 vshufps(dst, src, zero, index, vec_enc); 4812 } 4813 } 4814 4815 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 4816 XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch, 4817 AddressLiteral float_sign_flip, int src_vec_enc) { 4818 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4819 4820 Label done; 4821 // Compare the destination lanes with float_sign_flip 4822 // value to get mask for all special values. 4823 movdqu(xtmp1, float_sign_flip, rscratch); 4824 vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit); 4825 ptest(xtmp2, xtmp2); 4826 jccb(Assembler::equal, done); 4827 4828 // Flip float_sign_flip to get max integer value. 4829 vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit); 4830 pxor(xtmp1, xtmp4); 4831 4832 // Set destination lanes corresponding to unordered source lanes to zero. 4833 vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc); 4834 vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc); 4835 4836 // Shuffle the mask vector and pack the lower double word from each quadword lane.
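// Note (descriptive, not emitted code): in vector_crosslane_doubleword_pack_avx the vshufps
// immediate 0x88 selects elements {0, 2} from each source, i.e. the even (low) doubleword of
// every quadword lane.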
vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc); 4838 vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit); 4839 4840 // Recompute the mask for remaining special value. 4841 pxor(xtmp2, xtmp3); 4842 // Extract mask corresponding to non-negative source lanes. 4843 vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc); 4844 4845 // Shuffle the mask vector and pack the lower double word from each quadword lane. 4846 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc); 4847 pand(xtmp3, xtmp2); 4848 4849 // Replace destination lanes holding special value(0x80000000) with max int 4850 // if corresponding source lane holds a +ve value. 4851 vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit); 4852 bind(done); 4853 } 4854 4855 4856 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero, 4857 XMMRegister xtmp, Register rscratch, int vec_enc) { 4858 switch(to_elem_bt) { 4859 case T_SHORT: 4860 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing"); 4861 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch); 4862 vpackusdw(dst, dst, zero, vec_enc); 4863 if (vec_enc == Assembler::AVX_256bit) { 4864 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc); 4865 } 4866 break; 4867 case T_BYTE: 4868 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing"); 4869 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch); 4870 vpackusdw(dst, dst, zero, vec_enc); 4871 if (vec_enc == Assembler::AVX_256bit) { 4872 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc); 4873 } 4874 vpackuswb(dst, dst, zero, vec_enc); 4875 break; 4876 default: assert(false, "%s", type2name(to_elem_bt)); 4877 } 4878 } 4879 4880 /* 4881 * Algorithm for vector D2L and F2I conversions: 4882 * a) Perform the vector D2L/F2I cast. 4883 * b) Choose the fast path if none of the result vector lanes contains the value 0x80000000. 4884 * A lane holding 0x80000000 signifies that the source value could be any of the special floating point 4885 * values (NaN, -Inf, Inf, Max, -Min). 4886 * c) Set the destination to zero if the source is a NaN value. 4887 * d) Replace 0x80000000 with MaxInt if the source lane contains a +ve value.
4888 */ 4889 4890 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4891 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 4892 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 4893 int to_elem_sz = type2aelembytes(to_elem_bt); 4894 assert(to_elem_sz <= 4, ""); 4895 vcvttps2dq(dst, src, vec_enc); 4896 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc); 4897 if (to_elem_sz < 4) { 4898 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 4899 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc); 4900 } 4901 } 4902 4903 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4904 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip, 4905 Register rscratch, int vec_enc) { 4906 int to_elem_sz = type2aelembytes(to_elem_bt); 4907 assert(to_elem_sz <= 4, ""); 4908 vcvttps2dq(dst, src, vec_enc); 4909 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc); 4910 switch(to_elem_bt) { 4911 case T_INT: 4912 break; 4913 case T_SHORT: 4914 evpmovdw(dst, dst, vec_enc); 4915 break; 4916 case T_BYTE: 4917 evpmovdb(dst, dst, vec_enc); 4918 break; 4919 default: assert(false, "%s", type2name(to_elem_bt)); 4920 } 4921 } 4922 4923 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 4924 KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip, 4925 Register rscratch, int vec_enc) { 4926 evcvttps2qq(dst, src, vec_enc); 4927 vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc); 4928 } 4929 4930 // Handling for downcasting from double to integer or sub-word types on AVX2. 4931 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4932 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, 4933 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 4934 int to_elem_sz = type2aelembytes(to_elem_bt); 4935 assert(to_elem_sz < 8, ""); 4936 vcvttpd2dq(dst, src, vec_enc); 4937 vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch, 4938 float_sign_flip, vec_enc); 4939 if (to_elem_sz < 4) { 4940 // xtmp4 holds all zero lanes. 
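// Descriptive note: the 128-bit integer result produced by vcvttpd2dq above is narrowed further
// to short/byte here; xtmp4 supplies the zero vector and xtmp5 the XMM scratch for the call below.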
4941 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit); 4942 } 4943 } 4944 4945 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, 4946 XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, 4947 KRegister ktmp2, AddressLiteral sign_flip, 4948 Register rscratch, int vec_enc) { 4949 if (VM_Version::supports_avx512dq()) { 4950 evcvttpd2qq(dst, src, vec_enc); 4951 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 4952 switch(to_elem_bt) { 4953 case T_LONG: 4954 break; 4955 case T_INT: 4956 evpmovsqd(dst, dst, vec_enc); 4957 break; 4958 case T_SHORT: 4959 evpmovsqd(dst, dst, vec_enc); 4960 evpmovdw(dst, dst, vec_enc); 4961 break; 4962 case T_BYTE: 4963 evpmovsqd(dst, dst, vec_enc); 4964 evpmovdb(dst, dst, vec_enc); 4965 break; 4966 default: assert(false, "%s", type2name(to_elem_bt)); 4967 } 4968 } else { 4969 assert(type2aelembytes(to_elem_bt) <= 4, ""); 4970 vcvttpd2dq(dst, src, vec_enc); 4971 vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 4972 switch(to_elem_bt) { 4973 case T_INT: 4974 break; 4975 case T_SHORT: 4976 evpmovdw(dst, dst, vec_enc); 4977 break; 4978 case T_BYTE: 4979 evpmovdb(dst, dst, vec_enc); 4980 break; 4981 default: assert(false, "%s", type2name(to_elem_bt)); 4982 } 4983 } 4984 } 4985 4986 #ifdef _LP64 4987 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src, 4988 AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 4989 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 4990 // Perform the floor(val+0.5) operation under the influence of MXCSR.RC mode round-towards -inf, 4991 // and restore the original MXCSR.RC mode after that. 4992 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 4993 4994 mov64(tmp, julong_cast(0.5L)); 4995 evpbroadcastq(xtmp1, tmp, vec_enc); 4996 vaddpd(xtmp1, src, xtmp1, vec_enc); 4997 evcvtpd2qq(dst, xtmp1, vec_enc); 4998 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 4999 double_sign_flip, vec_enc); 5000 5001 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5002 } 5003 5004 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src, 5005 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5006 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 5007 // Perform the floor(val+0.5) operation under the influence of MXCSR.RC mode round-towards -inf, 5008 // and restore the original MXCSR.RC mode after that.
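// Rough scalar sketch of the sequence emitted below (for exposition only, not emitted code):
//   set MXCSR.RC to round-towards -inf           (ldmxcsr new_mxcsr)
//   dst = cvt(src + 0.5f), i.e. floor(src + 0.5f) (vaddps + vcvtps2dq, which honour MXCSR.RC)
//   patch NaN/overflow lanes                      (vector_cast_float_to_int_special_cases_evex)
//   restore the standard MXCSR.RC mode            (ldmxcsr addr_mxcsr_std)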
5009 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5010 5011 movl(tmp, jint_cast(0.5)); 5012 movq(xtmp1, tmp); 5013 vbroadcastss(xtmp1, xtmp1, vec_enc); 5014 vaddps(xtmp1, src , xtmp1, vec_enc); 5015 vcvtps2dq(dst, xtmp1, vec_enc); 5016 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 5017 float_sign_flip, vec_enc); 5018 5019 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5020 } 5021 5022 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src, 5023 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5024 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) { 5025 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5026 // and re-instantiate original MXCSR.RC mode after that. 5027 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5028 5029 movl(tmp, jint_cast(0.5)); 5030 movq(xtmp1, tmp); 5031 vbroadcastss(xtmp1, xtmp1, vec_enc); 5032 vaddps(xtmp1, src , xtmp1, vec_enc); 5033 vcvtps2dq(dst, xtmp1, vec_enc); 5034 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc); 5035 5036 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5037 } 5038 #endif // _LP64 5039 5040 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 5041 BasicType from_elem_bt, BasicType to_elem_bt) { 5042 switch (from_elem_bt) { 5043 case T_BYTE: 5044 switch (to_elem_bt) { 5045 case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break; 5046 case T_INT: vpmovzxbd(dst, src, vlen_enc); break; 5047 case T_LONG: vpmovzxbq(dst, src, vlen_enc); break; 5048 default: ShouldNotReachHere(); 5049 } 5050 break; 5051 case T_SHORT: 5052 switch (to_elem_bt) { 5053 case T_INT: vpmovzxwd(dst, src, vlen_enc); break; 5054 case T_LONG: vpmovzxwq(dst, src, vlen_enc); break; 5055 default: ShouldNotReachHere(); 5056 } 5057 break; 5058 case T_INT: 5059 assert(to_elem_bt == T_LONG, ""); 5060 vpmovzxdq(dst, src, vlen_enc); 5061 break; 5062 default: 5063 ShouldNotReachHere(); 5064 } 5065 } 5066 5067 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 5068 BasicType from_elem_bt, BasicType to_elem_bt) { 5069 switch (from_elem_bt) { 5070 case T_BYTE: 5071 switch (to_elem_bt) { 5072 case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break; 5073 case T_INT: vpmovsxbd(dst, src, vlen_enc); break; 5074 case T_LONG: vpmovsxbq(dst, src, vlen_enc); break; 5075 default: ShouldNotReachHere(); 5076 } 5077 break; 5078 case T_SHORT: 5079 switch (to_elem_bt) { 5080 case T_INT: vpmovsxwd(dst, src, vlen_enc); break; 5081 case T_LONG: vpmovsxwq(dst, src, vlen_enc); break; 5082 default: ShouldNotReachHere(); 5083 } 5084 break; 5085 case T_INT: 5086 assert(to_elem_bt == T_LONG, ""); 5087 vpmovsxdq(dst, src, vlen_enc); 5088 break; 5089 default: 5090 ShouldNotReachHere(); 5091 } 5092 } 5093 5094 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src, 5095 BasicType dst_bt, BasicType src_bt, int vlen) { 5096 int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen); 5097 assert(vlen_enc != AVX_512bit, ""); 5098 5099 int dst_bt_size = type2aelembytes(dst_bt); 5100 int src_bt_size = type2aelembytes(src_bt); 5101 if (dst_bt_size > src_bt_size) { 5102 switch (dst_bt_size / src_bt_size) { 5103 case 2: vpmovsxbw(dst, src, vlen_enc); break; 5104 case 4: vpmovsxbd(dst, src, 
vlen_enc); break; 5105 case 8: vpmovsxbq(dst, src, vlen_enc); break; 5106 default: ShouldNotReachHere(); 5107 } 5108 } else { 5109 assert(dst_bt_size < src_bt_size, ""); 5110 switch (src_bt_size / dst_bt_size) { 5111 case 2: { 5112 if (vlen_enc == AVX_128bit) { 5113 vpacksswb(dst, src, src, vlen_enc); 5114 } else { 5115 vpacksswb(dst, src, src, vlen_enc); 5116 vpermq(dst, dst, 0x08, vlen_enc); 5117 } 5118 break; 5119 } 5120 case 4: { 5121 if (vlen_enc == AVX_128bit) { 5122 vpackssdw(dst, src, src, vlen_enc); 5123 vpacksswb(dst, dst, dst, vlen_enc); 5124 } else { 5125 vpackssdw(dst, src, src, vlen_enc); 5126 vpermq(dst, dst, 0x08, vlen_enc); 5127 vpacksswb(dst, dst, dst, AVX_128bit); 5128 } 5129 break; 5130 } 5131 case 8: { 5132 if (vlen_enc == AVX_128bit) { 5133 vpshufd(dst, src, 0x08, vlen_enc); 5134 vpackssdw(dst, dst, dst, vlen_enc); 5135 vpacksswb(dst, dst, dst, vlen_enc); 5136 } else { 5137 vpshufd(dst, src, 0x08, vlen_enc); 5138 vpermq(dst, dst, 0x08, vlen_enc); 5139 vpackssdw(dst, dst, dst, AVX_128bit); 5140 vpacksswb(dst, dst, dst, AVX_128bit); 5141 } 5142 break; 5143 } 5144 default: ShouldNotReachHere(); 5145 } 5146 } 5147 } 5148 5149 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3, 5150 bool merge, BasicType bt, int vlen_enc) { 5151 if (bt == T_INT) { 5152 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5153 } else { 5154 assert(bt == T_LONG, ""); 5155 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5156 } 5157 } 5158 5159 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3, 5160 bool merge, BasicType bt, int vlen_enc) { 5161 if (bt == T_INT) { 5162 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5163 } else { 5164 assert(bt == T_LONG, ""); 5165 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5166 } 5167 } 5168 5169 #ifdef _LP64 5170 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1, 5171 Register rtmp2, XMMRegister xtmp, int mask_len, 5172 int vec_enc) { 5173 int index = 0; 5174 int vindex = 0; 5175 mov64(rtmp1, 0x0101010101010101L); 5176 pdepq(rtmp1, src, rtmp1); 5177 if (mask_len > 8) { 5178 movq(rtmp2, src); 5179 vpxor(xtmp, xtmp, xtmp, vec_enc); 5180 movq(xtmp, rtmp1); 5181 } 5182 movq(dst, rtmp1); 5183 5184 mask_len -= 8; 5185 while (mask_len > 0) { 5186 assert ((mask_len & 0x7) == 0, "mask must be multiple of 8"); 5187 index++; 5188 if ((index % 2) == 0) { 5189 pxor(xtmp, xtmp); 5190 } 5191 mov64(rtmp1, 0x0101010101010101L); 5192 shrq(rtmp2, 8); 5193 pdepq(rtmp1, rtmp2, rtmp1); 5194 pinsrq(xtmp, rtmp1, index % 2); 5195 vindex = index / 2; 5196 if (vindex) { 5197 // Write entire 16 byte vector when both 64 bit 5198 // lanes are update to save redundant instructions. 
5199 if (index % 2) { 5200 vinsertf128(dst, dst, xtmp, vindex); 5201 } 5202 } else { 5203 vmovdqu(dst, xtmp); 5204 } 5205 mask_len -= 8; 5206 } 5207 } 5208 5209 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) { 5210 switch(opc) { 5211 case Op_VectorMaskTrueCount: 5212 popcntq(dst, tmp); 5213 break; 5214 case Op_VectorMaskLastTrue: 5215 if (VM_Version::supports_lzcnt()) { 5216 lzcntq(tmp, tmp); 5217 movl(dst, 63); 5218 subl(dst, tmp); 5219 } else { 5220 movl(dst, -1); 5221 bsrq(tmp, tmp); 5222 cmov32(Assembler::notZero, dst, tmp); 5223 } 5224 break; 5225 case Op_VectorMaskFirstTrue: 5226 if (VM_Version::supports_bmi1()) { 5227 if (masklen < 32) { 5228 orl(tmp, 1 << masklen); 5229 tzcntl(dst, tmp); 5230 } else if (masklen == 32) { 5231 tzcntl(dst, tmp); 5232 } else { 5233 assert(masklen == 64, ""); 5234 tzcntq(dst, tmp); 5235 } 5236 } else { 5237 if (masklen < 32) { 5238 orl(tmp, 1 << masklen); 5239 bsfl(dst, tmp); 5240 } else { 5241 assert(masklen == 32 || masklen == 64, ""); 5242 movl(dst, masklen); 5243 if (masklen == 32) { 5244 bsfl(tmp, tmp); 5245 } else { 5246 bsfq(tmp, tmp); 5247 } 5248 cmov32(Assembler::notZero, dst, tmp); 5249 } 5250 } 5251 break; 5252 case Op_VectorMaskToLong: 5253 assert(dst == tmp, "Dst and tmp should be the same for toLong operations"); 5254 break; 5255 default: assert(false, "Unhandled mask operation"); 5256 } 5257 } 5258 5259 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp, 5260 int masklen, int masksize, int vec_enc) { 5261 assert(VM_Version::supports_popcnt(), ""); 5262 5263 if(VM_Version::supports_avx512bw()) { 5264 kmovql(tmp, mask); 5265 } else { 5266 assert(masklen <= 16, ""); 5267 kmovwl(tmp, mask); 5268 } 5269 5270 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5271 // operations needs to be clipped. 5272 if (masksize < 16 && opc != Op_VectorMaskFirstTrue) { 5273 andq(tmp, (1 << masklen) - 1); 5274 } 5275 5276 vector_mask_operation_helper(opc, dst, tmp, masklen); 5277 } 5278 5279 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp, 5280 Register tmp, int masklen, BasicType bt, int vec_enc) { 5281 assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) || 5282 (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), ""); 5283 assert(VM_Version::supports_popcnt(), ""); 5284 5285 bool need_clip = false; 5286 switch(bt) { 5287 case T_BOOLEAN: 5288 // While masks of other types contain 0, -1; boolean masks contain lane values of 0, 1 5289 vpxor(xtmp, xtmp, xtmp, vec_enc); 5290 vpsubb(xtmp, xtmp, mask, vec_enc); 5291 vpmovmskb(tmp, xtmp, vec_enc); 5292 need_clip = masklen < 16; 5293 break; 5294 case T_BYTE: 5295 vpmovmskb(tmp, mask, vec_enc); 5296 need_clip = masklen < 16; 5297 break; 5298 case T_SHORT: 5299 vpacksswb(xtmp, mask, mask, vec_enc); 5300 if (masklen >= 16) { 5301 vpermpd(xtmp, xtmp, 8, vec_enc); 5302 } 5303 vpmovmskb(tmp, xtmp, Assembler::AVX_128bit); 5304 need_clip = masklen < 16; 5305 break; 5306 case T_INT: 5307 case T_FLOAT: 5308 vmovmskps(tmp, mask, vec_enc); 5309 need_clip = masklen < 4; 5310 break; 5311 case T_LONG: 5312 case T_DOUBLE: 5313 vmovmskpd(tmp, mask, vec_enc); 5314 need_clip = masklen < 2; 5315 break; 5316 default: assert(false, "Unhandled type, %s", type2name(bt)); 5317 } 5318 5319 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5320 // operations needs to be clipped. 
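// For example (illustrative values): with masklen == 4 only the low four bits of tmp are
// meaningful, so the andq below masks with (1 << 4) - 1 == 0xF before counting.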
5321 if (need_clip && opc != Op_VectorMaskFirstTrue) { 5322 // need_clip implies masklen < 32 5323 andq(tmp, (1 << masklen) - 1); 5324 } 5325 5326 vector_mask_operation_helper(opc, dst, tmp, masklen); 5327 } 5328 5329 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1, 5330 Register rtmp2, int mask_len) { 5331 kmov(rtmp1, src); 5332 andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len))); 5333 mov64(rtmp2, -1L); 5334 pextq(rtmp2, rtmp2, rtmp1); 5335 kmov(dst, rtmp2); 5336 } 5337 5338 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src, 5339 XMMRegister mask, Register rtmp, Register rscratch, 5340 XMMRegister permv, XMMRegister xtmp, BasicType bt, 5341 int vec_enc) { 5342 assert(type2aelembytes(bt) >= 4, ""); 5343 assert(opcode == Op_CompressV || opcode == Op_ExpandV, ""); 5344 address compress_perm_table = nullptr; 5345 address expand_perm_table = nullptr; 5346 if (type2aelembytes(bt) == 8) { 5347 compress_perm_table = StubRoutines::x86::compress_perm_table64(); 5348 expand_perm_table = StubRoutines::x86::expand_perm_table64(); 5349 vmovmskpd(rtmp, mask, vec_enc); 5350 } else { 5351 compress_perm_table = StubRoutines::x86::compress_perm_table32(); 5352 expand_perm_table = StubRoutines::x86::expand_perm_table32(); 5353 vmovmskps(rtmp, mask, vec_enc); 5354 } 5355 shlq(rtmp, 5); // for 32 byte permute row. 5356 if (opcode == Op_CompressV) { 5357 lea(rscratch, ExternalAddress(compress_perm_table)); 5358 } else { 5359 lea(rscratch, ExternalAddress(expand_perm_table)); 5360 } 5361 addptr(rtmp, rscratch); 5362 vmovdqu(permv, Address(rtmp)); 5363 vpermps(dst, permv, src, Assembler::AVX_256bit); 5364 vpxor(xtmp, xtmp, xtmp, vec_enc); 5365 // Blend the result with zero vector using permute mask, each column entry 5366 // in a permute table row contains either a valid permute index or a -1 (default) 5367 // value, this can potentially be used as a blending mask after 5368 // compressing/expanding the source vector lanes. 
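// Note: the -1 entries have their most significant bit set, so the vblendvps below selects the
// zero vector (xtmp) for exactly those lanes and keeps the permuted data everywhere else.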
5369 vblendvps(dst, dst, xtmp, permv, vec_enc, false, permv); 5370 } 5371 5372 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask, 5373 bool merge, BasicType bt, int vec_enc) { 5374 if (opcode == Op_CompressV) { 5375 switch(bt) { 5376 case T_BYTE: 5377 evpcompressb(dst, mask, src, merge, vec_enc); 5378 break; 5379 case T_CHAR: 5380 case T_SHORT: 5381 evpcompressw(dst, mask, src, merge, vec_enc); 5382 break; 5383 case T_INT: 5384 evpcompressd(dst, mask, src, merge, vec_enc); 5385 break; 5386 case T_FLOAT: 5387 evcompressps(dst, mask, src, merge, vec_enc); 5388 break; 5389 case T_LONG: 5390 evpcompressq(dst, mask, src, merge, vec_enc); 5391 break; 5392 case T_DOUBLE: 5393 evcompresspd(dst, mask, src, merge, vec_enc); 5394 break; 5395 default: 5396 fatal("Unsupported type %s", type2name(bt)); 5397 break; 5398 } 5399 } else { 5400 assert(opcode == Op_ExpandV, ""); 5401 switch(bt) { 5402 case T_BYTE: 5403 evpexpandb(dst, mask, src, merge, vec_enc); 5404 break; 5405 case T_CHAR: 5406 case T_SHORT: 5407 evpexpandw(dst, mask, src, merge, vec_enc); 5408 break; 5409 case T_INT: 5410 evpexpandd(dst, mask, src, merge, vec_enc); 5411 break; 5412 case T_FLOAT: 5413 evexpandps(dst, mask, src, merge, vec_enc); 5414 break; 5415 case T_LONG: 5416 evpexpandq(dst, mask, src, merge, vec_enc); 5417 break; 5418 case T_DOUBLE: 5419 evexpandpd(dst, mask, src, merge, vec_enc); 5420 break; 5421 default: 5422 fatal("Unsupported type %s", type2name(bt)); 5423 break; 5424 } 5425 } 5426 } 5427 #endif 5428 5429 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5430 KRegister ktmp1, int vec_enc) { 5431 if (opcode == Op_SignumVD) { 5432 vsubpd(dst, zero, one, vec_enc); 5433 // if src < 0 ? -1 : 1 5434 evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5435 evblendmpd(dst, ktmp1, one, dst, true, vec_enc); 5436 // if src == NaN, -0.0 or 0.0 return src. 5437 evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5438 evblendmpd(dst, ktmp1, dst, src, true, vec_enc); 5439 } else { 5440 assert(opcode == Op_SignumVF, ""); 5441 vsubps(dst, zero, one, vec_enc); 5442 // if src < 0 ? -1 : 1 5443 evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5444 evblendmps(dst, ktmp1, one, dst, true, vec_enc); 5445 // if src == NaN, -0.0 or 0.0 return src. 5446 evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5447 evblendmps(dst, ktmp1, dst, src, true, vec_enc); 5448 } 5449 } 5450 5451 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5452 XMMRegister xtmp1, int vec_enc) { 5453 if (opcode == Op_SignumVD) { 5454 vsubpd(dst, zero, one, vec_enc); 5455 // if src < 0 ? -1 : 1 5456 vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1); 5457 // if src == NaN, -0.0 or 0.0 return src. 5458 vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc); 5459 vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1); 5460 } else { 5461 assert(opcode == Op_SignumVF, ""); 5462 vsubps(dst, zero, one, vec_enc); 5463 // if src < 0 ? -1 : 1 5464 vblendvps(dst, one, dst, src, vec_enc, true, xtmp1); 5465 // if src == NaN, -0.0 or 0.0 return src. 
5466 vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc); 5467 vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1); 5468 } 5469 } 5470 5471 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) { 5472 if (VM_Version::supports_avx512bw()) { 5473 if (mask_len > 32) { 5474 kmovql(dst, src); 5475 } else { 5476 kmovdl(dst, src); 5477 if (mask_len != 32) { 5478 kshiftrdl(dst, dst, 32 - mask_len); 5479 } 5480 } 5481 } else { 5482 assert(mask_len <= 16, ""); 5483 kmovwl(dst, src); 5484 if (mask_len != 16) { 5485 kshiftrwl(dst, dst, 16 - mask_len); 5486 } 5487 } 5488 } 5489 5490 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) { 5491 int lane_size = type2aelembytes(bt); 5492 bool is_LP64 = LP64_ONLY(true) NOT_LP64(false); 5493 if ((is_LP64 || lane_size < 8) && 5494 ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) || 5495 (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) { 5496 movptr(rtmp, imm32); 5497 switch(lane_size) { 5498 case 1 : evpbroadcastb(dst, rtmp, vec_enc); break; 5499 case 2 : evpbroadcastw(dst, rtmp, vec_enc); break; 5500 case 4 : evpbroadcastd(dst, rtmp, vec_enc); break; 5501 case 8 : evpbroadcastq(dst, rtmp, vec_enc); break; 5502 default: fatal("Unsupported lane size %d", lane_size); 5503 break; 5504 } 5505 } else { 5506 movptr(rtmp, imm32); 5507 LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp)); 5508 switch(lane_size) { 5509 case 1 : vpbroadcastb(dst, dst, vec_enc); break; 5510 case 2 : vpbroadcastw(dst, dst, vec_enc); break; 5511 case 4 : vpbroadcastd(dst, dst, vec_enc); break; 5512 case 8 : vpbroadcastq(dst, dst, vec_enc); break; 5513 default: fatal("Unsupported lane size %d", lane_size); 5514 break; 5515 } 5516 } 5517 } 5518 5519 // 5520 // The following is a lookup table based popcount computation algorithm: 5521 // Index Bit set count 5522 // [ 0000 -> 0, 5523 // 0001 -> 1, 5524 // 0010 -> 1, 5525 // 0011 -> 2, 5526 // 0100 -> 1, 5527 // 0101 -> 2, 5528 // 0110 -> 2, 5529 // 0111 -> 3, 5530 // 1000 -> 1, 5531 // 1001 -> 2, 5532 // 1010 -> 2, 5533 // 1011 -> 3, 5534 // 1100 -> 2, 5535 // 1101 -> 3, 1110 -> 3, 5536 // 1111 -> 4 ] 5537 // a. Count the number of 1s in the 4 LSB bits of each byte. These bits are used as 5538 // shuffle indices for lookup table access. 5539 // b. Right shift each byte of vector lane by 4 positions. 5540 // c. Count the number of 1s in the 4 MSB bits of each byte. These bits are used as 5541 // shuffle indices for lookup table access. 5542 // d. Add the bitset count of upper and lower 4 bits of each byte. 5543 // e. Unpack double words to quad words and compute sum of absolute difference of bitset 5544 // count of all the bytes of a quadword. 5545 // f. Perform step e. for upper 128bit vector lane. 5546 // g. Pack the bitset count of quadwords back to double word. 5547 // h. Unpacking and packing operations are not needed for 64bit vector lane.
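// A minimal scalar sketch of steps a-d above (for exposition only; the byte values mirror the
// table, the function name is illustrative and not part of the emitted code):
//   static const uint8_t lut[16] = {0,1,1,2, 1,2,2,3, 1,2,2,3, 2,3,3,4};
//   uint8_t popcount8(uint8_t b) { return lut[b & 0x0F] + lut[b >> 4]; }
// The vector code below realizes the two lookups with vpshufb and the final add with vpaddb.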
5548 5549 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5550 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5551 assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), ""); 5552 vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc); 5553 vpsrlw(dst, src, 4, vec_enc); 5554 vpand(dst, dst, xtmp1, vec_enc); 5555 vpand(xtmp1, src, xtmp1, vec_enc); 5556 vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg); 5557 vpshufb(xtmp1, xtmp2, xtmp1, vec_enc); 5558 vpshufb(dst, xtmp2, dst, vec_enc); 5559 vpaddb(dst, dst, xtmp1, vec_enc); 5560 } 5561 5562 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5563 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5564 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5565 // Following code is as per steps e,f,g and h of above algorithm. 5566 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5567 vpunpckhdq(dst, xtmp1, xtmp2, vec_enc); 5568 vpsadbw(dst, dst, xtmp2, vec_enc); 5569 vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc); 5570 vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc); 5571 vpackuswb(dst, xtmp1, dst, vec_enc); 5572 } 5573 5574 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5575 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5576 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5577 // Add the popcount of upper and lower bytes of word. 5578 vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc); 5579 vpsrlw(dst, xtmp1, 8, vec_enc); 5580 vpand(xtmp1, xtmp1, xtmp2, vec_enc); 5581 vpaddw(dst, dst, xtmp1, vec_enc); 5582 } 5583 5584 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5585 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5586 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5587 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5588 vpsadbw(dst, xtmp1, xtmp2, vec_enc); 5589 } 5590 5591 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5592 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5593 switch(bt) { 5594 case T_LONG: 5595 vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5596 break; 5597 case T_INT: 5598 vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5599 break; 5600 case T_CHAR: 5601 case T_SHORT: 5602 vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5603 break; 5604 case T_BYTE: 5605 case T_BOOLEAN: 5606 vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5607 break; 5608 default: 5609 fatal("Unsupported type %s", type2name(bt)); 5610 break; 5611 } 5612 } 5613 5614 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src, 5615 KRegister mask, bool merge, int vec_enc) { 5616 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 5617 switch(bt) { 5618 case T_LONG: 5619 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5620 evpopcntq(dst, mask, src, merge, vec_enc); 5621 break; 5622 case T_INT: 5623 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5624 evpopcntd(dst, mask, src, merge, vec_enc); 5625 break; 5626 case T_CHAR: 5627 case T_SHORT: 5628 assert(VM_Version::supports_avx512_bitalg(), ""); 5629 evpopcntw(dst, mask, src, merge, vec_enc); 5630 break; 5631 case T_BYTE: 5632 case T_BOOLEAN: 5633 assert(VM_Version::supports_avx512_bitalg(), ""); 5634 evpopcntb(dst, mask, 
src, merge, vec_enc); 5635 break; 5636 default: 5637 fatal("Unsupported type %s", type2name(bt)); 5638 break; 5639 } 5640 } 5641 5642 #ifndef _LP64 5643 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) { 5644 assert(VM_Version::supports_avx512bw(), ""); 5645 kmovdl(tmp, src); 5646 kunpckdql(dst, tmp, tmp); 5647 } 5648 #endif 5649 5650 // Bit reversal algorithm first reverses the bits of each byte followed by 5651 // a byte level reversal for multi-byte primitive types (short/int/long). 5652 // Algorithm performs a lookup table access to get reverse bit sequence 5653 // corresponding to a 4 bit value. Thus a reverse bit sequence for a byte 5654 // is obtained by swapping the reverse bit sequences of upper and lower 5655 // nibble of a byte. 5656 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5657 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5658 if (VM_Version::supports_avx512vlbw()) { 5659 5660 // Get the reverse bit sequence of lower nibble of each byte. 5661 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg); 5662 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 5663 evpandq(dst, xtmp2, src, vec_enc); 5664 vpshufb(dst, xtmp1, dst, vec_enc); 5665 vpsllq(dst, dst, 4, vec_enc); 5666 5667 // Get the reverse bit sequence of upper nibble of each byte. 5668 vpandn(xtmp2, xtmp2, src, vec_enc); 5669 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 5670 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 5671 5672 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 5673 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 5674 evporq(xtmp2, dst, xtmp2, vec_enc); 5675 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 5676 5677 } else if(vec_enc == Assembler::AVX_512bit) { 5678 // Shift based bit reversal. 5679 assert(bt == T_LONG || bt == T_INT, ""); 5680 5681 // Swap lower and upper nibble of each byte. 5682 vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc); 5683 5684 // Swap two least and most significant bits of each nibble. 5685 vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc); 5686 5687 // Swap adjacent pair of bits. 5688 evmovdqul(xtmp1, k0, dst, true, vec_enc); 5689 vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc); 5690 5691 evmovdqul(xtmp1, k0, dst, true, vec_enc); 5692 vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc); 5693 } else { 5694 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp); 5695 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 5696 5697 // Get the reverse bit sequence of lower nibble of each byte. 5698 vpand(dst, xtmp2, src, vec_enc); 5699 vpshufb(dst, xtmp1, dst, vec_enc); 5700 vpsllq(dst, dst, 4, vec_enc); 5701 5702 // Get the reverse bit sequence of upper nibble of each byte. 5703 vpandn(xtmp2, xtmp2, src, vec_enc); 5704 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 5705 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 5706 5707 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 5708 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 
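// Scalar equivalent of the nibble-LUT byte reversal (for exposition only; rev4 stands for the
// 16-entry reverse-bit lookup table used above):
//   rev8(b) = (rev4[b & 0x0F] << 4) | rev4[b >> 4];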
5709 vpor(xtmp2, dst, xtmp2, vec_enc); 5710 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 5711 } 5712 } 5713 5714 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc, 5715 XMMRegister xtmp, Register rscratch) { 5716 assert(VM_Version::supports_gfni(), ""); 5717 assert(rscratch != noreg || always_reachable(mask), "missing"); 5718 5719 // Galois field instruction based bit reversal based on following algorithm. 5720 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 5721 vpbroadcastq(xtmp, mask, vec_enc, rscratch); 5722 vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc); 5723 vector_reverse_byte(bt, dst, xtmp, vec_enc); 5724 } 5725 5726 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src, 5727 XMMRegister xtmp1, Register rtmp, int vec_enc) { 5728 vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc); 5729 evpandq(dst, xtmp1, src, vec_enc); 5730 vpsllq(dst, dst, nbits, vec_enc); 5731 vpandn(xtmp1, xtmp1, src, vec_enc); 5732 vpsrlq(xtmp1, xtmp1, nbits, vec_enc); 5733 evporq(dst, dst, xtmp1, vec_enc); 5734 } 5735 5736 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5737 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5738 // Shift based bit reversal. 5739 assert(VM_Version::supports_evex(), ""); 5740 switch(bt) { 5741 case T_LONG: 5742 // Swap upper and lower double word of each quad word. 5743 evprorq(xtmp1, k0, src, 32, true, vec_enc); 5744 evprord(xtmp1, k0, xtmp1, 16, true, vec_enc); 5745 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 5746 break; 5747 case T_INT: 5748 // Swap upper and lower word of each double word. 5749 evprord(xtmp1, k0, src, 16, true, vec_enc); 5750 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 5751 break; 5752 case T_CHAR: 5753 case T_SHORT: 5754 // Swap upper and lower byte of each word. 5755 vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc); 5756 break; 5757 case T_BYTE: 5758 evmovdquq(dst, k0, src, true, vec_enc); 5759 break; 5760 default: 5761 fatal("Unsupported type %s", type2name(bt)); 5762 break; 5763 } 5764 } 5765 5766 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) { 5767 if (bt == T_BYTE) { 5768 if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) { 5769 evmovdquq(dst, k0, src, true, vec_enc); 5770 } else { 5771 vmovdqu(dst, src); 5772 } 5773 return; 5774 } 5775 // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using 5776 // pre-computed shuffle indices. 
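// For example, for T_INT the permutation effectively maps bytes {0,1,2,3} of every doubleword
// to {3,2,1,0}; the T_LONG and T_SHORT masks perform the analogous swap over 8 and 2 bytes.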
5777 switch(bt) { 5778 case T_LONG: 5779 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg); 5780 break; 5781 case T_INT: 5782 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg); 5783 break; 5784 case T_CHAR: 5785 case T_SHORT: 5786 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg); 5787 break; 5788 default: 5789 fatal("Unsupported type %s", type2name(bt)); 5790 break; 5791 } 5792 vpshufb(dst, src, dst, vec_enc); 5793 } 5794 5795 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src, 5796 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 5797 KRegister ktmp, Register rtmp, bool merge, int vec_enc) { 5798 assert(is_integral_type(bt), ""); 5799 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 5800 assert(VM_Version::supports_avx512cd(), ""); 5801 switch(bt) { 5802 case T_LONG: 5803 evplzcntq(dst, ktmp, src, merge, vec_enc); 5804 break; 5805 case T_INT: 5806 evplzcntd(dst, ktmp, src, merge, vec_enc); 5807 break; 5808 case T_SHORT: 5809 vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc); 5810 vpunpcklwd(xtmp2, xtmp1, src, vec_enc); 5811 evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc); 5812 vpunpckhwd(dst, xtmp1, src, vec_enc); 5813 evplzcntd(dst, ktmp, dst, merge, vec_enc); 5814 vpackusdw(dst, xtmp2, dst, vec_enc); 5815 break; 5816 case T_BYTE: 5817 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 5818 // accessing the lookup table. 5819 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 5820 // accessing the lookup table. 5821 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 5822 assert(VM_Version::supports_avx512bw(), ""); 5823 evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp); 5824 vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc); 5825 vpand(xtmp2, dst, src, vec_enc); 5826 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 5827 vpsrlw(xtmp3, src, 4, vec_enc); 5828 vpand(xtmp3, dst, xtmp3, vec_enc); 5829 vpshufb(dst, xtmp1, xtmp3, vec_enc); 5830 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 5831 evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc); 5832 evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc); 5833 break; 5834 default: 5835 fatal("Unsupported type %s", type2name(bt)); 5836 break; 5837 } 5838 } 5839 5840 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5841 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 5842 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp); 5843 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 5844 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 5845 // accessing the lookup table. 5846 vpand(dst, xtmp2, src, vec_enc); 5847 vpshufb(dst, xtmp1, dst, vec_enc); 5848 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 5849 // accessing the lookup table. 5850 vpsrlw(xtmp3, src, 4, vec_enc); 5851 vpand(xtmp3, xtmp2, xtmp3, vec_enc); 5852 vpshufb(xtmp2, xtmp1, xtmp3, vec_enc); 5853 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 
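// Scalar view of the selection below (for exposition only; lut4[v] is assumed to hold the
// leading zero count of the 4-bit value v, with lut4[0] == 4):
//   clz8(b) = (b >> 4) ? lut4[b >> 4] : 4 + lut4[b & 0x0F];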
5854 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 5855 vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc); 5856 vpaddb(dst, dst, xtmp2, vec_enc); 5857 vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc); 5858 } 5859 5860 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5861 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 5862 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 5863 // Add zero counts of lower byte and upper byte of a word if 5864 // upper byte holds a zero value. 5865 vpsrlw(xtmp3, src, 8, vec_enc); 5866 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 5867 vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc); 5868 vpsllw(xtmp2, dst, 8, vec_enc); 5869 vpaddw(xtmp2, xtmp2, dst, vec_enc); 5870 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 5871 vpsrlw(dst, dst, 8, vec_enc); 5872 } 5873 5874 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5875 XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) { 5876 // Since IEEE 754 floating point format represents mantissa in 1.0 format 5877 // hence biased exponent can be used to compute leading zero count as per 5878 // following formula:- 5879 // LZCNT = 32 - (biased_exp - 127) 5880 // Special handling has been introduced for Zero, Max_Int and -ve source values. 5881 5882 // Broadcast 0xFF 5883 vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc); 5884 vpsrld(xtmp1, xtmp1, 24, vec_enc); 5885 5886 // Extract biased exponent. 5887 vcvtdq2ps(dst, src, vec_enc); 5888 vpsrld(dst, dst, 23, vec_enc); 5889 vpand(dst, dst, xtmp1, vec_enc); 5890 5891 // Broadcast 127. 5892 vpsrld(xtmp1, xtmp1, 1, vec_enc); 5893 // Exponent = biased_exp - 127 5894 vpsubd(dst, dst, xtmp1, vec_enc); 5895 5896 // Exponent = Exponent + 1 5897 vpsrld(xtmp3, xtmp1, 6, vec_enc); 5898 vpaddd(dst, dst, xtmp3, vec_enc); 5899 5900 // Replace -ve exponent with zero, exponent is -ve when src 5901 // lane contains a zero value. 5902 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5903 vblendvps(dst, dst, xtmp2, dst, vec_enc); 5904 5905 // Rematerialize broadcast 32. 5906 vpslld(xtmp1, xtmp3, 5, vec_enc); 5907 // Exponent is 32 if corresponding source lane contains max_int value. 5908 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 5909 // LZCNT = 32 - exponent 5910 vpsubd(dst, xtmp1, dst, vec_enc); 5911 5912 // Replace LZCNT with a value 1 if corresponding source lane 5913 // contains max_int value. 5914 vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc); 5915 5916 // Replace biased_exp with 0 if source lane value is less than zero. 5917 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5918 vblendvps(dst, dst, xtmp2, src, vec_enc); 5919 } 5920 5921 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5922 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 5923 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 5924 // Add zero counts of lower word and upper word of a double word if 5925 // upper word holds a zero value. 5926 vpsrld(xtmp3, src, 16, vec_enc); 5927 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 5928 vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc); 5929 vpslld(xtmp2, dst, 16, vec_enc); 5930 vpaddd(xtmp2, xtmp2, dst, vec_enc); 5931 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 5932 vpsrld(dst, dst, 16, vec_enc); 5933 // Add zero counts of lower doubleword and upper doubleword of a 5934 // quadword if upper doubleword holds a zero value. 
5935 vpsrlq(xtmp3, src, 32, vec_enc); 5936 vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc); 5937 vpsllq(xtmp2, dst, 32, vec_enc); 5938 vpaddq(xtmp2, xtmp2, dst, vec_enc); 5939 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 5940 vpsrlq(dst, dst, 32, vec_enc); 5941 } 5942 5943 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, 5944 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 5945 Register rtmp, int vec_enc) { 5946 assert(is_integral_type(bt), "unexpected type"); 5947 assert(vec_enc < Assembler::AVX_512bit, ""); 5948 switch(bt) { 5949 case T_LONG: 5950 vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 5951 break; 5952 case T_INT: 5953 vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc); 5954 break; 5955 case T_SHORT: 5956 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 5957 break; 5958 case T_BYTE: 5959 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 5960 break; 5961 default: 5962 fatal("Unsupported type %s", type2name(bt)); 5963 break; 5964 } 5965 } 5966 5967 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) { 5968 switch(bt) { 5969 case T_BYTE: 5970 vpsubb(dst, src1, src2, vec_enc); 5971 break; 5972 case T_SHORT: 5973 vpsubw(dst, src1, src2, vec_enc); 5974 break; 5975 case T_INT: 5976 vpsubd(dst, src1, src2, vec_enc); 5977 break; 5978 case T_LONG: 5979 vpsubq(dst, src1, src2, vec_enc); 5980 break; 5981 default: 5982 fatal("Unsupported type %s", type2name(bt)); 5983 break; 5984 } 5985 } 5986 5987 // Trailing zero count computation is based on leading zero count operation as per 5988 // following equation. All AVX3 targets support AVX512CD feature which offers 5989 // direct vector instruction to compute leading zero count. 
5990 // CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x) 5991 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src, 5992 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 5993 XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) { 5994 assert(is_integral_type(bt), ""); 5995 // xtmp = -1 5996 vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc); 5997 // xtmp = xtmp + src 5998 vpadd(bt, xtmp4, xtmp4, src, vec_enc); 5999 // xtmp = xtmp & ~src 6000 vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc); 6001 vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc); 6002 vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc); 6003 vpsub(bt, dst, xtmp4, dst, vec_enc); 6004 } 6005 6006 // Trailing zero count computation for AVX2 targets is based on the popcount operation as per the following equation: 6007 // CTZ = PRIM_TYPE_WIDTH - POPC(x | -x) 6008 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6009 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6010 assert(is_integral_type(bt), ""); 6011 // xtmp = 0 6012 vpxor(xtmp3, xtmp3, xtmp3, vec_enc); 6013 // xtmp = 0 - src 6014 vpsub(bt, xtmp3, xtmp3, src, vec_enc); 6015 // xtmp = xtmp | src 6016 vpor(xtmp3, xtmp3, src, vec_enc); 6017 vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc); 6018 vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc); 6019 vpsub(bt, dst, xtmp1, dst, vec_enc); 6020 } 6021 6022 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) { 6023 Label done; 6024 Label neg_divisor_fastpath; 6025 cmpl(divisor, 0); 6026 jccb(Assembler::less, neg_divisor_fastpath); 6027 xorl(rdx, rdx); 6028 divl(divisor); 6029 jmpb(done); 6030 bind(neg_divisor_fastpath); 6031 // Fastpath for divisor < 0: 6032 // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1) 6033 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned() 6034 movl(rdx, rax); 6035 subl(rdx, divisor); 6036 if (VM_Version::supports_bmi1()) { 6037 andnl(rax, rdx, rax); 6038 } else { 6039 notl(rdx); 6040 andl(rax, rdx); 6041 } 6042 shrl(rax, 31); 6043 bind(done); 6044 } 6045 6046 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) { 6047 Label done; 6048 Label neg_divisor_fastpath; 6049 cmpl(divisor, 0); 6050 jccb(Assembler::less, neg_divisor_fastpath); 6051 xorl(rdx, rdx); 6052 divl(divisor); 6053 jmpb(done); 6054 bind(neg_divisor_fastpath); 6055 // Fastpath when divisor < 0: 6056 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor) 6057 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned() 6058 movl(rdx, rax); 6059 subl(rax, divisor); 6060 if (VM_Version::supports_bmi1()) { 6061 andnl(rax, rax, rdx); 6062 } else { 6063 notl(rax); 6064 andl(rax, rdx); 6065 } 6066 sarl(rax, 31); 6067 andl(rax, divisor); 6068 subl(rdx, rax); 6069 bind(done); 6070 } 6071 6072 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) { 6073 Label done; 6074 Label neg_divisor_fastpath; 6075 6076 cmpl(divisor, 0); 6077 jccb(Assembler::less, neg_divisor_fastpath); 6078 xorl(rdx, rdx); 6079 divl(divisor); 6080 jmpb(done); 6081 bind(neg_divisor_fastpath); 6082 // Fastpath for divisor < 0: 6083 // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1) 6084 //
remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor) 6085 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in 6086 // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned() 6087 movl(rdx, rax); 6088 subl(rax, divisor); 6089 if (VM_Version::supports_bmi1()) { 6090 andnl(rax, rax, rdx); 6091 } else { 6092 notl(rax); 6093 andl(rax, rdx); 6094 } 6095 movl(tmp, rax); 6096 shrl(rax, 31); // quotient 6097 sarl(tmp, 31); 6098 andl(tmp, divisor); 6099 subl(rdx, tmp); // remainder 6100 bind(done); 6101 } 6102 6103 #ifdef _LP64 6104 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1, 6105 XMMRegister xtmp2, Register rtmp) { 6106 if(VM_Version::supports_gfni()) { 6107 // Galois field instruction based bit reversal based on following algorithm. 6108 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 6109 mov64(rtmp, 0x8040201008040201L); 6110 movq(xtmp1, src); 6111 movq(xtmp2, rtmp); 6112 gf2p8affineqb(xtmp1, xtmp2, 0); 6113 movq(dst, xtmp1); 6114 } else { 6115 // Swap even and odd numbered bits. 6116 movl(rtmp, src); 6117 andl(rtmp, 0x55555555); 6118 shll(rtmp, 1); 6119 movl(dst, src); 6120 andl(dst, 0xAAAAAAAA); 6121 shrl(dst, 1); 6122 orl(dst, rtmp); 6123 6124 // Swap LSB and MSB 2 bits of each nibble. 6125 movl(rtmp, dst); 6126 andl(rtmp, 0x33333333); 6127 shll(rtmp, 2); 6128 andl(dst, 0xCCCCCCCC); 6129 shrl(dst, 2); 6130 orl(dst, rtmp); 6131 6132 // Swap LSB and MSB 4 bits of each byte. 6133 movl(rtmp, dst); 6134 andl(rtmp, 0x0F0F0F0F); 6135 shll(rtmp, 4); 6136 andl(dst, 0xF0F0F0F0); 6137 shrl(dst, 4); 6138 orl(dst, rtmp); 6139 } 6140 bswapl(dst); 6141 } 6142 6143 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1, 6144 XMMRegister xtmp2, Register rtmp1, Register rtmp2) { 6145 if(VM_Version::supports_gfni()) { 6146 // Galois field instruction based bit reversal based on following algorithm. 6147 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 6148 mov64(rtmp1, 0x8040201008040201L); 6149 movq(xtmp1, src); 6150 movq(xtmp2, rtmp1); 6151 gf2p8affineqb(xtmp1, xtmp2, 0); 6152 movq(dst, xtmp1); 6153 } else { 6154 // Swap even and odd numbered bits. 6155 movq(rtmp1, src); 6156 mov64(rtmp2, 0x5555555555555555L); 6157 andq(rtmp1, rtmp2); 6158 shlq(rtmp1, 1); 6159 movq(dst, src); 6160 notq(rtmp2); 6161 andq(dst, rtmp2); 6162 shrq(dst, 1); 6163 orq(dst, rtmp1); 6164 6165 // Swap LSB and MSB 2 bits of each nibble. 6166 movq(rtmp1, dst); 6167 mov64(rtmp2, 0x3333333333333333L); 6168 andq(rtmp1, rtmp2); 6169 shlq(rtmp1, 2); 6170 notq(rtmp2); 6171 andq(dst, rtmp2); 6172 shrq(dst, 2); 6173 orq(dst, rtmp1); 6174 6175 // Swap LSB and MSB 4 bits of each byte. 
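// Each of the three swap passes above/below computes, for mask m in {0x55.., 0x33.., 0x0F..}
// and shift k in {1, 2, 4}:
//   x = ((x & m) << k) | ((x & ~m) >> k)
// and the trailing bswapq then finishes the reversal at byte granularity.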
#ifdef _LP64
void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
                                 XMMRegister xtmp2, Register rtmp) {
  if (VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal, based on the following algorithm:
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
    mov64(rtmp, 0x8040201008040201L);
    movq(xtmp1, src);
    movq(xtmp2, rtmp);
    gf2p8affineqb(xtmp1, xtmp2, 0);
    movq(dst, xtmp1);
  } else {
    // Swap even and odd numbered bits.
    movl(rtmp, src);
    andl(rtmp, 0x55555555);
    shll(rtmp, 1);
    movl(dst, src);
    andl(dst, 0xAAAAAAAA);
    shrl(dst, 1);
    orl(dst, rtmp);

    // Swap LSB and MSB 2 bits of each nibble.
    movl(rtmp, dst);
    andl(rtmp, 0x33333333);
    shll(rtmp, 2);
    andl(dst, 0xCCCCCCCC);
    shrl(dst, 2);
    orl(dst, rtmp);

    // Swap LSB and MSB 4 bits of each byte.
    movl(rtmp, dst);
    andl(rtmp, 0x0F0F0F0F);
    shll(rtmp, 4);
    andl(dst, 0xF0F0F0F0);
    shrl(dst, 4);
    orl(dst, rtmp);
  }
  bswapl(dst);
}

void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
                                 XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
  if (VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal, based on the following algorithm:
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
    mov64(rtmp1, 0x8040201008040201L);
    movq(xtmp1, src);
    movq(xtmp2, rtmp1);
    gf2p8affineqb(xtmp1, xtmp2, 0);
    movq(dst, xtmp1);
  } else {
    // Swap even and odd numbered bits.
    movq(rtmp1, src);
    mov64(rtmp2, 0x5555555555555555L);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 1);
    movq(dst, src);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 1);
    orq(dst, rtmp1);

    // Swap LSB and MSB 2 bits of each nibble.
    movq(rtmp1, dst);
    mov64(rtmp2, 0x3333333333333333L);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 2);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 2);
    orq(dst, rtmp1);

    // Swap LSB and MSB 4 bits of each byte.
    movq(rtmp1, dst);
    mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 4);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 4);
    orq(dst, rtmp1);
  }
  bswapq(dst);
}
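// The non-GFNI paths of reverseI/reverseL above are the classic swap-and-merge
// bit reversal (adjacent bits, then 2-bit groups, then nibbles, then whole bytes
// via bswap). A scalar sketch of the 32-bit case, for reference only (not
// emitted; reverse_bits32_ref is a made-up name and __builtin_bswap32 is
// assumed only for brevity):
//
//   static inline uint32_t reverse_bits32_ref(uint32_t x) {
//     x = ((x & 0x55555555) << 1) | ((x & 0xAAAAAAAA) >> 1);  // swap adjacent bits
//     x = ((x & 0x33333333) << 2) | ((x & 0xCCCCCCCC) >> 2);  // swap 2-bit groups within nibbles
//     x = ((x & 0x0F0F0F0F) << 4) | ((x & 0xF0F0F0F0) >> 4);  // swap nibbles within bytes
//     return __builtin_bswap32(x);                            // reverse byte order (bswapl in the code above)
//   }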
void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divq(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3, which is implemented in java.lang.Long.divideUnsigned()
  movq(rdx, rax);
  subq(rdx, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rdx, rax);
  } else {
    notq(rdx);
    andq(rax, rdx);
  }
  shrq(rax, 63);
  bind(done);
}

void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorq(rdx, rdx);
  divq(divisor);
  jmp(done);
  bind(neg_divisor_fastpath);
  // Fastpath when divisor < 0:
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3, which is implemented in java.lang.Long.remainderUnsigned()
  movq(rdx, rax);
  subq(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rax, rdx);
  } else {
    notq(rax);
    andq(rax, rdx);
  }
  sarq(rax, 63);
  andq(rax, divisor);
  subq(rdx, rax);
  bind(done);
}

void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorq(rdx, rdx);
  divq(divisor);
  jmp(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3, which is implemented in
  // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
  movq(rdx, rax);
  subq(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rax, rdx);
  } else {
    notq(rax);
    andq(rax, rdx);
  }
  movq(tmp, rax);
  shrq(rax, 63); // quotient
  sarq(tmp, 63);
  andq(tmp, divisor);
  subq(rdx, tmp); // remainder
  bind(done);
}
#endif

void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
                                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
                                        int vlen_enc) {
  assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are in-lane operations and vpshufb determines indices using
  // only the lower 4 bits of each shuffle lane, so all shuffle indices are
  // effectively normalized to the index range 0-15. As a result, an index and
  // that index plus any multiple of 16 select the same relative byte within a
  // 128-bit lane, e.g. shuffle indices 16, 32 and 48 all select byte 0 of
  // their respective 128-bit source lanes.
  movl(rtmp, 16);
  evpbroadcastb(xtmp1, rtmp, vlen_enc);

  // Compute a mask for the shuffle vector by comparing indices with the expression INDEX < 16.
  // Broadcast the first 128-bit lane across the entire vector, shuffle the vector lanes using
  // the original shuffle indices and move the shuffled lanes corresponding to a true
  // mask into the destination vector.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
  evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);

  // Repeat the above steps with the lane comparison expression INDEX >= 16 && INDEX < 32,
  // broadcasting the second 128-bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
  vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Repeat the above steps with the lane comparison expression INDEX >= 32 && INDEX < 48,
  // broadcasting the third 128-bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp2, Assembler::nlt, true, vlen_enc);
  vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Repeat the above steps with the lane comparison expression INDEX >= 48 && INDEX < 64,
  // broadcasting the fourth 128-bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
  vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
}

void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
                                                   XMMRegister shuffle, XMMRegister src, int vlen_enc) {
  if (vlen_enc == AVX_128bit) {
    vpermilps(dst, src, shuffle, vlen_enc);
  } else if (bt == T_INT) {
    vpermd(dst, shuffle, src, vlen_enc);
  } else {
    assert(bt == T_FLOAT, "");
    vpermps(dst, shuffle, src, vlen_enc);
  }
}
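// For reference, the four masked passes in rearrange_bytes above implement the
// following scalar semantics for a 64-byte vector (an illustrative model only,
// assuming shuffle indices are already in the range 0-63; rearrange_bytes_ref
// is a made-up name):
//
//   static void rearrange_bytes_ref(const uint8_t src[64], const uint8_t shuffle[64],
//                                   uint8_t dst[64]) {
//     for (int pass = 0; pass < 4; pass++) {          // one pass per 128-bit source lane
//       for (int i = 0; i < 64; i++) {
//         uint8_t idx = shuffle[i];
//         if (idx / 16 == pass) {                     // the k-mask selects indices that fall in this lane
//           dst[i] = src[pass * 16 + (idx & 0x0F)];   // vpshufb uses only the low 4 index bits
//         }
//       }
//     }
//   }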